1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * GROwing Monsters And Cloning Shrimps
36 /* IMPORTANT FOR DEVELOPERS:
38 * Triclinic pme stuff isn't entirely trivial, and we've experienced
39 * some bugs during development (many of them due to me). To avoid
40 * this in the future, please check the following things if you make
41 * changes in this file:
43 * 1. You should obtain identical (at least to the PME precision)
44 * energies, forces, and virial for
45 * a rectangular box and a triclinic one where the z (or y) axis is
46 * tilted a whole box side. For instance you could use these boxes:
48 * rectangular triclinic
53 * 2. You should check the energy conservation in a triclinic box.
55 * It might seem an overkill, but better safe than sorry.
77 #include "gmxcomplex.h"
81 #include "gmx_fatal.h"
87 #include "gmx_wallcycle.h"
88 #include "gmx_parallel_3dfft.h"
91 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
92 #include "gmx_sse2_single.h"
95 #include "mpelogging.h"
98 /* #define PRT_FORCE */
99 /* conditions for on the fly time-measurement */
100 /* #define TAKETIME (step > 1 && timesteps < 10) */
101 #define TAKETIME FALSE
104 #define mpi_type MPI_DOUBLE
106 #define mpi_type MPI_FLOAT
109 /* Internal datastructures */
125 int *send_id
,*recv_id
;
126 pme_grid_comm_t
*comm_data
;
130 int dimind
; /* The index of the dimension, 0=x, 1=y */
137 int *node_dest
; /* The nodes to send x and q to with DD */
138 int *node_src
; /* The nodes to receive x and q from with DD */
139 int *buf_index
; /* Index for commnode into the buffers */
146 int *count
; /* The number of atoms to send to each node */
147 int *rcount
; /* The number of atoms to receive */
154 gmx_bool bSpread
; /* These coordinates are used for spreading */
156 splinevec theta
,dtheta
;
158 rvec
*fractx
; /* Fractional coordinate relative to the
159 * lower cell boundary
163 typedef struct gmx_pme
{
164 int ndecompdim
; /* The number of decomposition dimensions */
165 int nodeid
; /* Our nodeid in mpi->mpi_comm */
168 int nnodes
; /* The number of nodes doing PME */
173 MPI_Comm mpi_comm_d
[2]; /* Indexed on dimension, 0=x, 1=y */
175 MPI_Datatype rvec_mpi
; /* the pme vector's MPI type */
178 gmx_bool bPPnode
; /* Node also does particle-particle forces */
179 gmx_bool bFEP
; /* Compute Free energy contribution */
180 int nkx
,nky
,nkz
; /* Grid dimensions */
184 real
* pmegridA
; /* Grids on which we do spreading/interpolation, includes overlap */
186 int pmegrid_nx
,pmegrid_ny
,pmegrid_nz
;
187 int pmegrid_start_ix
,pmegrid_start_iy
,pmegrid_start_iz
;
189 real
* pmegrid_sendbuf
;
190 real
* pmegrid_recvbuf
;
192 real
*fftgridA
; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
193 real
*fftgridB
; /* inside the interpolation grid, but separate for 2D PME decomp. */
194 int fftgrid_nx
,fftgrid_ny
,fftgrid_nz
;
196 t_complex
*cfftgridA
; /* Grids for complex FFT data */
197 t_complex
*cfftgridB
;
198 int cfftgrid_nx
,cfftgrid_ny
,cfftgrid_nz
;
200 gmx_parallel_3dfft_t pfft_setupA
;
201 gmx_parallel_3dfft_t pfft_setupB
;
204 real
*fshx
,*fshy
,*fshz
;
206 pme_atomcomm_t atc
[2]; /* Indexed on decomposition index */
210 pme_overlap_t overlap
[2]; /* Indexed on dimension, 0=x, 1=y */
213 pme_atomcomm_t atc_energy
; /* Only for gmx_pme_calc_energy */
215 rvec
*bufv
; /* Communication buffer */
216 real
*bufr
; /* Communication buffer */
217 int buf_nalloc
; /* The communication buffer size */
219 /* work data for solve_pme */
226 real
* work_tmp1_alloc
;
230 /* Work data for PME_redist */
231 gmx_bool redist_init
;
239 int redist_buf_nalloc
;
241 /* Work data for sum_qgrid */
242 real
* sum_qgrid_tmp
;
243 real
* sum_qgrid_dd_tmp
;
247 static void calc_interpolation_idx(gmx_pme_t pme
,pme_atomcomm_t
*atc
)
250 int *idxptr
,tix
,tiy
,tiz
;
251 real
*xptr
,*fptr
,tx
,ty
,tz
;
252 real rxx
,ryx
,ryy
,rzx
,rzy
,rzz
;
254 int start_ix
,start_iy
,start_iz
;
260 start_ix
= pme
->pmegrid_start_ix
;
261 start_iy
= pme
->pmegrid_start_iy
;
262 start_iz
= pme
->pmegrid_start_iz
;
264 rxx
= pme
->recipbox
[XX
][XX
];
265 ryx
= pme
->recipbox
[YY
][XX
];
266 ryy
= pme
->recipbox
[YY
][YY
];
267 rzx
= pme
->recipbox
[ZZ
][XX
];
268 rzy
= pme
->recipbox
[ZZ
][YY
];
269 rzz
= pme
->recipbox
[ZZ
][ZZ
];
271 for(i
=0; (i
<atc
->n
); i
++) {
273 idxptr
= atc
->idx
[i
];
274 fptr
= atc
->fractx
[i
];
276 /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
277 tx
= nx
* ( xptr
[XX
] * rxx
+ xptr
[YY
] * ryx
+ xptr
[ZZ
] * rzx
+ 2.0 );
278 ty
= ny
* ( xptr
[YY
] * ryy
+ xptr
[ZZ
] * rzy
+ 2.0 );
279 tz
= nz
* ( xptr
[ZZ
] * rzz
+ 2.0 );
285 /* Because decomposition only occurs in x and y,
286 * we never have a fraction correction in z.
288 fptr
[XX
] = tx
- tix
+ pme
->fshx
[tix
];
289 fptr
[YY
] = ty
- tiy
+ pme
->fshy
[tiy
];
292 idxptr
[XX
] = pme
->nnx
[tix
];
293 idxptr
[YY
] = pme
->nny
[tiy
];
294 idxptr
[ZZ
] = pme
->nnz
[tiz
];
297 range_check(idxptr
[XX
],0,pme
->pmegrid_nx
);
298 range_check(idxptr
[YY
],0,pme
->pmegrid_ny
);
299 range_check(idxptr
[ZZ
],0,pme
->pmegrid_nz
);
304 static void pme_calc_pidx(int natoms
, matrix recipbox
, rvec x
[],
310 real rxx
,ryx
,rzx
,ryy
,rzy
;
313 /* Calculate PME task index (pidx) for each grid index.
314 * Here we always assign equally sized slabs to each node
315 * for load balancing reasons (the PME grid spacing is not used).
322 /* Reset the count */
323 for(i
=0; i
<nslab
; i
++)
328 if (atc
->dimind
== 0)
330 rxx
= recipbox
[XX
][XX
];
331 ryx
= recipbox
[YY
][XX
];
332 rzx
= recipbox
[ZZ
][XX
];
333 /* Calculate the node index in x-dimension */
334 for(i
=0; (i
<natoms
); i
++)
337 /* Fractional coordinates along box vectors */
338 s
= nslab
*(xptr
[XX
]*rxx
+ xptr
[YY
]*ryx
+ xptr
[ZZ
]*rzx
);
339 si
= (int)(s
+ 2*nslab
) % nslab
;
346 ryy
= recipbox
[YY
][YY
];
347 rzy
= recipbox
[ZZ
][YY
];
348 /* Calculate the node index in y-dimension */
349 for(i
=0; (i
<natoms
); i
++)
352 /* Fractional coordinates along box vectors */
353 s
= nslab
*(xptr
[YY
]*ryy
+ xptr
[ZZ
]*rzy
);
354 si
= (int)(s
+ 2*nslab
) % nslab
;
361 static void pme_realloc_atomcomm_things(pme_atomcomm_t
*atc
)
365 if (atc
->n
> atc
->nalloc
) {
366 nalloc_old
= atc
->nalloc
;
367 atc
->nalloc
= over_alloc_dd(atc
->n
);
369 if (atc
->nslab
> 1) {
370 srenew(atc
->x
,atc
->nalloc
);
371 srenew(atc
->q
,atc
->nalloc
);
372 srenew(atc
->f
,atc
->nalloc
);
373 for(i
=nalloc_old
; i
<atc
->nalloc
; i
++)
375 clear_rvec(atc
->f
[i
]);
380 srenew(atc
->theta
[i
] ,atc
->pme_order
*atc
->nalloc
);
381 srenew(atc
->dtheta
[i
],atc
->pme_order
*atc
->nalloc
);
383 srenew(atc
->fractx
,atc
->nalloc
);
384 srenew(atc
->idx
,atc
->nalloc
);
389 static void pmeredist_pd(gmx_pme_t pme
, gmx_bool forw
,
390 int n
, gmx_bool bXF
, rvec
*x_f
, real
*charge
,
392 /* Redistribute particle data for PME calculation */
393 /* domain decomposition by x coordinate */
398 if(FALSE
== pme
->redist_init
) {
399 snew(pme
->scounts
,atc
->nslab
);
400 snew(pme
->rcounts
,atc
->nslab
);
401 snew(pme
->sdispls
,atc
->nslab
);
402 snew(pme
->rdispls
,atc
->nslab
);
403 snew(pme
->sidx
,atc
->nslab
);
404 pme
->redist_init
= TRUE
;
406 if (n
> pme
->redist_buf_nalloc
) {
407 pme
->redist_buf_nalloc
= over_alloc_dd(n
);
408 srenew(pme
->redist_buf
,pme
->redist_buf_nalloc
*DIM
);
415 /* forward, redistribution from pp to pme */
417 /* Calculate send counts and exchange them with other nodes */
418 for(i
=0; (i
<atc
->nslab
); i
++) pme
->scounts
[i
]=0;
419 for(i
=0; (i
<n
); i
++) pme
->scounts
[pme
->idxa
[i
]]++;
420 MPI_Alltoall( pme
->scounts
, 1, MPI_INT
, pme
->rcounts
, 1, MPI_INT
, atc
->mpi_comm
);
422 /* Calculate send and receive displacements and index into send
427 for(i
=1; i
<atc
->nslab
; i
++) {
428 pme
->sdispls
[i
]=pme
->sdispls
[i
-1]+pme
->scounts
[i
-1];
429 pme
->rdispls
[i
]=pme
->rdispls
[i
-1]+pme
->rcounts
[i
-1];
430 pme
->sidx
[i
]=pme
->sdispls
[i
];
432 /* Total # of particles to be received */
433 atc
->n
= pme
->rdispls
[atc
->nslab
-1] + pme
->rcounts
[atc
->nslab
-1];
435 pme_realloc_atomcomm_things(atc
);
437 /* Copy particle coordinates into send buffer and exchange*/
438 for(i
=0; (i
<n
); i
++) {
439 ii
=DIM
*pme
->sidx
[pme
->idxa
[i
]];
440 pme
->sidx
[pme
->idxa
[i
]]++;
441 pme
->redist_buf
[ii
+XX
]=x_f
[i
][XX
];
442 pme
->redist_buf
[ii
+YY
]=x_f
[i
][YY
];
443 pme
->redist_buf
[ii
+ZZ
]=x_f
[i
][ZZ
];
445 MPI_Alltoallv(pme
->redist_buf
, pme
->scounts
, pme
->sdispls
,
446 pme
->rvec_mpi
, atc
->x
, pme
->rcounts
, pme
->rdispls
,
447 pme
->rvec_mpi
, atc
->mpi_comm
);
450 /* Copy charge into send buffer and exchange*/
451 for(i
=0; i
<atc
->nslab
; i
++) pme
->sidx
[i
]=pme
->sdispls
[i
];
452 for(i
=0; (i
<n
); i
++) {
453 ii
=pme
->sidx
[pme
->idxa
[i
]];
454 pme
->sidx
[pme
->idxa
[i
]]++;
455 pme
->redist_buf
[ii
]=charge
[i
];
457 MPI_Alltoallv(pme
->redist_buf
, pme
->scounts
, pme
->sdispls
, mpi_type
,
458 atc
->q
, pme
->rcounts
, pme
->rdispls
, mpi_type
,
461 else { /* backward, redistribution from pme to pp */
462 MPI_Alltoallv(atc
->f
, pme
->rcounts
, pme
->rdispls
, pme
->rvec_mpi
,
463 pme
->redist_buf
, pme
->scounts
, pme
->sdispls
,
464 pme
->rvec_mpi
, atc
->mpi_comm
);
466 /* Copy data from receive buffer */
467 for(i
=0; i
<atc
->nslab
; i
++)
468 pme
->sidx
[i
] = pme
->sdispls
[i
];
469 for(i
=0; (i
<n
); i
++) {
470 ii
= DIM
*pme
->sidx
[pme
->idxa
[i
]];
471 x_f
[i
][XX
] += pme
->redist_buf
[ii
+XX
];
472 x_f
[i
][YY
] += pme
->redist_buf
[ii
+YY
];
473 x_f
[i
][ZZ
] += pme
->redist_buf
[ii
+ZZ
];
474 pme
->sidx
[pme
->idxa
[i
]]++;
480 static void pme_dd_sendrecv(pme_atomcomm_t
*atc
,
481 gmx_bool bBackward
,int shift
,
482 void *buf_s
,int nbyte_s
,
483 void *buf_r
,int nbyte_r
)
489 if (bBackward
== FALSE
) {
490 dest
= atc
->node_dest
[shift
];
491 src
= atc
->node_src
[shift
];
493 dest
= atc
->node_src
[shift
];
494 src
= atc
->node_dest
[shift
];
497 if (nbyte_s
> 0 && nbyte_r
> 0) {
498 MPI_Sendrecv(buf_s
,nbyte_s
,MPI_BYTE
,
500 buf_r
,nbyte_r
,MPI_BYTE
,
502 atc
->mpi_comm
,&stat
);
503 } else if (nbyte_s
> 0) {
504 MPI_Send(buf_s
,nbyte_s
,MPI_BYTE
,
507 } else if (nbyte_r
> 0) {
508 MPI_Recv(buf_r
,nbyte_r
,MPI_BYTE
,
510 atc
->mpi_comm
,&stat
);
515 static void dd_pmeredist_x_q(gmx_pme_t pme
,
516 int n
, gmx_bool bX
, rvec
*x
, real
*charge
,
519 int *commnode
,*buf_index
;
520 int nnodes_comm
,i
,nsend
,local_pos
,buf_pos
,node
,scount
,rcount
;
522 commnode
= atc
->node_dest
;
523 buf_index
= atc
->buf_index
;
525 nnodes_comm
= min(2*atc
->maxshift
,atc
->nslab
-1);
528 for(i
=0; i
<nnodes_comm
; i
++) {
529 buf_index
[commnode
[i
]] = nsend
;
530 nsend
+= atc
->count
[commnode
[i
]];
533 if (atc
->count
[atc
->nodeid
] + nsend
!= n
)
534 gmx_fatal(FARGS
,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
535 "This usually means that your system is not well equilibrated.",
536 n
- (atc
->count
[atc
->nodeid
] + nsend
),
537 pme
->nodeid
,'x'+atc
->dimind
);
539 if (nsend
> pme
->buf_nalloc
) {
540 pme
->buf_nalloc
= over_alloc_dd(nsend
);
541 srenew(pme
->bufv
,pme
->buf_nalloc
);
542 srenew(pme
->bufr
,pme
->buf_nalloc
);
545 atc
->n
= atc
->count
[atc
->nodeid
];
546 for(i
=0; i
<nnodes_comm
; i
++) {
547 scount
= atc
->count
[commnode
[i
]];
548 /* Communicate the count */
550 fprintf(debug
,"dimind %d PME node %d send to node %d: %d\n",
551 atc
->dimind
,atc
->nodeid
,commnode
[i
],scount
);
552 pme_dd_sendrecv(atc
,FALSE
,i
,
554 &atc
->rcount
[i
],sizeof(int));
555 atc
->n
+= atc
->rcount
[i
];
558 pme_realloc_atomcomm_things(atc
);
564 if (node
== atc
->nodeid
) {
565 /* Copy direct to the receive buffer */
567 copy_rvec(x
[i
],atc
->x
[local_pos
]);
569 atc
->q
[local_pos
] = charge
[i
];
572 /* Copy to the send buffer */
574 copy_rvec(x
[i
],pme
->bufv
[buf_index
[node
]]);
576 pme
->bufr
[buf_index
[node
]] = charge
[i
];
582 for(i
=0; i
<nnodes_comm
; i
++) {
583 scount
= atc
->count
[commnode
[i
]];
584 rcount
= atc
->rcount
[i
];
585 if (scount
> 0 || rcount
> 0) {
587 /* Communicate the coordinates */
588 pme_dd_sendrecv(atc
,FALSE
,i
,
589 pme
->bufv
[buf_pos
],scount
*sizeof(rvec
),
590 atc
->x
[local_pos
],rcount
*sizeof(rvec
));
592 /* Communicate the charges */
593 pme_dd_sendrecv(atc
,FALSE
,i
,
594 pme
->bufr
+buf_pos
,scount
*sizeof(real
),
595 atc
->q
+local_pos
,rcount
*sizeof(real
));
597 local_pos
+= atc
->rcount
[i
];
602 static void dd_pmeredist_f(gmx_pme_t pme
, pme_atomcomm_t
*atc
,
606 int *commnode
,*buf_index
;
607 int nnodes_comm
,local_pos
,buf_pos
,i
,scount
,rcount
,node
;
609 commnode
= atc
->node_dest
;
610 buf_index
= atc
->buf_index
;
612 nnodes_comm
= min(2*atc
->maxshift
,atc
->nslab
-1);
614 local_pos
= atc
->count
[atc
->nodeid
];
616 for(i
=0; i
<nnodes_comm
; i
++) {
617 scount
= atc
->rcount
[i
];
618 rcount
= atc
->count
[commnode
[i
]];
619 if (scount
> 0 || rcount
> 0) {
620 /* Communicate the forces */
621 pme_dd_sendrecv(atc
,TRUE
,i
,
622 atc
->f
[local_pos
],scount
*sizeof(rvec
),
623 pme
->bufv
[buf_pos
],rcount
*sizeof(rvec
));
626 buf_index
[commnode
[i
]] = buf_pos
;
636 if (node
== atc
->nodeid
)
638 /* Add from the local force array */
639 rvec_inc(f
[i
],atc
->f
[local_pos
]);
644 /* Add from the receive buffer */
645 rvec_inc(f
[i
],pme
->bufv
[buf_index
[node
]]);
655 if (node
== atc
->nodeid
)
657 /* Copy from the local force array */
658 copy_rvec(atc
->f
[local_pos
],f
[i
]);
663 /* Copy from the receive buffer */
664 copy_rvec(pme
->bufv
[buf_index
[node
]],f
[i
]);
673 gmx_sum_qgrid_dd(gmx_pme_t pme
, real
*grid
, int direction
)
675 pme_overlap_t
*overlap
;
676 int send_index0
,send_nindex
;
677 int recv_index0
,recv_nindex
;
679 int i
,j
,k
,ix
,iy
,iz
,icnt
;
680 int ipulse
,send_id
,recv_id
,datasize
;
682 real
*sendptr
,*recvptr
;
684 /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
685 overlap
= &pme
->overlap
[1];
687 for(ipulse
=0;ipulse
<overlap
->noverlap_nodes
;ipulse
++)
689 /* Since we have already (un)wrapped the overlap in the z-dimension,
690 * we only have to communicate 0 to nkz (not pmegrid_nz).
692 if (direction
==GMX_SUM_QGRID_FORWARD
)
694 send_id
= overlap
->send_id
[ipulse
];
695 recv_id
= overlap
->recv_id
[ipulse
];
696 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
697 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
698 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
699 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
703 send_id
= overlap
->recv_id
[ipulse
];
704 recv_id
= overlap
->send_id
[ipulse
];
705 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
706 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
707 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
708 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
711 /* Copy data to contiguous send buffer */
714 fprintf(debug
,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
715 pme
->nodeid
,overlap
->nodeid
,send_id
,
716 pme
->pmegrid_start_iy
,
717 send_index0
-pme
->pmegrid_start_iy
,
718 send_index0
-pme
->pmegrid_start_iy
+send_nindex
);
721 for(i
=0;i
<pme
->pmegrid_nx
;i
++)
724 for(j
=0;j
<send_nindex
;j
++)
726 iy
= j
+ send_index0
- pme
->pmegrid_start_iy
;
727 for(k
=0;k
<pme
->nkz
;k
++)
730 pme
->pmegrid_sendbuf
[icnt
++] = grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
];
735 datasize
= pme
->pmegrid_nx
* pme
->nkz
;
737 MPI_Sendrecv(pme
->pmegrid_sendbuf
,send_nindex
*datasize
,GMX_MPI_REAL
,
739 pme
->pmegrid_recvbuf
,recv_nindex
*datasize
,GMX_MPI_REAL
,
741 overlap
->mpi_comm
,&stat
);
743 /* Get data from contiguous recv buffer */
746 fprintf(debug
,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
747 pme
->nodeid
,overlap
->nodeid
,recv_id
,
748 pme
->pmegrid_start_iy
,
749 recv_index0
-pme
->pmegrid_start_iy
,
750 recv_index0
-pme
->pmegrid_start_iy
+recv_nindex
);
753 for(i
=0;i
<pme
->pmegrid_nx
;i
++)
756 for(j
=0;j
<recv_nindex
;j
++)
758 iy
= j
+ recv_index0
- pme
->pmegrid_start_iy
;
759 for(k
=0;k
<pme
->nkz
;k
++)
762 if(direction
==GMX_SUM_QGRID_FORWARD
)
764 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] += pme
->pmegrid_recvbuf
[icnt
++];
768 grid
[ix
*(pme
->pmegrid_ny
*pme
->pmegrid_nz
)+iy
*(pme
->pmegrid_nz
)+iz
] = pme
->pmegrid_recvbuf
[icnt
++];
775 /* Major dimension is easier, no copying required,
776 * but we might have to sum to separate array.
777 * Since we don't copy, we have to communicate up to pmegrid_nz,
778 * not nkz as for the minor direction.
780 overlap
= &pme
->overlap
[0];
782 for(ipulse
=0;ipulse
<overlap
->noverlap_nodes
;ipulse
++)
784 if(direction
==GMX_SUM_QGRID_FORWARD
)
786 send_id
= overlap
->send_id
[ipulse
];
787 recv_id
= overlap
->recv_id
[ipulse
];
788 send_index0
= overlap
->comm_data
[ipulse
].send_index0
;
789 send_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
790 recv_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
791 recv_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
792 recvptr
= pme
->pmegrid_recvbuf
;
796 send_id
= overlap
->recv_id
[ipulse
];
797 recv_id
= overlap
->send_id
[ipulse
];
798 send_index0
= overlap
->comm_data
[ipulse
].recv_index0
;
799 send_nindex
= overlap
->comm_data
[ipulse
].recv_nindex
;
800 recv_index0
= overlap
->comm_data
[ipulse
].send_index0
;
801 recv_nindex
= overlap
->comm_data
[ipulse
].send_nindex
;
802 recvptr
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
805 sendptr
= grid
+ (send_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
806 datasize
= pme
->pmegrid_ny
* pme
->pmegrid_nz
;
810 fprintf(debug
,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
811 pme
->nodeid
,overlap
->nodeid
,send_id
,
812 pme
->pmegrid_start_ix
,
813 send_index0
-pme
->pmegrid_start_ix
,
814 send_index0
-pme
->pmegrid_start_ix
+send_nindex
);
815 fprintf(debug
,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
816 pme
->nodeid
,overlap
->nodeid
,recv_id
,
817 pme
->pmegrid_start_ix
,
818 recv_index0
-pme
->pmegrid_start_ix
,
819 recv_index0
-pme
->pmegrid_start_ix
+recv_nindex
);
822 MPI_Sendrecv(sendptr
,send_nindex
*datasize
,GMX_MPI_REAL
,
824 recvptr
,recv_nindex
*datasize
,GMX_MPI_REAL
,
826 overlap
->mpi_comm
,&stat
);
828 /* ADD data from contiguous recv buffer */
829 if(direction
==GMX_SUM_QGRID_FORWARD
)
831 p
= grid
+ (recv_index0
-pme
->pmegrid_start_ix
)*(pme
->pmegrid_ny
*pme
->pmegrid_nz
);
832 for(i
=0;i
<recv_nindex
*datasize
;i
++)
834 p
[i
] += pme
->pmegrid_recvbuf
[i
];
843 copy_pmegrid_to_fftgrid(gmx_pme_t pme
, real
*pmegrid
, real
*fftgrid
)
845 ivec local_fft_ndata
,local_fft_offset
,local_fft_size
;
850 /* Dimensions should be identical for A/B grid, so we just use A here */
851 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
856 local_pme_size
[0] = pme
->pmegrid_nx
;
857 local_pme_size
[1] = pme
->pmegrid_ny
;
858 local_pme_size
[2] = pme
->pmegrid_nz
;
860 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
861 the offset is identical, and the PME grid always has more data (due to overlap)
866 char fn
[STRLEN
],format
[STRLEN
];
868 sprintf(fn
,"pmegrid%d.pdb",pme
->nodeid
);
870 sprintf(fn
,"pmegrid%d.txt",pme
->nodeid
);
871 fp2
= ffopen(fn
,"w");
872 sprintf(format
,"%s%s\n",pdbformat
,"%6.2f%6.2f");
874 for(ix
=0;ix
<local_fft_ndata
[XX
];ix
++)
876 for(iy
=0;iy
<local_fft_ndata
[YY
];iy
++)
878 for(iz
=0;iz
<local_fft_ndata
[ZZ
];iz
++)
880 pmeidx
= ix
*(local_pme_size
[YY
]*local_pme_size
[ZZ
])+iy
*(local_pme_size
[ZZ
])+iz
;
881 fftidx
= ix
*(local_fft_size
[YY
]*local_fft_size
[ZZ
])+iy
*(local_fft_size
[ZZ
])+iz
;
882 fftgrid
[fftidx
] = pmegrid
[pmeidx
];
884 val
= 100*pmegrid
[pmeidx
];
885 if (pmegrid
[pmeidx
] != 0)
886 fprintf(fp
,format
,"ATOM",pmeidx
,"CA","GLY",' ',pmeidx
,' ',
887 5.0*ix
,5.0*iy
,5.0*iz
,1.0,val
);
888 if (pmegrid
[pmeidx
] != 0)
889 fprintf(fp2
,"%-12s %5d %5d %5d %12.5e\n",
891 pme
->pmegrid_start_ix
+ ix
,
892 pme
->pmegrid_start_iy
+ iy
,
893 pme
->pmegrid_start_iz
+ iz
,
909 copy_fftgrid_to_pmegrid(gmx_pme_t pme
, real
*fftgrid
, real
*pmegrid
)
911 ivec local_fft_ndata
,local_fft_offset
,local_fft_size
;
916 /* Dimensions should be identical for A/B grid, so we just use A here */
917 gmx_parallel_3dfft_real_limits(pme
->pfft_setupA
,
922 local_pme_size
[0] = pme
->pmegrid_nx
;
923 local_pme_size
[1] = pme
->pmegrid_ny
;
924 local_pme_size
[2] = pme
->pmegrid_nz
;
926 /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
927 the offset is identical, and the PME grid always has more data (due to overlap)
929 for(ix
=0;ix
<local_fft_ndata
[XX
];ix
++)
931 for(iy
=0;iy
<local_fft_ndata
[YY
];iy
++)
933 for(iz
=0;iz
<local_fft_ndata
[ZZ
];iz
++)
935 pmeidx
= ix
*(local_pme_size
[YY
]*local_pme_size
[ZZ
])+iy
*(local_pme_size
[ZZ
])+iz
;
936 fftidx
= ix
*(local_fft_size
[YY
]*local_fft_size
[ZZ
])+iy
*(local_fft_size
[ZZ
])+iz
;
937 pmegrid
[pmeidx
] = fftgrid
[fftidx
];
946 wrap_periodic_pmegrid(gmx_pme_t pme
, real
*pmegrid
)
948 int nx
,ny
,nz
,pnx
,pny
,pnz
,ny_x
,overlap
,ix
,iy
,iz
;
954 pnx
= pme
->pmegrid_nx
;
955 pny
= pme
->pmegrid_ny
;
956 pnz
= pme
->pmegrid_nz
;
958 overlap
= pme
->pme_order
- 1;
960 /* Add periodic overlap in z */
961 for(ix
=0; ix
<pnx
; ix
++)
963 for(iy
=0; iy
<pny
; iy
++)
965 for(iz
=0; iz
<overlap
; iz
++)
967 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
968 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
];
973 if (pme
->nnodes_minor
== 1)
975 for(ix
=0; ix
<pnx
; ix
++)
977 for(iy
=0; iy
<overlap
; iy
++)
979 for(iz
=0; iz
<nz
; iz
++)
981 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
982 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
];
988 if (pme
->nnodes_major
== 1)
990 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pny
);
992 for(ix
=0; ix
<overlap
; ix
++)
994 for(iy
=0; iy
<ny_x
; iy
++)
996 for(iz
=0; iz
<nz
; iz
++)
998 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
] +=
999 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
];
1008 unwrap_periodic_pmegrid(gmx_pme_t pme
, real
*pmegrid
)
1010 int nx
,ny
,nz
,pnx
,pny
,pnz
,ny_x
,overlap
,ix
,iy
,iz
;
1016 pnx
= pme
->pmegrid_nx
;
1017 pny
= pme
->pmegrid_ny
;
1018 pnz
= pme
->pmegrid_nz
;
1020 overlap
= pme
->pme_order
- 1;
1022 if (pme
->nnodes_major
== 1)
1024 ny_x
= (pme
->nnodes_minor
== 1 ? ny
: pny
);
1026 for(ix
=0; ix
<overlap
; ix
++)
1028 for(iy
=0; iy
<ny_x
; iy
++)
1030 for(iz
=0; iz
<nz
; iz
++)
1032 pmegrid
[((nx
+ix
)*pny
+iy
)*pnz
+iz
] =
1033 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
1039 if (pme
->nnodes_minor
== 1)
1041 for(ix
=0; ix
<pnx
; ix
++)
1043 for(iy
=0; iy
<overlap
; iy
++)
1045 for(iz
=0; iz
<nz
; iz
++)
1047 pmegrid
[(ix
*pny
+ny
+iy
)*pnz
+iz
] =
1048 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
1054 /* Copy periodic overlap in z */
1055 for(ix
=0; ix
<pnx
; ix
++)
1057 for(iy
=0; iy
<pny
; iy
++)
1059 for(iz
=0; iz
<overlap
; iz
++)
1061 pmegrid
[(ix
*pny
+iy
)*pnz
+nz
+iz
] =
1062 pmegrid
[(ix
*pny
+iy
)*pnz
+iz
];
1069 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
1070 #define DO_BSPLINE(order) \
1071 for(ithx=0; (ithx<order); ithx++) \
1073 index_x = (i0+ithx)*pny*pnz; \
1074 valx = qn*thx[ithx]; \
1076 for(ithy=0; (ithy<order); ithy++) \
1078 valxy = valx*thy[ithy]; \
1079 index_xy = index_x+(j0+ithy)*pnz; \
1081 for(ithz=0; (ithz<order); ithz++) \
1083 index_xyz = index_xy+(k0+ithz); \
1084 grid[index_xyz] += valxy*thz[ithz]; \
1090 static void spread_q_bsplines(gmx_pme_t pme
, pme_atomcomm_t
*atc
,
1094 /* spread charges from home atoms to local grid */
1096 int b
,i
,nn
,n
,ithx
,ithy
,ithz
,i0
,j0
,k0
;
1098 int order
,norder
,index_x
,index_xy
,index_xyz
;
1100 real
*thx
,*thy
,*thz
;
1101 int localsize
, bndsize
;
1103 int pnx
,pny
,pnz
,ndatatot
;
1105 pnx
= pme
->pmegrid_nx
;
1106 pny
= pme
->pmegrid_ny
;
1107 pnz
= pme
->pmegrid_nz
;
1108 ndatatot
= pnx
*pny
*pnz
;
1110 for(i
=0;i
<ndatatot
;i
++)
1115 order
= pme
->pme_order
;
1117 for(nn
=0; (nn
<atc
->n
);nn
++)
1124 idxptr
= atc
->idx
[n
];
1130 thx
= atc
->theta
[XX
] + norder
;
1131 thy
= atc
->theta
[YY
] + norder
;
1132 thz
= atc
->theta
[ZZ
] + norder
;
1135 case 4: DO_BSPLINE(4); break;
1136 case 5: DO_BSPLINE(5); break;
1137 default: DO_BSPLINE(order
); break;
1144 #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
1145 /* Calculate exponentials through SSE in float precision */
1146 #define CALC_EXPONENTIALS(start,end,r_aligned) \
1149 for(kx=0; kx<end; kx+=4) \
1151 tmp_sse = _mm_load_ps(r_aligned+kx); \
1152 tmp_sse = gmx_mm_exp_ps(tmp_sse); \
1153 _mm_store_ps(r_aligned+kx,tmp_sse); \
1157 #define CALC_EXPONENTIALS(start,end,r) \
1158 for(kx=start; kx<end; kx++) \
1160 r[kx] = exp(r[kx]); \
1165 static int solve_pme_yzx(gmx_pme_t pme
,t_complex
*grid
,
1166 real ewaldcoeff
,real vol
,
1167 gmx_bool bEnerVir
,real
*mesh_energy
,matrix vir
)
1169 /* do recip sum over local cells in grid */
1170 /* y major, z middle, x minor or continuous */
1172 int kx
,ky
,kz
,maxkx
,maxky
,maxkz
;
1173 int nx
,ny
,nz
,iy
,iz
,kxstart
,kxend
;
1175 real factor
=M_PI
*M_PI
/(ewaldcoeff
*ewaldcoeff
);
1176 real ets2
,struct2
,vfactor
,ets2vf
;
1177 real eterm
,d1
,d2
,energy
=0;
1179 real virxx
=0,virxy
=0,virxz
=0,viryy
=0,viryz
=0,virzz
=0;
1180 real rxx
,ryx
,ryy
,rzx
,rzy
,rzz
;
1181 real
*mhx
,*mhy
,*mhz
,*m2
,*denom
,*tmp1
,*m2inv
;
1182 real mhxk
,mhyk
,mhzk
,m2k
;
1185 ivec local_ndata
,local_offset
,local_size
;
1191 /* Dimensions should be identical for A/B grid, so we just use A here */
1192 gmx_parallel_3dfft_complex_limits(pme
->pfft_setupA
,
1198 rxx
= pme
->recipbox
[XX
][XX
];
1199 ryx
= pme
->recipbox
[YY
][XX
];
1200 ryy
= pme
->recipbox
[YY
][YY
];
1201 rzx
= pme
->recipbox
[ZZ
][XX
];
1202 rzy
= pme
->recipbox
[ZZ
][YY
];
1203 rzz
= pme
->recipbox
[ZZ
][ZZ
];
1209 mhx
= pme
->work_mhx
;
1210 mhy
= pme
->work_mhy
;
1211 mhz
= pme
->work_mhz
;
1213 denom
= pme
->work_denom
;
1214 tmp1
= pme
->work_tmp1
;
1215 m2inv
= pme
->work_m2inv
;
1217 for(iy
=0;iy
<local_ndata
[YY
];iy
++)
1219 ky
= iy
+ local_offset
[YY
];
1230 by
= M_PI
*vol
*pme
->bsp_mod
[YY
][ky
];
1232 for(iz
=0;iz
<local_ndata
[ZZ
];iz
++)
1234 kz
= iz
+ local_offset
[ZZ
];
1238 bz
= pme
->bsp_mod
[ZZ
][kz
];
1240 /* 0.5 correction for corner points */
1247 p0
= grid
+ iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
1249 /* We should skip the k-space point (0,0,0) */
1250 if (local_offset
[XX
] > 0 ||
1251 local_offset
[YY
] > 0 || ky
> 0 ||
1254 kxstart
= local_offset
[XX
];
1258 kxstart
= local_offset
[XX
] + 1;
1261 kxend
= local_offset
[XX
] + local_ndata
[XX
];
1265 /* More expensive inner loop, especially because of the storage
1266 * of the mh elements in array's.
1267 * Because x is the minor grid index, all mh elements
1268 * depend on kx for triclinic unit cells.
1271 /* Two explicit loops to avoid a conditional inside the loop */
1272 for(kx
=kxstart
; kx
<maxkx
; kx
++)
1277 mhyk
= mx
* ryx
+ my
* ryy
;
1278 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
1279 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
1284 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
1285 tmp1
[kx
] = -factor
*m2k
;
1288 for(kx
=maxkx
; kx
<kxend
; kx
++)
1293 mhyk
= mx
* ryx
+ my
* ryy
;
1294 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
1295 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
1300 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
1301 tmp1
[kx
] = -factor
*m2k
;
1304 for(kx
=kxstart
; kx
<kxend
; kx
++)
1306 m2inv
[kx
] = 1.0/m2
[kx
];
1308 for(kx
=kxstart
; kx
<kxend
; kx
++)
1310 denom
[kx
] = 1.0/denom
[kx
];
1313 CALC_EXPONENTIALS(kxstart
,kxend
,tmp1
);
1315 for(kx
=kxstart
; kx
<kxend
; kx
++,p0
++)
1320 eterm
= ONE_4PI_EPS0
/pme
->epsilon_r
*tmp1
[kx
]*denom
[kx
];
1325 struct2
= 2.0*(d1
*d1
+d2
*d2
);
1327 tmp1
[kx
] = eterm
*struct2
;
1330 for(kx
=kxstart
; kx
<kxend
; kx
++)
1332 ets2
= corner_fac
*tmp1
[kx
];
1333 vfactor
= (factor
*m2
[kx
] + 1.0)*2.0*m2inv
[kx
];
1336 ets2vf
= ets2
*vfactor
;
1337 virxx
+= ets2vf
*mhx
[kx
]*mhx
[kx
] - ets2
;
1338 virxy
+= ets2vf
*mhx
[kx
]*mhy
[kx
];
1339 virxz
+= ets2vf
*mhx
[kx
]*mhz
[kx
];
1340 viryy
+= ets2vf
*mhy
[kx
]*mhy
[kx
] - ets2
;
1341 viryz
+= ets2vf
*mhy
[kx
]*mhz
[kx
];
1342 virzz
+= ets2vf
*mhz
[kx
]*mhz
[kx
] - ets2
;
1347 /* We don't need to calculate the energy and the virial.
1348 * In this case the triclinic overhead is small.
1351 /* Two explicit loops to avoid a conditional inside the loop */
1353 for(kx
=kxstart
; kx
<maxkx
; kx
++)
1358 mhyk
= mx
* ryx
+ my
* ryy
;
1359 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
1360 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
1361 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
1362 tmp1
[kx
] = -factor
*m2k
;
1365 for(kx
=maxkx
; kx
<kxend
; kx
++)
1370 mhyk
= mx
* ryx
+ my
* ryy
;
1371 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
1372 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
1373 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
1374 tmp1
[kx
] = -factor
*m2k
;
1377 for(kx
=kxstart
; kx
<kxend
; kx
++)
1379 denom
[kx
] = 1.0/denom
[kx
];
1382 CALC_EXPONENTIALS(kxstart
,kxend
,tmp1
);
1384 for(kx
=kxstart
; kx
<kxend
; kx
++,p0
++)
1389 eterm
= ONE_4PI_EPS0
/pme
->epsilon_r
*tmp1
[kx
]*denom
[kx
];
1400 /* Update virial with local values.
1401 * The virial is symmetric by definition.
1402 * this virial seems ok for isotropic scaling, but I'm
1403 * experiencing problems on semiisotropic membranes.
1404 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
1406 vir
[XX
][XX
] = 0.25*virxx
;
1407 vir
[YY
][YY
] = 0.25*viryy
;
1408 vir
[ZZ
][ZZ
] = 0.25*virzz
;
1409 vir
[XX
][YY
] = vir
[YY
][XX
] = 0.25*virxy
;
1410 vir
[XX
][ZZ
] = vir
[ZZ
][XX
] = 0.25*virxz
;
1411 vir
[YY
][ZZ
] = vir
[ZZ
][YY
] = 0.25*viryz
;
1413 /* This energy should be corrected for a charged system */
1414 *mesh_energy
= 0.5*energy
;
1417 /* Return the loop count */
1418 return local_ndata
[YY
]*local_ndata
[ZZ
]*local_ndata
[XX
];
1422 #define DO_FSPLINE(order) \
1423 for(ithx=0; (ithx<order); ithx++) \
1425 index_x = (i0+ithx)*pny*pnz; \
1429 for(ithy=0; (ithy<order); ithy++) \
1431 index_xy = index_x+(j0+ithy)*pnz; \
1436 for(ithz=0; (ithz<order); ithz++) \
1438 gval = grid[index_xy+(k0+ithz)]; \
1439 fxy1 += thz[ithz]*gval; \
1440 fz1 += dthz[ithz]*gval; \
1449 void gather_f_bsplines(gmx_pme_t pme
,real
*grid
,
1450 gmx_bool bClearF
,pme_atomcomm_t
*atc
,real scale
)
1452 /* sum forces for local particles */
1453 int nn
,n
,ithx
,ithy
,ithz
,i0
,j0
,k0
;
1454 int index_x
,index_xy
;
1455 int nx
,ny
,nz
,pnx
,pny
,pnz
;
1457 real tx
,ty
,dx
,dy
,qn
;
1460 real
*thx
,*thy
,*thz
,*dthx
,*dthy
,*dthz
;
1462 real rxx
,ryx
,ryy
,rzx
,rzy
,rzz
;
1465 order
= pme
->pme_order
;
1466 thx
= atc
->theta
[XX
];
1467 thy
= atc
->theta
[YY
];
1468 thz
= atc
->theta
[ZZ
];
1469 dthx
= atc
->dtheta
[XX
];
1470 dthy
= atc
->dtheta
[YY
];
1471 dthz
= atc
->dtheta
[ZZ
];
1475 pnx
= pme
->pmegrid_nx
;
1476 pny
= pme
->pmegrid_ny
;
1477 pnz
= pme
->pmegrid_nz
;
1479 rxx
= pme
->recipbox
[XX
][XX
];
1480 ryx
= pme
->recipbox
[YY
][XX
];
1481 ryy
= pme
->recipbox
[YY
][YY
];
1482 rzx
= pme
->recipbox
[ZZ
][XX
];
1483 rzy
= pme
->recipbox
[ZZ
][YY
];
1484 rzz
= pme
->recipbox
[ZZ
][ZZ
];
1486 for(nn
=0; (nn
<atc
->n
); nn
++)
1489 qn
= scale
*atc
->q
[n
];
1502 idxptr
= atc
->idx
[n
];
1509 /* Pointer arithmetic alert, next six statements */
1510 thx
= atc
->theta
[XX
] + norder
;
1511 thy
= atc
->theta
[YY
] + norder
;
1512 thz
= atc
->theta
[ZZ
] + norder
;
1513 dthx
= atc
->dtheta
[XX
] + norder
;
1514 dthy
= atc
->dtheta
[YY
] + norder
;
1515 dthz
= atc
->dtheta
[ZZ
] + norder
;
1518 case 4: DO_FSPLINE(4); break;
1519 case 5: DO_FSPLINE(5); break;
1520 default: DO_FSPLINE(order
); break;
1523 atc
->f
[n
][XX
] += -qn
*( fx
*nx
*rxx
);
1524 atc
->f
[n
][YY
] += -qn
*( fx
*nx
*ryx
+ fy
*ny
*ryy
);
1525 atc
->f
[n
][ZZ
] += -qn
*( fx
*nx
*rzx
+ fy
*ny
*rzy
+ fz
*nz
*rzz
);
1528 /* Since the energy and not forces are interpolated
1529 * the net force might not be exactly zero.
1530 * This can be solved by also interpolating F, but
1531 * that comes at a cost.
1532 * A better hack is to remove the net force every
1533 * step, but that must be done at a higher level
1534 * since this routine doesn't see all atoms if running
1535 * in parallel. Don't know how important it is? EL 990726
1539 static real
gather_energy_bsplines(gmx_pme_t pme
,real
*grid
,
1540 pme_atomcomm_t
*atc
)
1542 int n
,ithx
,ithy
,ithz
,i0
,j0
,k0
;
1543 int index_x
,index_xy
;
1545 real energy
,pot
,tx
,ty
,qn
,gval
;
1546 real
*thx
,*thy
,*thz
;
1551 order
= pme
->pme_order
;
1554 for(n
=0; (n
<atc
->n
); n
++) {
1558 idxptr
= atc
->idx
[n
];
1565 /* Pointer arithmetic alert, next three statements */
1566 thx
= atc
->theta
[XX
] + norder
;
1567 thy
= atc
->theta
[YY
] + norder
;
1568 thz
= atc
->theta
[ZZ
] + norder
;
1571 for(ithx
=0; (ithx
<order
); ithx
++)
1573 index_x
= (i0
+ithx
)*pme
->pmegrid_ny
*pme
->pmegrid_nz
;
1576 for(ithy
=0; (ithy
<order
); ithy
++)
1578 index_xy
= index_x
+(j0
+ithy
)*pme
->pmegrid_nz
;
1581 for(ithz
=0; (ithz
<order
); ithz
++)
1583 gval
= grid
[index_xy
+(k0
+ithz
)];
1584 pot
+= tx
*ty
*thz
[ithz
]*gval
;
1597 void make_bsplines(splinevec theta
,splinevec dtheta
,int order
,
1598 rvec fractx
[],int nr
,real charge
[],
1599 gmx_bool bFreeEnergy
)
1601 /* construct splines for local atoms */
1604 real
*data
,*ddata
,*xptr
;
1606 for(i
=0; (i
<nr
); i
++) {
1607 /* With free energy we do not use the charge check.
1608 * In most cases this will be more efficient than calling make_bsplines
1609 * twice, since usually more than half the particles have charges.
1611 if (bFreeEnergy
|| charge
[i
] != 0.0) {
1613 for(j
=0; (j
<DIM
); j
++) {
1616 /* dr is relative offset from lower cell limit */
1617 data
=&(theta
[j
][i
*order
]);
1622 for(k
=3; (k
<order
); k
++) {
1624 data
[k
-1]=div
*dr
*data
[k
-2];
1625 for(l
=1; (l
<(k
-1)); l
++) {
1626 data
[k
-l
-1]=div
*((dr
+l
)*data
[k
-l
-2]+(k
-l
-dr
)*
1629 data
[0]=div
*(1-dr
)*data
[0];
1632 ddata
= &(dtheta
[j
][i
*order
]);
1633 ddata
[0] = -data
[0];
1634 for(k
=1; (k
<order
); k
++) {
1635 ddata
[k
]=data
[k
-1]-data
[k
];
1639 data
[order
-1]=div
*dr
*data
[order
-2];
1640 for(l
=1; (l
<(order
-1)); l
++) {
1641 data
[order
-l
-1]=div
*((dr
+l
)*data
[order
-l
-2]+
1642 (order
-l
-dr
)*data
[order
-l
-1]);
1644 data
[0]=div
*(1-dr
)*data
[0];
1651 void make_dft_mod(real
*mod
,real
*data
,int ndata
)
1656 for(i
=0;i
<ndata
;i
++) {
1658 for(j
=0;j
<ndata
;j
++) {
1659 arg
=(2.0*M_PI
*i
*j
)/ndata
;
1660 sc
+=data
[j
]*cos(arg
);
1661 ss
+=data
[j
]*sin(arg
);
1665 for(i
=0;i
<ndata
;i
++)
1667 mod
[i
]=(mod
[i
-1]+mod
[i
+1])*0.5;
1672 void make_bspline_moduli(splinevec bsp_mod
,int nx
,int ny
,int nz
,int order
)
1674 int nmax
=max(nx
,max(ny
,nz
));
1675 real
*data
,*ddata
,*bsp_data
;
1681 snew(bsp_data
,nmax
);
1687 for(k
=3;k
<order
;k
++) {
1690 for(l
=1;l
<(k
-1);l
++)
1691 data
[k
-l
-1]=div
*(l
*data
[k
-l
-2]+(k
-l
)*data
[k
-l
-1]);
1692 data
[0]=div
*data
[0];
1696 for(k
=1;k
<order
;k
++)
1697 ddata
[k
]=data
[k
-1]-data
[k
];
1700 for(l
=1;l
<(order
-1);l
++)
1701 data
[order
-l
-1]=div
*(l
*data
[order
-l
-2]+(order
-l
)*data
[order
-l
-1]);
1702 data
[0]=div
*data
[0];
1706 for(i
=1;i
<=order
;i
++)
1707 bsp_data
[i
]=data
[i
-1];
1709 make_dft_mod(bsp_mod
[XX
],bsp_data
,nx
);
1710 make_dft_mod(bsp_mod
[YY
],bsp_data
,ny
);
1711 make_dft_mod(bsp_mod
[ZZ
],bsp_data
,nz
);
1718 static void setup_coordinate_communication(pme_atomcomm_t
*atc
)
1726 for(i
=1; i
<=nslab
/2; i
++) {
1727 fw
= (atc
->nodeid
+ i
) % nslab
;
1728 bw
= (atc
->nodeid
- i
+ nslab
) % nslab
;
1729 if (n
< nslab
- 1) {
1730 atc
->node_dest
[n
] = fw
;
1731 atc
->node_src
[n
] = bw
;
1734 if (n
< nslab
- 1) {
1735 atc
->node_dest
[n
] = bw
;
1736 atc
->node_src
[n
] = fw
;
1742 int gmx_pme_destroy(FILE *log
,gmx_pme_t
*pmedata
)
1746 fprintf(log
,"Destroying PME data structures.\n");
1749 sfree((*pmedata
)->nnx
);
1750 sfree((*pmedata
)->nny
);
1751 sfree((*pmedata
)->nnz
);
1753 sfree((*pmedata
)->pmegridA
);
1754 sfree((*pmedata
)->fftgridA
);
1755 sfree((*pmedata
)->cfftgridA
);
1756 gmx_parallel_3dfft_destroy((*pmedata
)->pfft_setupA
);
1758 if((*pmedata
)->pmegridB
)
1760 sfree((*pmedata
)->pmegridB
);
1761 sfree((*pmedata
)->fftgridB
);
1762 sfree((*pmedata
)->cfftgridB
);
1763 gmx_parallel_3dfft_destroy((*pmedata
)->pfft_setupB
);
1765 sfree((*pmedata
)->work_mhz
);
1766 sfree((*pmedata
)->work_m2
);
1767 sfree((*pmedata
)->work_denom
);
1768 sfree((*pmedata
)->work_tmp1_alloc
);
1769 sfree((*pmedata
)->work_m2inv
);
1777 static int mult_up(int n
,int f
)
1779 return ((n
+ f
- 1)/f
)*f
;
1783 static double pme_load_imbalance(gmx_pme_t pme
)
1788 nma
= pme
->nnodes_major
;
1789 nmi
= pme
->nnodes_minor
;
1791 n1
= mult_up(pme
->nkx
,nma
)*mult_up(pme
->nky
,nmi
)*pme
->nkz
;
1792 n2
= mult_up(pme
->nkx
,nma
)*mult_up(pme
->nkz
,nmi
)*pme
->nky
;
1793 n3
= mult_up(pme
->nky
,nma
)*mult_up(pme
->nkz
,nmi
)*pme
->nkx
;
1795 /* pme_solve is roughly double the cost of an fft */
1797 return (n1
+ n2
+ 3*n3
)/(double)(6*pme
->nkx
*pme
->nky
*pme
->nkz
);
1800 static void init_atomcomm(gmx_pme_t pme
,pme_atomcomm_t
*atc
, t_commrec
*cr
,
1801 int dimind
,gmx_bool bSpread
)
1805 atc
->dimind
= dimind
;
1810 if (pme
->nnodes
> 1)
1812 atc
->mpi_comm
= pme
->mpi_comm_d
[dimind
];
1813 MPI_Comm_size(atc
->mpi_comm
,&atc
->nslab
);
1814 MPI_Comm_rank(atc
->mpi_comm
,&atc
->nodeid
);
1818 fprintf(debug
,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc
->dimind
,atc
->nslab
,atc
->nodeid
);
1822 atc
->bSpread
= bSpread
;
1823 atc
->pme_order
= pme
->pme_order
;
1827 /* These three allocations are not required for particle decomp. */
1828 snew(atc
->node_dest
,atc
->nslab
);
1829 snew(atc
->node_src
,atc
->nslab
);
1830 setup_coordinate_communication(atc
);
1832 snew(atc
->count
,atc
->nslab
);
1833 snew(atc
->rcount
,atc
->nslab
);
1834 snew(atc
->buf_index
,atc
->nslab
);
1839 init_overlap_comm(pme_overlap_t
* ol
,
1848 int lbnd
,rbnd
,maxlr
,b
,i
;
1851 pme_grid_comm_t
*pgc
;
1853 int fft_start
,fft_end
,send_index1
,recv_index1
;
1856 ol
->mpi_comm
= comm
;
1859 ol
->nnodes
= nnodes
;
1860 ol
->nodeid
= nodeid
;
1862 /* Linear translation of the PME grid wo'nt affect reciprocal space
1863 * calculations, so to optimize we only interpolate "upwards",
1864 * which also means we only have to consider overlap in one direction.
1865 * I.e., particles on this node might also be spread to grid indices
1866 * that belong to higher nodes (modulo nnodes)
1869 snew(ol
->s2g0
,ol
->nnodes
+1);
1870 snew(ol
->s2g1
,ol
->nnodes
);
1871 if (debug
) { fprintf(debug
,"PME slab boundaries:"); }
1872 for(i
=0; i
<nnodes
; i
++)
1874 /* s2g0 the local interpolation grid start.
1875 * s2g1 the local interpolation grid end.
1876 * Because grid overlap communication only goes forward,
1877 * the grid the slabs for fft's should be rounded down.
1879 ol
->s2g0
[i
] = ( i
*ndata
+ 0 )/nnodes
;
1880 ol
->s2g1
[i
] = ((i
+1)*ndata
+ nnodes
-1)/nnodes
+ norder
- 1;
1884 fprintf(debug
," %3d %3d",ol
->s2g0
[i
],ol
->s2g1
[i
]);
1887 ol
->s2g0
[nnodes
] = ndata
;
1888 if (debug
) { fprintf(debug
,"\n"); }
1890 /* Determine with how many nodes we need to communicate the grid overlap */
1896 for(i
=0; i
<nnodes
; i
++)
1898 if ((i
+b
< nnodes
&& ol
->s2g1
[i
] > ol
->s2g0
[i
+b
]) ||
1899 (i
+b
>= nnodes
&& ol
->s2g1
[i
] > ol
->s2g0
[i
+b
-nnodes
] + ndata
))
1905 while (bCont
&& b
< nnodes
);
1906 ol
->noverlap_nodes
= b
- 1;
1908 snew(ol
->send_id
,ol
->noverlap_nodes
);
1909 snew(ol
->recv_id
,ol
->noverlap_nodes
);
1910 for(b
=0; b
<ol
->noverlap_nodes
; b
++)
1912 ol
->send_id
[b
] = (ol
->nodeid
+ (b
+ 1)) % ol
->nnodes
;
1913 ol
->recv_id
[b
] = (ol
->nodeid
- (b
+ 1) + ol
->nnodes
) % ol
->nnodes
;
1915 snew(ol
->comm_data
, ol
->noverlap_nodes
);
1917 for(b
=0; b
<ol
->noverlap_nodes
; b
++)
1919 pgc
= &ol
->comm_data
[b
];
1921 fft_start
= ol
->s2g0
[ol
->send_id
[b
]];
1922 fft_end
= ol
->s2g0
[ol
->send_id
[b
]+1];
1923 if (ol
->send_id
[b
] < nodeid
)
1928 send_index1
= ol
->s2g1
[nodeid
];
1929 send_index1
= min(send_index1
,fft_end
);
1930 pgc
->send_index0
= fft_start
;
1931 pgc
->send_nindex
= max(0,send_index1
- pgc
->send_index0
);
1933 /* We always start receiving to the first index of our slab */
1934 fft_start
= ol
->s2g0
[ol
->nodeid
];
1935 fft_end
= ol
->s2g0
[ol
->nodeid
+1];
1936 recv_index1
= ol
->s2g1
[ol
->recv_id
[b
]];
1937 if (ol
->recv_id
[b
] > nodeid
)
1939 recv_index1
-= ndata
;
1941 recv_index1
= min(recv_index1
,fft_end
);
1942 pgc
->recv_index0
= fft_start
;
1943 pgc
->recv_nindex
= max(0,recv_index1
- pgc
->recv_index0
);
1948 make_gridindex5_to_localindex(int n
,int local_start
,int local_range
,
1949 int **global_to_local
,
1950 real
**fraction_shift
)
1958 for(i
=0; (i
<5*n
); i
++)
1960 /* Determine the global to local grid index */
1961 gtl
[i
] = (i
- local_start
+ n
) % n
;
1962 /* For coordinates that fall within the local grid the fraction
1963 * is correct, we don't need to shift it.
1966 if (local_range
< n
)
1968 /* Due to rounding issues i could be 1 beyond the lower or
1969 * upper boundary of the local grid. Correct the index for this.
1970 * If we shift the index, we need to shift the fraction by
1971 * the same amount in the other direction to not affect
1973 * Note that due to this shifting the weights at the end of
1974 * the spline might change, but that will only involve values
1975 * between zero and values close to the precision of a real,
1976 * which is anyhow the accuracy of the whole mesh calculation.
1978 /* With local_range=0 we should not change i=local_start */
1979 if (i
% n
!= local_start
)
1986 else if (gtl
[i
] == local_range
)
1988 gtl
[i
] = local_range
- 1;
1995 *global_to_local
= gtl
;
1996 *fraction_shift
= fsh
;
2000 gmx_pme_check_grid_restrictions(FILE *fplog
,char dim
,int nnodes
,int *nk
)
2004 if (*nk
% nnodes
!= 0)
2006 nk_new
= nnodes
*(*nk
/nnodes
+ 1);
2008 if (2*nk_new
>= 3*(*nk
))
2010 gmx_fatal(FARGS
,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
2011 dim
,*nk
,dim
,nnodes
,dim
);
2016 fprintf(fplog
,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
2017 dim
,*nk
,dim
,nnodes
,dim
,nk_new
,dim
);
2024 int gmx_pme_init(gmx_pme_t
* pmedata
,
2030 gmx_bool bFreeEnergy
,
2031 gmx_bool bReproducible
)
2035 pme_atomcomm_t
*atc
;
2036 int bufsizex
,bufsizey
,bufsize
;
2040 fprintf(debug
,"Creating PME data structures.\n");
2043 pme
->redist_init
= FALSE
;
2044 pme
->sum_qgrid_tmp
= NULL
;
2045 pme
->sum_qgrid_dd_tmp
= NULL
;
2046 pme
->buf_nalloc
= 0;
2047 pme
->redist_buf_nalloc
= 0;
2050 pme
->bPPnode
= TRUE
;
2052 pme
->nnodes_major
= nnodes_major
;
2053 pme
->nnodes_minor
= nnodes_minor
;
2056 if (nnodes_major
*nnodes_minor
> 1 && PAR(cr
))
2058 pme
->mpi_comm
= cr
->mpi_comm_mygroup
;
2060 MPI_Comm_rank(pme
->mpi_comm
,&pme
->nodeid
);
2061 MPI_Comm_size(pme
->mpi_comm
,&pme
->nnodes
);
2062 if (pme
->nnodes
!= nnodes_major
*nnodes_minor
)
2064 gmx_incons("PME node count mismatch");
2069 if (pme
->nnodes
== 1)
2071 pme
->ndecompdim
= 0;
2072 pme
->nodeid_major
= 0;
2073 pme
->nodeid_minor
= 0;
2077 if (nnodes_minor
== 1)
2080 pme
->mpi_comm_d
[0] = pme
->mpi_comm
;
2081 pme
->mpi_comm_d
[1] = NULL
;
2083 pme
->ndecompdim
= 1;
2084 pme
->nodeid_major
= pme
->nodeid
;
2085 pme
->nodeid_minor
= 0;
2088 else if (nnodes_major
== 1)
2091 pme
->mpi_comm_d
[0] = NULL
;
2092 pme
->mpi_comm_d
[1] = pme
->mpi_comm
;
2094 pme
->ndecompdim
= 1;
2095 pme
->nodeid_major
= 0;
2096 pme
->nodeid_minor
= pme
->nodeid
;
2100 if (pme
->nnodes
% nnodes_major
!= 0)
2102 gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
2104 pme
->ndecompdim
= 2;
2107 MPI_Comm_split(pme
->mpi_comm
,pme
->nodeid
% nnodes_minor
,
2108 pme
->nodeid
,&pme
->mpi_comm_d
[0]); /* My communicator along major dimension */
2109 MPI_Comm_split(pme
->mpi_comm
,pme
->nodeid
/nnodes_minor
,
2110 pme
->nodeid
,&pme
->mpi_comm_d
[1]); /* My communicator along minor dimension */
2112 MPI_Comm_rank(pme
->mpi_comm_d
[0],&pme
->nodeid_major
);
2113 MPI_Comm_size(pme
->mpi_comm_d
[0],&pme
->nnodes_major
);
2114 MPI_Comm_rank(pme
->mpi_comm_d
[1],&pme
->nodeid_minor
);
2115 MPI_Comm_size(pme
->mpi_comm_d
[1],&pme
->nnodes_minor
);
2118 pme
->bPPnode
= (cr
->duty
& DUTY_PP
);
2121 if (ir
->ePBC
== epbcSCREW
)
2123 gmx_fatal(FARGS
,"pme does not (yet) work with pbc = screw");
2126 pme
->bFEP
= ((ir
->efep
!= efepNO
) && bFreeEnergy
);
2130 pme
->pme_order
= ir
->pme_order
;
2131 pme
->epsilon_r
= ir
->epsilon_r
;
2133 /* Currently pme.c supports only the fft5d FFT code.
2134 * Therefore the grid always needs to be divisible by nnodes.
2135 * When the old 1D code is also supported again, change this check.
2137 * This check should be done before calling gmx_pme_init
2138 * and fplog should be passed iso stderr.
2140 if (pme->ndecompdim >= 2)
2142 if (pme
->ndecompdim
>= 1)
2145 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2146 'x',nnodes_major,&pme->nkx);
2147 gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
2148 'y',nnodes_minor,&pme->nky);
2152 if (pme
->nkx
<= pme
->pme_order
*(pme
->nnodes_major
> 1 ? 2 : 1) ||
2153 pme
->nky
<= pme
->pme_order
*(pme
->nnodes_minor
> 1 ? 2 : 1) ||
2154 pme
->nkz
<= pme
->pme_order
)
2156 gmx_fatal(FARGS
,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme
->pme_order
);
2159 if (pme
->nnodes
> 1) {
2163 MPI_Type_contiguous(DIM
, mpi_type
, &(pme
->rvec_mpi
));
2164 MPI_Type_commit(&(pme
->rvec_mpi
));
2167 /* Note that the charge spreading and force gathering, which usually
2168 * takes about the same amount of time as FFT+solve_pme,
2169 * is always fully load balanced
2170 * (unless the charge distribution is inhomogeneous).
2173 imbal
= pme_load_imbalance(pme
);
2174 if (imbal
>= 1.2 && pme
->nodeid_major
== 0 && pme
->nodeid_minor
== 0)
2178 "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
2179 " For optimal PME load balancing\n"
2180 " PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
2181 " and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
2183 (int)((imbal
-1)*100 + 0.5),
2184 pme
->nkx
,pme
->nky
,pme
->nnodes_major
,
2185 pme
->nky
,pme
->nkz
,pme
->nnodes_minor
);
2189 init_overlap_comm(&pme
->overlap
[0],pme
->pme_order
,
2193 pme
->nnodes_major
,pme
->nodeid_major
,pme
->nkx
);
2195 init_overlap_comm(&pme
->overlap
[1],pme
->pme_order
,
2199 pme
->nnodes_minor
,pme
->nodeid_minor
,pme
->nky
);
2201 snew(pme
->bsp_mod
[XX
],pme
->nkx
);
2202 snew(pme
->bsp_mod
[YY
],pme
->nky
);
2203 snew(pme
->bsp_mod
[ZZ
],pme
->nkz
);
2205 /* Allocate data for the interpolation grid, including overlap */
2206 pme
->pmegrid_nx
= pme
->overlap
[0].s2g1
[pme
->nodeid_major
] -
2207 pme
->overlap
[0].s2g0
[pme
->nodeid_major
];
2208 pme
->pmegrid_ny
= pme
->overlap
[1].s2g1
[pme
->nodeid_minor
] -
2209 pme
->overlap
[1].s2g0
[pme
->nodeid_minor
];
2210 pme
->pmegrid_nz
= pme
->nkz
+ pme
->pme_order
- 1;
2212 pme
->pmegrid_start_ix
= pme
->overlap
[0].s2g0
[pme
->nodeid_major
];
2213 pme
->pmegrid_start_iy
= pme
->overlap
[1].s2g0
[pme
->nodeid_minor
];
2214 pme
->pmegrid_start_iz
= 0;
2216 make_gridindex5_to_localindex(pme
->nkx
,
2217 pme
->pmegrid_start_ix
,
2218 pme
->pmegrid_nx
- (pme
->pme_order
-1),
2219 &pme
->nnx
,&pme
->fshx
);
2220 make_gridindex5_to_localindex(pme
->nky
,
2221 pme
->pmegrid_start_iy
,
2222 pme
->pmegrid_ny
- (pme
->pme_order
-1),
2223 &pme
->nny
,&pme
->fshy
);
2224 make_gridindex5_to_localindex(pme
->nkz
,
2225 pme
->pmegrid_start_iz
,
2226 pme
->pmegrid_nz
- (pme
->pme_order
-1),
2227 &pme
->nnz
,&pme
->fshz
);
2229 snew(pme
->pmegridA
,pme
->pmegrid_nx
*pme
->pmegrid_ny
*pme
->pmegrid_nz
);
2231 /* For non-divisible grid we need pme_order iso pme_order-1 */
2232 /* x overlap is copied in place: take padding into account.
2233 * y is always copied through a buffer: we don't need padding in z,
2234 * but we do need the overlap in x because of the communication order.
2236 bufsizex
= pme
->pme_order
*pme
->pmegrid_ny
*pme
->pmegrid_nz
;
2237 bufsizey
= pme
->pme_order
*pme
->pmegrid_nx
*pme
->nkz
;
2238 bufsize
= (bufsizex
>bufsizey
) ? bufsizex
: bufsizey
;
2240 snew(pme
->pmegrid_sendbuf
,bufsize
);
2241 snew(pme
->pmegrid_recvbuf
,bufsize
);
2243 ndata
[0] = pme
->nkx
;
2244 ndata
[1] = pme
->nky
;
2245 ndata
[2] = pme
->nkz
;
2247 /* This routine will allocate the grid data to fit the FFTs */
2248 gmx_parallel_3dfft_init(&pme
->pfft_setupA
,ndata
,
2249 &pme
->fftgridA
,&pme
->cfftgridA
,
2251 pme
->overlap
[0].s2g0
,pme
->overlap
[1].s2g0
,
2256 snew(pme
->pmegridB
,pme
->pmegrid_nx
*pme
->pmegrid_ny
*pme
->pmegrid_nz
);
2257 gmx_parallel_3dfft_init(&pme
->pfft_setupB
,ndata
,
2258 &pme
->fftgridB
,&pme
->cfftgridB
,
2260 pme
->overlap
[0].s2g0
,pme
->overlap
[1].s2g0
,
2264 pme
->pmegridB
= NULL
;
2265 pme
->fftgridB
= NULL
;
2266 pme
->cfftgridB
= NULL
;
2269 make_bspline_moduli(pme
->bsp_mod
,pme
->nkx
,pme
->nky
,pme
->nkz
,pme
->pme_order
);
2271 /* Use atc[0] for spreading */
2272 init_atomcomm(pme
,&pme
->atc
[0],cr
,nnodes_major
> 1 ? 0 : 1,TRUE
);
2273 if (pme
->ndecompdim
>= 2)
2275 init_atomcomm(pme
,&pme
->atc
[1],cr
,1,FALSE
);
2278 if (pme
->nnodes
== 1) {
2279 pme
->atc
[0].n
= homenr
;
2280 pme_realloc_atomcomm_things(&pme
->atc
[0]);
2283 /* Use fft5d, order after FFT is y major, z, x minor */
2284 pme
->work_nalloc
= pme
->nkx
;
2285 snew(pme
->work_mhx
,pme
->work_nalloc
);
2286 snew(pme
->work_mhy
,pme
->work_nalloc
);
2287 snew(pme
->work_mhz
,pme
->work_nalloc
);
2288 snew(pme
->work_m2
,pme
->work_nalloc
);
2289 snew(pme
->work_denom
,pme
->work_nalloc
);
2290 /* Allocate an aligned pointer for SSE operations, including 3 extra
2291 * elements at the end since SSE operates on 4 elements at a time.
2293 snew(pme
->work_tmp1_alloc
,pme
->work_nalloc
+8);
2294 pme
->work_tmp1
= (real
*) (((size_t) pme
->work_tmp1_alloc
+ 16) & (~((size_t) 15)));
2295 snew(pme
->work_m2inv
,pme
->work_nalloc
);
2302 static void spread_on_grid(gmx_pme_t pme
,
2303 pme_atomcomm_t
*atc
,real
*grid
,
2304 gmx_bool bCalcSplines
,gmx_bool bSpread
)
2309 /* Compute fftgrid index for all atoms,
2310 * with help of some extra variables.
2312 calc_interpolation_idx(pme
,atc
);
2314 /* make local bsplines */
2315 make_bsplines(atc
->theta
,atc
->dtheta
,pme
->pme_order
,
2316 atc
->fractx
,atc
->n
,atc
->q
,pme
->bFEP
);
2321 /* put local atoms on grid. */
2322 spread_q_bsplines(pme
,atc
,grid
);
2326 void gmx_pme_calc_energy(gmx_pme_t pme
,int n
,rvec
*x
,real
*q
,real
*V
)
2328 pme_atomcomm_t
*atc
;
2331 if (pme
->nnodes
> 1)
2333 gmx_incons("gmx_pme_calc_energy called in parallel");
2337 gmx_incons("gmx_pme_calc_energy with free energy");
2340 atc
= &pme
->atc_energy
;
2342 atc
->bSpread
= TRUE
;
2343 atc
->pme_order
= pme
->pme_order
;
2345 pme_realloc_atomcomm_things(atc
);
2349 /* We only use the A-charges grid */
2350 grid
= pme
->pmegridA
;
2352 spread_on_grid(pme
,atc
,NULL
,TRUE
,FALSE
);
2354 *V
= gather_energy_bsplines(pme
,grid
,atc
);
2358 static void reset_pmeonly_counters(t_commrec
*cr
,gmx_wallcycle_t wcycle
,
2359 t_nrnb
*nrnb
,t_inputrec
*ir
, gmx_large_int_t step_rel
)
2361 /* Reset all the counters related to performance over the run */
2362 wallcycle_stop(wcycle
,ewcRUN
);
2363 wallcycle_reset_all(wcycle
);
2365 ir
->init_step
+= step_rel
;
2366 ir
->nsteps
-= step_rel
;
2367 wallcycle_start(wcycle
,ewcRUN
);
2371 int gmx_pmeonly(gmx_pme_t pme
,
2372 t_commrec
*cr
, t_nrnb
*nrnb
,
2373 gmx_wallcycle_t wcycle
,
2374 real ewaldcoeff
, gmx_bool bGatherOnly
,
2377 gmx_pme_pp_t pme_pp
;
2380 rvec
*x_pp
=NULL
,*f_pp
=NULL
;
2381 real
*chargeA
=NULL
,*chargeB
=NULL
;
2383 int maxshift_x
=0,maxshift_y
=0;
2384 real energy
,dvdlambda
;
2389 gmx_large_int_t step
,step_rel
;
2392 pme_pp
= gmx_pme_pp_init(cr
);
2397 do /****** this is a quasi-loop over time steps! */
2399 /* Domain decomposition */
2400 natoms
= gmx_pme_recv_q_x(pme_pp
,
2401 &chargeA
,&chargeB
,box
,&x_pp
,&f_pp
,
2402 &maxshift_x
,&maxshift_y
,
2408 /* We should stop: break out of the loop */
2412 step_rel
= step
- ir
->init_step
;
2415 wallcycle_start(wcycle
,ewcRUN
);
2417 wallcycle_start(wcycle
,ewcPMEMESH
);
2421 gmx_pme_do(pme
,0,natoms
,x_pp
,f_pp
,chargeA
,chargeB
,box
,
2422 cr
,maxshift_x
,maxshift_y
,nrnb
,wcycle
,vir
,ewaldcoeff
,
2423 &energy
,lambda
,&dvdlambda
,
2424 GMX_PME_DO_ALL_F
| (bEnerVir
? GMX_PME_CALC_ENER_VIR
: 0));
2426 cycles
= wallcycle_stop(wcycle
,ewcPMEMESH
);
2428 gmx_pme_send_force_vir_ener(pme_pp
,
2429 f_pp
,vir
,energy
,dvdlambda
,
2434 if (step_rel
== wcycle_get_reset_counters(wcycle
))
2436 /* Reset all the counters related to performance over the run */
2437 reset_pmeonly_counters(cr
,wcycle
,nrnb
,ir
,step_rel
);
2438 wcycle_set_reset_counters(wcycle
, 0);
2441 } /***** end of quasi-loop, we stop with the break above */
2447 int gmx_pme_do(gmx_pme_t pme
,
2448 int start
, int homenr
,
2450 real
*chargeA
, real
*chargeB
,
2451 matrix box
, t_commrec
*cr
,
2452 int maxshift_x
, int maxshift_y
,
2453 t_nrnb
*nrnb
, gmx_wallcycle_t wcycle
,
2454 matrix vir
, real ewaldcoeff
,
2455 real
*energy
, real lambda
,
2456 real
*dvdlambda
, int flags
)
2458 int q
,d
,i
,j
,ntot
,npme
;
2462 pme_atomcomm_t
*atc
=NULL
;
2466 real
*charge
=NULL
,*q_d
,vol
;
2470 gmx_parallel_3dfft_t pfft_setup
;
2472 t_complex
* cfftgrid
;
2474 if (pme
->nnodes
> 1) {
2477 if (atc
->npd
> atc
->pd_nalloc
) {
2478 atc
->pd_nalloc
= over_alloc_dd(atc
->npd
);
2479 srenew(atc
->pd
,atc
->pd_nalloc
);
2481 atc
->maxshift
= (atc
->dimind
==0 ? maxshift_x
: maxshift_y
);
2485 /* This could be necessary for TPI */
2486 pme
->atc
[0].n
= homenr
;
2489 for(q
=0; q
<(pme
->bFEP
? 2 : 1); q
++) {
2491 grid
= pme
->pmegridA
;
2492 fftgrid
= pme
->fftgridA
;
2493 cfftgrid
= pme
->cfftgridA
;
2494 pfft_setup
= pme
->pfft_setupA
;
2495 charge
= chargeA
+start
;
2497 grid
= pme
->pmegridB
;
2498 fftgrid
= pme
->fftgridB
;
2499 cfftgrid
= pme
->cfftgridB
;
2500 pfft_setup
= pme
->pfft_setupB
;
2501 charge
= chargeB
+start
;
2503 /* Unpack structure */
2505 fprintf(debug
,"PME: nnodes = %d, nodeid = %d\n",
2506 cr
->nnodes
,cr
->nodeid
);
2507 fprintf(debug
,"Grid = %p\n",(void*)grid
);
2509 gmx_fatal(FARGS
,"No grid!");
2513 m_inv_ur0(box
,pme
->recipbox
);
2515 if (pme
->nnodes
== 1) {
2517 if (DOMAINDECOMP(cr
)) {
2519 pme_realloc_atomcomm_things(atc
);
2525 wallcycle_start(wcycle
,ewcPME_REDISTXF
);
2526 for(d
=pme
->ndecompdim
-1; d
>=0; d
--)
2528 if (d
== pme
->ndecompdim
-1)
2536 n_d
= pme
->atc
[d
+1].n
;
2542 if (atc
->npd
> atc
->pd_nalloc
) {
2543 atc
->pd_nalloc
= over_alloc_dd(atc
->npd
);
2544 srenew(atc
->pd
,atc
->pd_nalloc
);
2546 atc
->maxshift
= (atc
->dimind
==0 ? maxshift_x
: maxshift_y
);
2547 pme_calc_pidx(n_d
,pme
->recipbox
,x_d
,atc
);
2550 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2551 /* Redistribute x (only once) and qA or qB */
2552 if (DOMAINDECOMP(cr
)) {
2553 dd_pmeredist_x_q(pme
, n_d
, q
==0, x_d
, q_d
, atc
);
2555 pmeredist_pd(pme
, TRUE
, n_d
, q
==0, x_d
, q_d
, atc
);
2560 wallcycle_stop(wcycle
,ewcPME_REDISTXF
);
2564 fprintf(debug
,"Node= %6d, pme local particles=%6d\n",
2567 if (flags
& GMX_PME_SPREAD_Q
)
2569 wallcycle_start(wcycle
,ewcPME_SPREADGATHER
);
2571 /* Spread the charges on a grid */
2572 GMX_MPE_LOG(ev_spread_on_grid_start
);
2574 /* Spread the charges on a grid */
2575 spread_on_grid(pme
,&pme
->atc
[0],grid
,q
==0,TRUE
);
2576 GMX_MPE_LOG(ev_spread_on_grid_finish
);
2580 inc_nrnb(nrnb
,eNR_WEIGHTS
,DIM
*atc
->n
);
2582 inc_nrnb(nrnb
,eNR_SPREADQBSP
,
2583 pme
->pme_order
*pme
->pme_order
*pme
->pme_order
*atc
->n
);
2585 wrap_periodic_pmegrid(pme
,grid
);
2587 /* sum contributions to local grid from other nodes */
2589 if (pme
->nnodes
> 1) {
2590 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2591 gmx_sum_qgrid_dd(pme
,grid
,GMX_SUM_QGRID_FORWARD
);
2597 copy_pmegrid_to_fftgrid(pme
,grid
,fftgrid
);
2599 wallcycle_stop(wcycle
,ewcPME_SPREADGATHER
);
2602 if (flags
& GMX_PME_SOLVE
)
2605 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2606 GMX_MPE_LOG(ev_gmxfft3d_start
);
2607 wallcycle_start(wcycle
,ewcPME_FFT
);
2608 gmx_parallel_3dfft_execute(pfft_setup
,GMX_FFT_REAL_TO_COMPLEX
,fftgrid
,cfftgrid
);
2609 wallcycle_stop(wcycle
,ewcPME_FFT
);
2610 GMX_MPE_LOG(ev_gmxfft3d_finish
);
2613 /* solve in k-space for our local cells */
2615 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2616 GMX_MPE_LOG(ev_solve_pme_start
);
2617 wallcycle_start(wcycle
,ewcPME_SOLVE
);
2619 solve_pme_yzx(pme
,cfftgrid
,ewaldcoeff
,vol
,
2620 flags
& GMX_PME_CALC_ENER_VIR
,
2621 &energy_AB
[q
],vir_AB
[q
]);
2622 wallcycle_stop(wcycle
,ewcPME_SOLVE
);
2624 GMX_MPE_LOG(ev_solve_pme_finish
);
2625 inc_nrnb(nrnb
,eNR_SOLVEPME
,loop_count
);
2628 if ((flags
& GMX_PME_CALC_F
) ||
2629 (flags
& GMX_PME_CALC_POT
))
2633 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2634 GMX_MPE_LOG(ev_gmxfft3d_start
);
2636 wallcycle_start(wcycle
,ewcPME_FFT
);
2637 gmx_parallel_3dfft_execute(pfft_setup
,GMX_FFT_COMPLEX_TO_REAL
,cfftgrid
,fftgrid
);
2638 wallcycle_stop(wcycle
,ewcPME_FFT
);
2641 GMX_MPE_LOG(ev_gmxfft3d_finish
);
2643 if (pme
->nodeid
== 0)
2645 ntot
= pme
->nkx
*pme
->nky
*pme
->nkz
;
2646 npme
= ntot
*log((real
)ntot
)/log(2.0);
2647 inc_nrnb(nrnb
,eNR_FFT
,2*npme
);
2650 wallcycle_start(wcycle
,ewcPME_SPREADGATHER
);
2652 copy_fftgrid_to_pmegrid(pme
,fftgrid
,grid
);
2654 /* distribute local grid to all nodes */
2656 if (pme
->nnodes
> 1) {
2657 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2658 gmx_sum_qgrid_dd(pme
,grid
,GMX_SUM_QGRID_BACKWARD
);
2663 unwrap_periodic_pmegrid(pme
,grid
);
2666 if (flags
& GMX_PME_CALC_F
)
2668 /* interpolate forces for our local atoms */
2669 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2670 GMX_MPE_LOG(ev_gather_f_bsplines_start
);
2674 /* If we are running without parallelization,
2675 * atc->f is the actual force array, not a buffer,
2676 * therefore we should not clear it.
2678 bClearF
= (q
== 0 && PAR(cr
));
2679 gather_f_bsplines(pme
,grid
,bClearF
,&pme
->atc
[0],
2680 pme
->bFEP
? (q
==0 ? 1.0-lambda
: lambda
) : 1.0);
2683 GMX_MPE_LOG(ev_gather_f_bsplines_finish
);
2685 inc_nrnb(nrnb
,eNR_GATHERFBSP
,
2686 pme
->pme_order
*pme
->pme_order
*pme
->pme_order
*pme
->atc
[0].n
);
2687 wallcycle_stop(wcycle
,ewcPME_SPREADGATHER
);
2691 if ((flags
& GMX_PME_CALC_F
) && pme
->nnodes
> 1) {
2692 wallcycle_start(wcycle
,ewcPME_REDISTXF
);
2693 for(d
=0; d
<pme
->ndecompdim
; d
++)
2696 if (d
== pme
->ndecompdim
- 1)
2703 n_d
= pme
->atc
[d
+1].n
;
2704 f_d
= pme
->atc
[d
+1].f
;
2706 GMX_BARRIER(cr
->mpi_comm_mygroup
);
2707 if (DOMAINDECOMP(cr
)) {
2708 dd_pmeredist_f(pme
,atc
,n_d
,f_d
,
2709 d
==pme
->ndecompdim
-1 && pme
->bPPnode
);
2711 pmeredist_pd(pme
, FALSE
, n_d
, TRUE
, f_d
, NULL
, atc
);
2715 wallcycle_stop(wcycle
,ewcPME_REDISTXF
);
2720 *energy
= energy_AB
[0];
2721 m_add(vir
,vir_AB
[0],vir
);
2723 *energy
= (1.0-lambda
)*energy_AB
[0] + lambda
*energy_AB
[1];
2724 *dvdlambda
+= energy_AB
[1] - energy_AB
[0];
2725 for(i
=0; i
<DIM
; i
++)
2726 for(j
=0; j
<DIM
; j
++)
2727 vir
[i
][j
] += (1.0-lambda
)*vir_AB
[0][i
][j
] + lambda
*vir_AB
[1][i
][j
];
2731 fprintf(debug
,"PME mesh energy: %g\n",*energy
);