1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This file is part of Gromacs Copyright (c) 1991-2008
5 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Gnomes, ROck Monsters And Chili Sauce

/* Margin for setting up the DD grid */
#define DD_GRID_MARGIN_PRES_SCALE 1.05

static int factorize(int n,int **fac,int **mfac)
    /* Decompose n in factors */
            if (ndiv == 0 || (*fac)[ndiv-1] != d)
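    /* Illustrative example, added for clarity and not part of the original
     * source: assuming the usual trial-division loop around the check above
     * (not all of it is shown in this excerpt), factorize(12,&fac,&mfac)
     * would return ndiv = 2 with fac = {2,3} and mfac = {2,1}, i.e.
     * 12 = 2^2 * 3; a new fac[] entry is started only when the divisor d
     * differs from the factor stored last.
     */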

static bool fits_pme_ratio(int nnodes,int npme,float ratio)
{
    return ((double)npme/(double)nnodes > 0.95*ratio);
}
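/* Worked example, added for clarity and not part of the original source:
 * with an estimated relative PME load of 0.25 on 32 nodes, npme/nnodes must
 * exceed 0.95*0.25 = 0.2375, so at least 8 PME-only nodes are required for
 * this test to succeed.
 */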

static bool fits_pp_pme_perf(FILE *fplog,
                             t_inputrec *ir,matrix box,gmx_mtop_t *mtop,
                             int nnodes,int npme,float ratio)
    int ndiv,*div,*mdiv,ldiv;

    ndiv = factorize(nnodes-npme,&div,&mdiv);
    /* The check below gives a reasonable division:
     * factor 5 allowed at 5 or more PP nodes,
     * factor 7 allowed at 49 or more PP nodes.
     */
    if (ldiv > 3 + (int)(pow(nnodes-npme,1.0/3.0) + 0.5))
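    /* Worked check of the thresholds quoted above (added, not in the
     * original): with 5 PP nodes the limit is 3 + (int)(5^(1/3) + 0.5) = 5,
     * so a largest prime factor of 5 just passes; with 49 PP nodes the limit
     * is 3 + (int)(49^(1/3) + 0.5) = 7, so a factor of 7 just passes, while
     * at 7 PP nodes the limit is still 5 and a factor of 7 is rejected.
     */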
    /* Does this division give a reasonable PME load? */
    return (fits_pme_ratio(nnodes,npme,ratio) &&
            pme_inconvenient_nnodes(ir->nkx,ir->nky,npme) <= 1);

static int guess_npme(FILE *fplog,gmx_mtop_t *mtop,t_inputrec *ir,matrix box,
                      int nnodes)
    ratio = pme_load_estimate(mtop,ir,box);

        fprintf(fplog,"Guess for relative PME load: %.2f\n",ratio);
    /* We assume the optimal node ratio is close to the load ratio.
     * The communication load is neglected,
     * but (hopefully) this will balance out between PP and PME.
     */
    if (!fits_pme_ratio(nnodes,nnodes/2,ratio))
        /* We would need more than nnodes/2 PME-only nodes,
         * which is not possible. Since the PME load is very high,
         * we will not lose much performance when all nodes do PME.
         */
    /* First try to find npme as a factor of nnodes up to nnodes/3.
     * We start with a minimum PME node fraction of 1/16
     * and avoid ratios which lead to large prime factors in nnodes-npme.
     */
    npme = (nnodes + 15)/16;
    while (npme <= nnodes/3) {
        if (nnodes % npme == 0)
            /* Note that fits_perf might change the PME grid;
             * in the current implementation it does not.
             */
            if (fits_pp_pme_perf(fplog,ir,box,mtop,nnodes,npme,ratio))
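            /* Worked example of this first search (added, not in the
             * original): on 64 nodes the loop starts at npme = (64+15)/16 = 4
             * and, while npme <= 64/3 = 21, only values that divide 64
             * (4, 8 and 16) reach fits_pp_pme_perf; presumably the first
             * count that passes is kept.
             */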
    /* Try any possible number for npme */
    while (npme <= nnodes/2)
        /* Note that fits_perf may change the PME grid */
        if (fits_pp_pme_perf(fplog,ir,box,mtop,nnodes,npme,ratio))
    gmx_fatal(FARGS,"Could not find an appropriate number of separate PME nodes, i.e. >= %5f*#nodes (%d) and <= #nodes/2 (%d), that gives reasonable performance (grid_x=%d, grid_y=%d).\n"
              "Use the -npme option of mdrun or change the number of processors or the PME grid dimensions; see the manual for details.",
              ratio,(int)(0.95*ratio*nnodes+0.5),nnodes/2,ir->nkx,ir->nky);
    /* Keep the compiler happy */
167 "Will use %d particle-particle and %d PME only nodes\n"
168 "This is a guess, check the performance at the end of the log file\n",
172 "Will use %d particle-particle and %d PME only nodes\n"
173 "This is a guess, check the performance at the end of the log file\n",

static int lcd(int n1,int n2)
    for(i=2; (i<=n1 && i<=n2); i++)
        if (n1 % i == 0 && n2 % i == 0)
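/* Note added for clarity (not in the original): despite its name, this
 * routine returns the largest common divisor found by trial division, e.g.
 * lcd(6,4) = 2 and lcd(12,8) = 4; it is used further down to judge how well
 * the number of PME nodes matches the x dimension of the PP grid.
 */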

real comm_box_frac(ivec dd_nc,real cutoff,gmx_ddbox_t *ddbox)
        bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];
        nw[i] = dd_nc[i]*cutoff/bt[i];
        for(j=i+1; j<DIM; j++)
                comm_vol += nw[i]*nw[j]*M_PI/4;
                for(k=j+1; k<DIM; k++)
                        comm_vol += nw[i]*nw[j]*nw[k]*M_PI/6;
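        /* Geometric reading of these terms (comment added, not in the
         * original): nw[i] is the cut-off expressed as a fraction of the
         * cell size in dimension i, so the pairwise M_PI/4 terms match the
         * relative volume of a quarter-cylinder halo along a cell edge and
         * the triple M_PI/6 term matches a sphere octant at a cell corner.
         */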
    /* Normalize by the number of PP nodes */

static bool inhomogeneous_z(const t_inputrec *ir)
{
    return ((EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) &&
            ir->ePBC==epbcXYZ && ir->ewald_geometry==eewg3DC);
}

static float comm_cost_est(gmx_domdec_t *dd,real limit,real cutoff,
                           matrix box,gmx_ddbox_t *ddbox,t_inputrec *ir,
                           float pbcdxr,int npme,ivec nc)
    float comm_vol,comm_vol_pme,cost_pbcdx;
    /* This is the cost of a pbc_dx call relative to the cost
     * of communicating the coordinate and force of an atom.
     * This will be machine dependent.
     * These factors are for x86 with SMP or Infiniband.
     */
    float pbcdx_rect_fac = 0.1;
    float pbcdx_tric_fac = 0.2;

    /* Check the DD algorithm restrictions */
    if ((ir->ePBC == epbcXY && ir->nwall < 2 && nc[ZZ] > 1) ||
        (ir->ePBC == epbcSCREW && (nc[XX] == 1 || nc[YY] > 1 || nc[ZZ] > 1)))

    if (inhomogeneous_z(ir) && nc[ZZ] > 1)

    /* Check if the triclinic requirements are met */
        for(j=i+1; j<ddbox->npbcdim; j++)
            if (box[j][i] != 0 || ir->deform[j][i] != 0 ||
                (ir->epc != epcNO && ir->compress[j][i] != 0))
                if (nc[j] > 1 && nc[i] == 1)
        bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i];

        /* Without PBC there are no cell size limits with 2 cells */
        if (!(i >= ddbox->npbcdim && nc[i] <= 2) && bt[i] < nc[i]*limit)
        /* When two dimensions are (nearly) equal, use more cells
         * for the smallest index, so the decomposition does not
         * depend sensitively on the rounding of the box elements.
         */
        if (npme == 0 || i != XX)
            for(j=i+1; j<DIM; j++)
                if (fabs(bt[j] - bt[i]) < 0.01*bt[i] && nc[j] > nc[i])
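                /* Illustration (added, not in the original): with the 1%
                 * tolerance above, box lengths of e.g. 5.00 and 5.03 nm
                 * count as equal, and a grid that gives the higher-index of
                 * the two dimensions more cells is then flagged here (its
                 * handling is in code not shown in this excerpt).
                 */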
    comm_vol = comm_box_frac(nc,cutoff,ddbox);
    /* Determine the largest volume that a PME-only node needs to communicate */
    if ((npme > 0) && (nc[XX] % npme != 0))
        comm_vol_pme = (npme==2 ? 1.0/3.0 : 0.5);
        comm_vol_pme = 1.0 - lcd(nc[XX],npme)/(double)npme;

    /* Normalize by the number of PME only nodes */
    comm_vol_pme /= npme;
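    /* Note added for clarity (not in the original): because lcd() returns
     * the largest common divisor, the expression above is 0 when npme
     * evenly divides nc[XX] (e.g. nc[XX] = 8, npme = 4 gives 1 - 4/4 = 0)
     * and grows when the x grid and the PME node count share fewer factors
     * (e.g. lcd(9,6) = 3 gives 1 - 3/6 = 0.5), before the division by npme.
     */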

    /* Add cost of pbc_dx for bondeds */
    if ((nc[XX] == 1 || nc[YY] == 1) || (nc[ZZ] == 1 && ir->ePBC != epbcXY))
        if ((ddbox->tric_dir[XX] && nc[XX] == 1) ||
            (ddbox->tric_dir[YY] && nc[YY] == 1))
            cost_pbcdx = pbcdxr*pbcdx_tric_fac/npp;

            cost_pbcdx = pbcdxr*pbcdx_rect_fac/npp;
355 "nc %2d %2d %2d vol pp %6.4f pbcdx %6.4f pme %6.4f tot %6.4f\n",
356 nc
[XX
],nc
[YY
],nc
[ZZ
],
357 comm_vol
,cost_pbcdx
,comm_vol_pme
,
358 comm_vol
+ cost_pbcdx
+ comm_vol_pme
);
361 return comm_vol
+ cost_pbcdx
+ comm_vol_pme
;

static void assign_factors(gmx_domdec_t *dd,
                           real limit,real cutoff,
                           matrix box,gmx_ddbox_t *ddbox,t_inputrec *ir,
                           float pbcdxr,int npme,
                           int ndiv,int *div,int *mdiv,ivec ir_try,ivec opt)
        ce = comm_cost_est(dd,limit,cutoff,box,ddbox,ir,pbcdxr,npme,ir_try);
        if (ce >= 0 && (opt[XX] == 0 ||
                        ce < comm_cost_est(dd,limit,cutoff,box,ddbox,ir,pbcdxr,
                                           npme,opt)))
            copy_ivec(ir_try,opt);

    for(x=mdiv[0]; x>=0; x--)
            ir_try[XX] *= div[0];
        for(y=mdiv[0]-x; y>=0; y--)
                ir_try[YY] *= div[0];
            for(i=0; i<mdiv[0]-x-y; i++)
                ir_try[ZZ] *= div[0];

            assign_factors(dd,limit,cutoff,box,ddbox,ir,pbcdxr,npme,
                           ndiv-1,div+1,mdiv+1,ir_try,opt);

            for(i=0; i<mdiv[0]-x-y; i++)
                ir_try[ZZ] /= div[0];

                ir_try[YY] /= div[0];

            ir_try[XX] /= div[0];
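/* Sketch of what the recursion above enumerates (comment added for clarity,
 * not part of the original): the prime factors of the PP node count are
 * handed out one at a time, so for npp = 12 (div = {2,3}, mdiv = {2,1})
 * every ordered grid (nc[XX],nc[YY],nc[ZZ]) with product 12, such as 4x3x1,
 * 2x2x3 or 1x12x1, is generated, scored with comm_cost_est and the cheapest
 * valid one is kept in opt.
 */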

static real optimize_ncells(FILE *fplog,
                            int nnodes_tot,int npme_only,
                            bool bDynLoadBal,real dlb_scale,
                            gmx_mtop_t *mtop,matrix box,gmx_ddbox_t *ddbox,
                            real cellsize_limit,real cutoff,
                            bool bInterCGBondeds,bool bInterCGMultiBody,
    int npp,npme,ndiv,*div,*mdiv,d,nmax;
    limit = cellsize_limit;
    npp = nnodes_tot - npme_only;
    if (EEL_PME(ir->coulombtype))
        npme = (npme_only > 0 ? npme_only : npp);
    /* For Ewald exclusions pbc_dx is not called */
    bExcl_pbcdx = (IR_EXCL_FORCES(*ir) && !EEL_FULL(ir->coulombtype));
    pbcdxr = (double)n_bonded_dx(mtop,bExcl_pbcdx)/(double)mtop->natoms;
        /* Every molecule is a single charge group: no pbc required */

    /* Add a margin for DLB and/or pressure scaling */
        if (dlb_scale >= 1.0)
            gmx_fatal(FARGS,"The value for option -dds should be smaller than 1");

            fprintf(fplog,"Scaling the initial minimum size with 1/%g (option -dds) = %g\n",dlb_scale,1/dlb_scale);
    else if (ir->epc != epcNO)
            fprintf(fplog,"To account for pressure scaling, scaling the initial minimum size with %g\n",DD_GRID_MARGIN_PRES_SCALE);
        limit *= DD_GRID_MARGIN_PRES_SCALE;
        fprintf(fplog,"Optimizing the DD grid for %d cells with a minimum initial size of %.3f nm\n",npp,limit);
        if (inhomogeneous_z(ir))
            fprintf(fplog,"Ewald_geometry=%s: assuming inhomogeneous particle distribution in z, will not decompose in z.\n",eewg_names[ir->ewald_geometry]);
            fprintf(fplog,"The maximum allowed number of cells is:");
                nmax = (int)(ddbox->box_size[d]*ddbox->skew_fac[d]/limit);
                if (d >= ddbox->npbcdim && nmax < 2)
                if (d == ZZ && inhomogeneous_z(ir))
                fprintf(fplog," %c %d",'X' + d,nmax);
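                /* Worked example (added, not in the original): for a 10 nm
                 * box vector with skew_fac 1 and a minimum cell size limit
                 * of 1.2 nm, nmax = (int)(10/1.2) = 8 cells would be allowed
                 * in that dimension.
                 */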
        fprintf(debug,"Average nr of pbc_dx calls per atom %.2f\n",pbcdxr);
    /* Decompose npp in factors */
    ndiv = factorize(npp,&div,&mdiv);
    assign_factors(dd,limit,cutoff,box,ddbox,ir,pbcdxr,
                   npme,ndiv,div,mdiv,itry,nc);

real dd_choose_grid(FILE *fplog,
                    t_commrec *cr,gmx_domdec_t *dd,t_inputrec *ir,
                    gmx_mtop_t *mtop,matrix box,gmx_ddbox_t *ddbox,
                    bool bDynLoadBal,real dlb_scale,
                    real cellsize_limit,real cutoff_dd,
                    bool bInterCGBondeds,bool bInterCGMultiBody)
    if (EEL_PME(ir->coulombtype))
        if (cr->npmenodes >= 0)
            if (cr->nnodes <= 2 && cr->npmenodes > 0)
                gmx_fatal(FARGS,
                          "Cannot have separate PME nodes with 2 or fewer nodes");
            if (cr->nnodes < 12 &&
                pme_inconvenient_nnodes(ir->nkx,ir->nky,cr->nnodes) == 0)
                cr->npmenodes = guess_npme(fplog,mtop,ir,box,cr->nnodes);
            fprintf(fplog,"Using %d separate PME nodes\n",cr->npmenodes);
        if (cr->npmenodes < 0)
        limit = optimize_ncells(fplog,cr->nnodes,cr->npmenodes,
                                bDynLoadBal,dlb_scale,
                                mtop,box,ddbox,ir,dd,
                                cellsize_limit,cutoff_dd,
                                bInterCGBondeds,bInterCGMultiBody,

    /* Communicate the information set by the master to all nodes */
    gmx_bcast(sizeof(dd->nc),dd->nc,cr);
    if (EEL_PME(ir->coulombtype))
        gmx_bcast(sizeof(ir->nkx),&ir->nkx,cr);
        gmx_bcast(sizeof(ir->nky),&ir->nky,cr);
        gmx_bcast(sizeof(cr->npmenodes),&cr->npmenodes,cr);