/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of Gromacs        Copyright (c) 1991-2008
 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * And Hey:
 * Gnomes, ROck Monsters And Chili Sauce
 */
#include "domdec_network.h"
#include "chargegroup.h"
#include "gmx_wallcycle.h"
#include "mtop_util.h"
#include "gmx_ga2la.h"
#define DDRANK(dd,rank)    (rank)
#define DDMASTERRANK(dd)   (dd->masterrank)
typedef struct gmx_domdec_master
{
    /* The cell boundaries */
    real **cell_x;
    /* The global charge group division */
    int  *ncg;     /* Number of home charge groups for each node */
    int  *index;   /* Index of nnodes+1 into cg                  */
    int  *cg;      /* Global charge group index                  */
    int  *nat;     /* Number of home atoms for each node.        */
    int  *ibuf;    /* Buffer for communication                   */
    rvec *vbuf;    /* Buffer for state scattering and gathering  */
} gmx_domdec_master_t;
typedef struct
{
    /* The numbers of charge groups to send and receive for each cell
     * that requires communication, the last entry contains the total
     * number of atoms that needs to be communicated.
     */
    int nsend[DD_MAXIZONE+2];
    int nrecv[DD_MAXIZONE+2];
    /* The charge groups to send */
    int *index;
    int nalloc;
    /* The atom range for non-in-place communication */
    int cell2at0[DD_MAXIZONE];
    int cell2at1[DD_MAXIZONE];
} gmx_domdec_ind_t;

typedef struct
{
    int  np;                   /* Number of grid pulses in this dimension */
    int  np_dlb;               /* For dlb, for use with edlbAUTO          */
    gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
    int  np_nalloc;
    bool bInPlace;             /* Can we communicate in place?            */
} gmx_domdec_comm_dim_t;
typedef struct
{
    bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
    real *cell_f;      /* State var.: cell boundaries, box relative      */
    real *old_cell_f;  /* Temp. var.: old cell size                      */
    real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
    real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
    real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
    real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
    bool bLimited;     /* State var.: is DLB limited in this dim and row */
    real *buf_ncd;     /* Temp. var.                                     */
} gmx_domdec_root_t;
#define DD_NLOAD_MAX 9

/* Here floats are accurate enough, since these variables
 * only influence the load balancing, not the actual MD results.
 */
typedef struct
{
    int    nload;
    float  *load;
    float  sum;
    float  max;
    float  sum_m;
    float  cvol_min;
    float  mdf;
    float  pme;
    int    flags;
} gmx_domdec_load_t;
typedef struct
{
    int  nsc;
    int  ind_gl;
    int  ind;
} gmx_cgsort_t;

typedef struct
{
    gmx_cgsort_t *sort1,*sort2;
    int  sort_nalloc;
    gmx_cgsort_t *sort_new;
    int  sort_new_nalloc;
    int  *ibuf;
    int  ibuf_nalloc;
} gmx_domdec_sort_t;
/* This enum determines the order of the coordinates.
 * ddnatHOME and ddnatZONE should be first and second,
 * the others can be ordered as wanted.
 */
enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };

enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
typedef struct
{
    int  dimind;      /* The dimension index                                */
    int  nslab;       /* The number of PME slabs in this dimension          */
    real *slb_dim_f;  /* Cell sizes for determining the PME comm. with SLB  */
    int  *pp_min;     /* The minimum pp node location, size nslab           */
    int  *pp_max;     /* The maximum pp node location, size nslab           */
    int  maxshift;    /* The maximum shift for coordinate redistribution in PME */
} gmx_ddpme_t;
typedef struct
{
    real min0;    /* The minimum bottom of this zone                        */
    real max1;    /* The maximum top of this zone                           */
    real mch0;    /* The maximum bottom communication height for this zone  */
    real mch1;    /* The maximum top communication height for this zone     */
    real p1_0;    /* The bottom value of the first cell in this zone        */
    real p1_1;    /* The top value of the first cell in this zone           */
} gmx_ddzone_t;
typedef struct gmx_domdec_comm
{
    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
     * unless stated otherwise.
     */

    /* The number of decomposition dimensions for PME, 0: no PME */
    int  npmedecompdim;
    /* The number of nodes doing PME (PP/PME or only PME) */
    int  npmenodes;

    /* The communication setup including the PME only nodes */
    bool bCartesianPP_PME;
    ivec ntot;
    int  cartpmedim;
    int  *pmenodes;          /* size npmenodes                        */
    int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
                              * but not with bCartesianPP_PME         */
    gmx_ddpme_t ddpme[2];

    /* The DD particle-particle nodes only */
    bool bCartesianPP;
    int  *ddindex2ddnodeid;  /* size npmenodes, only with bCartesianPP_PME */

    /* The global charge groups */
    t_block cgs_gl;

    /* Should we sort the cgs */
    int  nstSortCG;
    gmx_domdec_sort_t *sort;

    /* Are there bonded and multi-body interactions between charge groups? */
    bool bInterCGBondeds;
    bool bInterCGMultiBody;

    /* Data for the optional bonded interaction atom communication range */
    bool bBondComm;
    char *bLocalCG;

    /* Are we actually using DLB? */
    bool bDynLoadBal;

    /* Cell sizes for static load balancing, first index cartesian */
    real **slb_frac;

    /* The width of the communicated boundaries */
    real cutoff_mbody;
    real cutoff;
    /* The minimum cell size (including triclinic correction) */
    rvec cellsize_min;
    /* For dlb, for use with edlbAUTO */
    rvec cellsize_min_dlb;
    /* The lower limit for the DD cell size with DLB */
    real cellsize_limit;
    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
    bool bVacDLBNoLimit;

    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
    ivec tric_dir;
    /* box0 and box_size are required with dim's without pbc and -gcom */
    rvec box0,box_size;

    /* The cell boundaries */
    rvec cell_x0,cell_x1;

    /* The old location of the cell boundaries, to check cg displacements */
    rvec old_cell_x0,old_cell_x1;

    /* The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;

    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * cell boundaries of neighboring cells for dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];
    gmx_ddzone_t zone_d2[2][2];

    /* The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /* The maximum number of cells to communicate with in one dimension */
    int  maxpulse;

    /* Which cg distribution is stored on the master node */
    int  master_cg_ddp_count;

    /* The number of cg's received from the direct neighbors */
    int  zone_ncg1[DD_MAXZONE];

    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
    int  nat[ddnatNR];

    /* Communication buffer for general use */
    int  *buf_int;
    int  nalloc_int;

    /* Communication buffer for general use */
    vec_rvec_t vbuf;

    /* Communication buffers only used with multiple grid pulses */
    int  *buf_int2;
    int  nalloc_int2;
    vec_rvec_t vbuf2;

    /* Communication buffers for local redistribution */
    int  **cggl_flag;
    int  cggl_flag_nalloc[DIM*2];
    rvec **cgcm_state;
    int  cgcm_state_nalloc[DIM*2];

    /* Cell sizes for dynamic load balancing */
    gmx_domdec_root_t **root;
    real *cell_f_row;
    real cell_f0[DIM];
    real cell_f1[DIM];
    real cell_f_max0[DIM];
    real cell_f_min1[DIM];

    /* Stuff for load communication */
    bool bRecordLoad;
    gmx_domdec_load_t *load;
#ifdef GMX_MPI
    MPI_Comm *mpi_comm_load;
#endif
    /* Cycle counters */
    float cycl[ddCyclNr];
    int   cycl_n[ddCyclNr];
    float cycl_max[ddCyclNr];
    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise) */
    int    eFlop;
    double flop;
    int    flop_n;
    /* How often have we had load measurements */
    int    n_load_have;
    /* How often have we collected the load measurements */
    int    n_load_collect;

    /* Statistics */
    double sum_nat[ddnatNR-ddnatZONE];

    /* The last partition step */
    gmx_large_int_t partition_step;

    /* Debugging */
    int  nstDDDump;
    int  nstDDDumpGrid;
    int  DD_debug;
} gmx_domdec_comm_t;
/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_CGIBS 2

/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_FLAG_NRCG  65535
#define DD_FLAG_FW(d) (1<<(16+(d)*2))
#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
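
/* Illustrative decoding sketch (not in the original source): with the masks
 * above, the low 16 bits of a cggl_flag entry hold the charge-group size and
 * the bits from 16 up mark moves along each dimension, e.g.
 *
 *     nrcg = flag & DD_FLAG_NRCG;
 *     if (flag & DD_FLAG_FW(d)) { ... cg moves forward along dim d  ... }
 *     if (flag & DD_FLAG_BW(d)) { ... cg moves backward along dim d ... }
 */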
/* Zone permutation required to obtain consecutive charge groups
 * for neighbor searching.
 */
static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 * components see only j zones with that component 0.
 */

/* The DD zone order */
static const ivec dd_zo[DD_MAXZONE] =
    {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};

/* The 3D setup */
#define dd_zp3n 4
static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};

/* The 2D setup */
#define dd_zp2n 2
static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};

/* The 1D setup */
#define dd_zp1n 1
static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
/* Factors used to avoid problems due to rounding issues */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Factor to account for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* Allowed performance loss before we DLB or warn */
#define DD_PERF_LOSS 0.05

#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
/* Use separate MPI send and receive commands
 * when nnodes <= GMX_DD_NNODES_SENDRECV.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4
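
/* Illustrative example (assumption based on the comment above): with 4 DD
 * nodes the collect/distribute routines below use paired MPI sends and
 * receives, while with, say, 32 nodes they switch to collective calls:
 *
 *     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 *         dd_collect_vec_sendrecv(dd,lv,v);
 *     else
 *         dd_collect_vec_gatherv(dd,lv,v);
 */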
/*
#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])

static void index2xyz(ivec nc,int ind,ivec xyz)
{
    xyz[XX] = ind % nc[XX];
    xyz[YY] = (ind / nc[XX]) % nc[YY];
    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
}
*/

/* This order is required to minimize the coordinate communication in PME
 * which uses decomposition in the x direction.
 */
#define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
static void ddindex2xyz(ivec nc,int ind,ivec xyz)
{
    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
    xyz[ZZ] = ind % nc[ZZ];
}
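
/* Illustrative self-test sketch (not in the original source): ddindex2xyz
 * inverts dd_index for every in-range index, assuming nc holds the DD cell
 * counts per dimension:
 *
 *     ivec xyz;
 *     for(ind=0; ind<nc[XX]*nc[YY]*nc[ZZ]; ind++)
 *     {
 *         ddindex2xyz(nc,ind,xyz);
 *         assert(dd_index(nc,xyz) == ind);
 *     }
 */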
static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
{
    int ddindex;
    int ddnodeid=-1;

    ddindex = dd_index(dd->nc,c);
    if (dd->comm->bCartesianPP_PME)
    {
        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
    }
    else if (dd->comm->bCartesianPP)
    {
#ifdef GMX_MPI
        MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
#endif
    }
    else
    {
        ddnodeid = ddindex;
    }

    return ddnodeid;
}
static bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
{
    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
}
int ddglatnr(gmx_domdec_t *dd,int i)
{
    int atnr;

    if (dd == NULL)
    {
        atnr = i + 1;
    }
    else
    {
        if (i >= dd->comm->nat[ddnatNR-1])
        {
            gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
        }
        atnr = dd->gatindex[i] + 1;
    }

    return atnr;
}

t_block *dd_charge_groups_global(gmx_domdec_t *dd)
{
    return &dd->comm->cgs_gl;
}
static void vec_rvec_init(vec_rvec_t *v)
{
    v->nalloc = 0;
    v->v      = NULL;
}

static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
{
    if (n > v->nalloc)
    {
        v->nalloc = over_alloc_dd(n);
        srenew(v->v,v->nalloc);
    }
}
void dd_store_state(gmx_domdec_t *dd,t_state *state)
{
    int i;

    if (state->ddp_count != dd->ddp_count)
    {
        gmx_incons("The state does not match the domain decomposition state");
    }

    state->ncg_gl = dd->ncg_home;
    if (state->ncg_gl > state->cg_gl_nalloc)
    {
        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
        srenew(state->cg_gl,state->cg_gl_nalloc);
    }
    for(i=0; i<state->ncg_gl; i++)
    {
        state->cg_gl[i] = dd->index_gl[i];
    }

    state->ddp_count_cg_gl = dd->ddp_count;
}
gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
{
    return &dd->comm->zones;
}
void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
                      int *jcg0,int *jcg1,ivec shift0,ivec shift1)
{
    gmx_domdec_zones_t *zones;
    int izone,d,dim;

    zones = &dd->comm->zones;

    izone = 0;
    while (icg >= zones->izone[izone].cg1)
    {
        izone++;
    }

    if (izone == 0)
    {
        *jcg0 = icg;
    }
    else if (izone < zones->nizone)
    {
        *jcg0 = zones->izone[izone].jcg0;
    }
    else
    {
        gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
                  icg,izone,zones->nizone);
    }

    *jcg1 = zones->izone[izone].jcg1;

    for(d=0; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        shift0[dim] = zones->izone[izone].shift0[dim];
        shift1[dim] = zones->izone[izone].shift1[dim];
        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
        {
            /* A conservative approach, this can be optimized */
            shift0[dim] -= 1;
            shift1[dim] += 1;
        }
    }
}
int dd_natoms_vsite(gmx_domdec_t *dd)
{
    return dd->comm->nat[ddnatVSITE];
}

void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
{
    *at_start = dd->comm->nat[ddnatCON-1];
    *at_end   = dd->comm->nat[ddnatCON];
}
void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
{
    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    int  *index,*cgindex;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;
    rvec shift={0,0,0},*buf,*rbuf;
    bool bPBC,bScrew;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = comm->vbuf.v;

    nzone   = 1;
    nat_tot = dd->nat_home;
    for(d=0; d<dd->ndim; d++)
    {
        bPBC   = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        if (bPBC)
        {
            copy_rvec(box[dd->dim[d]],shift);
        }
        cd = &comm->cd[d];
        for(p=0; p<cd->np; p++)
        {
            ind   = &cd->ind[p];
            index = ind->index;
            n = 0;
            if (!bPBC)
            {
                for(i=0; i<ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for(j=at0; j<at1; j++)
                    {
                        copy_rvec(x[j],buf[n]);
                        n++;
                    }
                }
            }
            else if (!bScrew)
            {
                for(i=0; i<ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for(j=at0; j<at1; j++)
                    {
                        /* We need to shift the coordinates */
                        rvec_add(x[j],shift,buf[n]);
                        n++;
                    }
                }
            }
            else
            {
                for(i=0; i<ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for(j=at0; j<at1; j++)
                    {
                        /* Shift x */
                        buf[n][XX] = x[j][XX] + shift[XX];
                        /* Rotate y and z.
                         * This operation requires a special shift force
                         * treatment, which is performed in calc_vir.
                         */
                        buf[n][YY] = box[YY][YY] - x[j][YY];
                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
                        n++;
                    }
                }
            }

            if (cd->bInPlace)
            {
                rbuf = x + nat_tot;
            }
            else
            {
                rbuf = comm->vbuf2.v;
            }
            /* Send and receive the coordinates */
            dd_sendrecv_rvec(dd, d, dddirBackward,
                             buf,  ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            if (!cd->bInPlace)
            {
                j = 0;
                for(zone=0; zone<nzone; zone++)
                {
                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
                    {
                        copy_rvec(rbuf[j],x[i]);
                        j++;
                    }
                }
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        nzone += nzone;
    }
}
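
/* Note (illustrative): dd_move_x sends coordinates backward along each DD
 * dimension, pulse by pulse; dd_move_f below walks the same pulses in the
 * reverse order and sends forward, so every received force retraces a
 * coordinate sent here, e.g. for one pulse:
 *
 *     dd_move_x: dd_sendrecv_rvec(dd,d,dddirBackward, buf,ns, rbuf,nr);
 *     dd_move_f: dd_sendrecv_rvec(dd,d,dddirForward,  sbuf,nr, buf,ns);
 */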
void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
{
    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    int  *index,*cgindex;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;
    rvec *buf,*sbuf;
    ivec vis;
    int  is;
    bool bPBC,bScrew;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = comm->vbuf.v;

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for(d=dd->ndim-1; d>=0; d--)
    {
        bPBC   = (dd->ci[dd->dim[d]] == 0);
        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
        if (fshift == NULL && !bScrew)
        {
            bPBC = FALSE;
        }
        /* Determine which shift vector we need */
        clear_ivec(vis);
        vis[dd->dim[d]] = 1;
        is = IVEC2IS(vis);

        cd = &comm->cd[d];
        for(p=cd->np-1; p>=0; p--) {
            ind = &cd->ind[p];
            nat_tot -= ind->nrecv[nzone+1];
            if (cd->bInPlace)
            {
                sbuf = f + nat_tot;
            }
            else
            {
                sbuf = comm->vbuf2.v;
                j = 0;
                for(zone=0; zone<nzone; zone++)
                {
                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
                    {
                        copy_rvec(f[i],sbuf[j]);
                        j++;
                    }
                }
            }
            /* Communicate the forces */
            dd_sendrecv_rvec(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf,  ind->nsend[nzone+1]);
            index = ind->index;
            /* Add the received forces */
            n = 0;
            if (!bPBC)
            {
                for(i=0; i<ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for(j=at0; j<at1; j++)
                    {
                        rvec_inc(f[j],buf[n]);
                        n++;
                    }
                }
            }
            else if (!bScrew)
            {
                for(i=0; i<ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for(j=at0; j<at1; j++)
                    {
                        rvec_inc(f[j],buf[n]);
                        /* Add this force to the shift force */
                        rvec_inc(fshift[is],buf[n]);
                        n++;
                    }
                }
            }
            else
            {
                for(i=0; i<ind->nsend[nzone]; i++)
                {
                    at0 = cgindex[index[i]];
                    at1 = cgindex[index[i]+1];
                    for(j=at0; j<at1; j++)
                    {
                        /* Rotate the force */
                        f[j][XX] += buf[n][XX];
                        f[j][YY] -= buf[n][YY];
                        f[j][ZZ] -= buf[n][ZZ];
                        if (fshift)
                        {
                            /* Add this force to the shift force */
                            rvec_inc(fshift[is],buf[n]);
                        }
                        n++;
                    }
                }
            }
        }
        nzone /= 2;
    }
}
void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
{
    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    int  *index,*cgindex;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;
    real *buf,*rbuf;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nzone   = 1;
    nat_tot = dd->nat_home;
    for(d=0; d<dd->ndim; d++)
    {
        cd = &comm->cd[d];
        for(p=0; p<cd->np; p++)
        {
            ind   = &cd->ind[p];
            index = ind->index;
            n = 0;
            for(i=0; i<ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                {
                    buf[n] = v[j];
                    n++;
                }
            }

            if (cd->bInPlace)
            {
                rbuf = v + nat_tot;
            }
            else
            {
                rbuf = &comm->vbuf2.v[0][0];
            }
            /* Send and receive the coordinates */
            dd_sendrecv_real(dd, d, dddirBackward,
                             buf,  ind->nsend[nzone+1],
                             rbuf, ind->nrecv[nzone+1]);
            if (!cd->bInPlace)
            {
                j = 0;
                for(zone=0; zone<nzone; zone++)
                {
                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
                    {
                        v[i] = rbuf[j];
                        j++;
                    }
                }
            }
            nat_tot += ind->nrecv[nzone+1];
        }
        nzone += nzone;
    }
}
void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
{
    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
    int  *index,*cgindex;
    gmx_domdec_comm_t *comm;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;
    real *buf,*sbuf;

    comm = dd->comm;

    cgindex = dd->cgindex;

    buf = &comm->vbuf.v[0][0];

    nzone   = comm->zones.n/2;
    nat_tot = dd->nat_tot;
    for(d=dd->ndim-1; d>=0; d--)
    {
        cd = &comm->cd[d];
        for(p=cd->np-1; p>=0; p--) {
            ind = &cd->ind[p];
            nat_tot -= ind->nrecv[nzone+1];
            if (cd->bInPlace)
            {
                sbuf = v + nat_tot;
            }
            else
            {
                sbuf = &comm->vbuf2.v[0][0];
                j = 0;
                for(zone=0; zone<nzone; zone++)
                {
                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
                    {
                        sbuf[j] = v[i];
                        j++;
                    }
                }
            }
            /* Communicate the forces */
            dd_sendrecv_real(dd, d, dddirForward,
                             sbuf, ind->nrecv[nzone+1],
                             buf,  ind->nsend[nzone+1]);
            index = ind->index;
            /* Add the received forces */
            n = 0;
            for(i=0; i<ind->nsend[nzone]; i++)
            {
                at0 = cgindex[index[i]];
                at1 = cgindex[index[i]+1];
                for(j=at0; j<at1; j++)
                {
                    v[j] += buf[n];
                    n++;
                }
            }
        }
        nzone /= 2;
    }
}
static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
{
    fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
            d,i,j,
            zone->min0,zone->max1,
            zone->mch0,zone->mch1,
            zone->p1_0,zone->p1_1);
}
static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
                               int ddimind,int direction,
                               gmx_ddzone_t *buf_s,int n_s,
                               gmx_ddzone_t *buf_r,int n_r)
{
    rvec vbuf_s[5*2],vbuf_r[5*2];
    int i;

    for(i=0; i<n_s; i++)
    {
        vbuf_s[i*2  ][0] = buf_s[i].min0;
        vbuf_s[i*2  ][1] = buf_s[i].max1;
        vbuf_s[i*2  ][2] = buf_s[i].mch0;
        vbuf_s[i*2+1][0] = buf_s[i].mch1;
        vbuf_s[i*2+1][1] = buf_s[i].p1_0;
        vbuf_s[i*2+1][2] = buf_s[i].p1_1;
    }

    dd_sendrecv_rvec(dd, ddimind, direction,
                     vbuf_s, n_s*2,
                     vbuf_r, n_r*2);

    for(i=0; i<n_r; i++)
    {
        buf_r[i].min0 = vbuf_r[i*2  ][0];
        buf_r[i].max1 = vbuf_r[i*2  ][1];
        buf_r[i].mch0 = vbuf_r[i*2  ][2];
        buf_r[i].mch1 = vbuf_r[i*2+1][0];
        buf_r[i].p1_0 = vbuf_r[i*2+1][1];
        buf_r[i].p1_1 = vbuf_r[i*2+1][2];
    }
}
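
/* Packing note (illustrative): a gmx_ddzone_t holds six reals, so each zone
 * is shipped as two rvecs; the 5*2 rvec buffers above assume at most five
 * zone entries per pulse (the extreme pairs for the remaining dimensions
 * plus the zone_d1/zone_d2 entries sent by dd_move_cellx below).
 */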
static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0,rvec cell_ns_x1)
{
    int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
    gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
    rvec extr_s[2],extr_r[2];
    real dh[DIM],dist_d,c=0,det;
    gmx_domdec_comm_t *comm;
    bool bPBC,bUse;

    comm = dd->comm;

    for(d=1; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];
    }

    for(d=dd->ndim-2; d>=0; d--)
    {
        dim  = dd->dim[d];
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store two reals */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];

        pos = 0;
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for(d1=d; d1<dd->ndim-1; d1++)
        {
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].mch0 = 0;
            buf_s[pos].mch1 = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf_s[pos].p1_0 = comm->cell_x0[dim];
            buf_s[pos].p1_1 = 0;
            pos++;
        }

        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
        if (bPBC)
        {
            /* Take the minimum to avoid double communication */
            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            npulse_min = npulse;
        }
        for(p=0; p<npulse_min; p++)
        {
            /* Communicate the extremes forward */
            bUse = (bPBC || dd->ci[dim] > 0);

            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);

            if (bUse)
            {
                for(d1=d; d1<dd->ndim-1; d1++)
                {
                    extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
                    extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
                }
            }
        }

        buf_size = pos;
        for(p=0; p<npulse; p++)
        {
            /* Communicate all the zone information backward */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);

            dd_sendrecv_ddzone(dd, d, dddirBackward,
                               buf_s, buf_size,
                               buf_r, buf_size);

            clear_rvec(dh);
            if (p > 0)
            {
                for(d1=d+1; d1<dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for(i=0; i<buf_size; i++)
            {
                if (p == 0)
                {
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (bUse)
                    {
                        buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
                        buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
                    }

                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (bUse && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
            {
                /* Store the extremes */
                pos = 0;

                for(d1=d; d1<dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
                    pos++;
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for(i=d; i<2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
        }
    }

    if (dd->ndim >= 2)
    {
        dim = dd->dim[1];
        for(i=0; i<2; i++)
        {
            if (debug)
            {
                print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
            }
            cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
            cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
        }
    }
    if (dd->ndim >= 3)
    {
        dim = dd->dim[2];
        for(i=0; i<2; i++)
        {
            for(j=0; j<2; j++)
            {
                if (debug)
                {
                    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
                }
                cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
                cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
            }
        }
    }
    for(d=1; d<dd->ndim; d++)
    {
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
                    d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
        }
    }
}
static void dd_collect_cg(gmx_domdec_t *dd,
                          t_state *state_local)
{
    gmx_domdec_master_t *ma=NULL;
    t_block *cgs_gl;
    int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;

    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
    {
        /* The master has the correct distribution */
        return;
    }

    if (state_local->ddp_count == dd->ddp_count)
    {
        ncg_home = dd->ncg_home;
        cg       = dd->index_gl;
        nat_home = dd->nat_home;
    }
    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
    {
        cgs_gl = &dd->comm->cgs_gl;

        ncg_home = state_local->ncg_gl;
        cg       = state_local->cg_gl;
        nat_home = 0;
        for(i=0; i<ncg_home; i++)
        {
            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
        }
    }
    else
    {
        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
    }

    buf2[0] = dd->ncg_home;
    buf2[1] = dd->nat_home;
    if (DDMASTER(dd))
    {
        ma   = dd->ma;
        ibuf = ma->ibuf;
    }
    else
    {
        ibuf = NULL;
    }
    /* Collect the charge group and atom counts on the master */
    dd_gather(dd,2*sizeof(int),buf2,ibuf);

    if (DDMASTER(dd))
    {
        ma->index[0] = 0;
        for(i=0; i<dd->nnodes; i++)
        {
            ma->ncg[i]     = ma->ibuf[2*i];
            ma->nat[i]     = ma->ibuf[2*i+1];
            ma->index[i+1] = ma->index[i] + ma->ncg[i];
        }
        /* Make byte counts and indices */
        for(i=0; i<dd->nnodes; i++)
        {
            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
        }
        if (debug)
        {
            fprintf(debug,"Initial charge group distribution: ");
            for(i=0; i<dd->nnodes; i++)
                fprintf(debug," %d",ma->ncg[i]);
            fprintf(debug,"\n");
        }
    }

    /* Collect the charge group indices on the master */
    dd_gatherv(dd,
               dd->ncg_home*sizeof(int),dd->index_gl,
               DDMASTER(dd) ? ma->ibuf : NULL,
               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
               DDMASTER(dd) ? ma->cg : NULL);

    dd->comm->master_cg_ddp_count = state_local->ddp_count;
}
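
/* Layout note (illustrative assumption): after dd_collect_cg, ma->ibuf holds
 * per-node byte counts in entries [0,nnodes) and byte displacements in
 * entries [nnodes,2*nnodes), which is exactly the (counts,disps) pair that
 * dd_gatherv expects:
 *
 *     counts = ma->ibuf;
 *     disps  = ma->ibuf + dd->nnodes;
 */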
static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
                                    rvec *lv,rvec *v)
{
    gmx_domdec_master_t *ma;
    int  n,i,c,a,nalloc=0;
    rvec *buf=NULL;
    t_block *cgs_gl;

    ma = dd->ma;

    if (!DDMASTER(dd))
    {
#ifdef GMX_MPI
        MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
                 dd->rank,dd->mpi_comm_all);
#endif
    }
    else
    {
        /* Copy the master coordinates to the global array */
        cgs_gl = &dd->comm->cgs_gl;

        n = DDMASTERRANK(dd);
        a = 0;
        for(i=ma->index[n]; i<ma->index[n+1]; i++)
        {
            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(lv[a++],v[c]);
            }
        }

        for(n=0; n<dd->nnodes; n++)
        {
            if (n != dd->rank)
            {
                if (ma->nat[n] > nalloc)
                {
                    nalloc = over_alloc_dd(ma->nat[n]);
                    srenew(buf,nalloc);
                }
#ifdef GMX_MPI
                MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
                         n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
#endif
                a = 0;
                for(i=ma->index[n]; i<ma->index[n+1]; i++)
                {
                    for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
                    {
                        copy_rvec(buf[a++],v[c]);
                    }
                }
            }
        }
        sfree(buf);
    }
}
static void get_commbuffer_counts(gmx_domdec_t *dd,
                                  int **counts,int **disps)
{
    gmx_domdec_master_t *ma;
    int n;

    ma = dd->ma;

    /* Make the rvec count and displacement arrays */
    *counts = ma->ibuf;
    *disps  = ma->ibuf + dd->nnodes;
    for(n=0; n<dd->nnodes; n++)
    {
        (*counts)[n] = ma->nat[n]*sizeof(rvec);
        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
    }
}
static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
                                   rvec *lv,rvec *v)
{
    gmx_domdec_master_t *ma;
    int  *rcounts=NULL,*disps=NULL;
    int  n,i,c,a;
    rvec *buf=NULL;
    t_block *cgs_gl;

    ma = dd->ma;

    if (DDMASTER(dd))
    {
        get_commbuffer_counts(dd,&rcounts,&disps);

        buf = ma->vbuf;
    }

    dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);

    if (DDMASTER(dd))
    {
        cgs_gl = &dd->comm->cgs_gl;

        a = 0;
        for(n=0; n<dd->nnodes; n++)
        {
            for(i=ma->index[n]; i<ma->index[n+1]; i++)
            {
                for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(buf[a++],v[c]);
                }
            }
        }
    }
}
void dd_collect_vec(gmx_domdec_t *dd,
                    t_state *state_local,rvec *lv,rvec *v)
{
    gmx_domdec_master_t *ma;
    int  n,i,c,a,nalloc=0;

    dd_collect_cg(dd,state_local);

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_collect_vec_sendrecv(dd,lv,v);
    }
    else
    {
        dd_collect_vec_gatherv(dd,lv,v);
    }
}
void dd_collect_state(gmx_domdec_t *dd,
                      t_state *state_local,t_state *state)
{
    int est,i,j,nh;

    nh = state->nhchainlength;

    if (DDMASTER(dd))
    {
        state->lambda = state_local->lambda;
        state->veta   = state_local->veta;
        state->vol0   = state_local->vol0;
        copy_mat(state_local->box,state->box);
        copy_mat(state_local->boxv,state->boxv);
        copy_mat(state_local->svir_prev,state->svir_prev);
        copy_mat(state_local->fvir_prev,state->fvir_prev);
        copy_mat(state_local->pres_prev,state->pres_prev);

        for(i=0; i<state_local->ngtc; i++)
        {
            for(j=0; j<nh; j++) {
                state->nosehoover_xi[i*nh+j]  = state_local->nosehoover_xi[i*nh+j];
                state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
            }
            state->therm_integral[i] = state_local->therm_integral[i];
        }
        for(i=0; i<state_local->nnhpres; i++)
        {
            for(j=0; j<nh; j++) {
                state->nhpres_xi[i*nh+j]  = state_local->nhpres_xi[i*nh+j];
                state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
            }
        }
    }
    for(est=0; est<estNR; est++)
    {
        if (EST_DISTR(est) && state_local->flags & (1<<est))
        {
            switch (est) {
            case estX:
                dd_collect_vec(dd,state_local,state_local->x,state->x);
                break;
            case estV:
                dd_collect_vec(dd,state_local,state_local->v,state->v);
                break;
            case estSDX:
                dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
                break;
            case estCGP:
                dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
                break;
            case estLD_RNG:
                if (state->nrngi == 1)
                {
                    if (DDMASTER(dd))
                    {
                        for(i=0; i<state_local->nrng; i++)
                        {
                            state->ld_rng[i] = state_local->ld_rng[i];
                        }
                    }
                }
                else
                {
                    dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
                              state_local->ld_rng,state->ld_rng);
                }
                break;
            case estLD_RNGI:
                if (state->nrngi == 1)
                {
                    if (DDMASTER(dd))
                    {
                        state->ld_rngi[0] = state_local->ld_rngi[0];
                    }
                }
                else
                {
                    dd_gather(dd,sizeof(state->ld_rngi[0]),
                              state_local->ld_rngi,state->ld_rngi);
                }
                break;
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            case estORIRE_DTAV:
                break;
            default:
                gmx_incons("Unknown state entry encountered in dd_collect_state");
            }
        }
    }
}
static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
{
    if (debug)
    {
        fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
    }
    fr->cg_nalloc = over_alloc_dd(nalloc);
    srenew(fr->cg_cm,fr->cg_nalloc);
    srenew(fr->cginfo,fr->cg_nalloc);
}
static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
{
    int est;

    if (debug)
    {
        fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
    }

    state->nalloc = over_alloc_dd(nalloc);

    for(est=0; est<estNR; est++)
    {
        if (EST_DISTR(est) && state->flags & (1<<est))
        {
            switch(est) {
            case estX:
                srenew(state->x,state->nalloc);
                break;
            case estV:
                srenew(state->v,state->nalloc);
                break;
            case estSDX:
                srenew(state->sd_X,state->nalloc);
                break;
            case estCGP:
                srenew(state->cg_p,state->nalloc);
                break;
            case estLD_RNG:
            case estLD_RNGI:
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            case estORIRE_DTAV:
                /* No reallocation required */
                break;
            default:
                gmx_incons("Unknown state entry encountered in dd_realloc_state");
            }
        }
    }

    if (f != NULL)
    {
        srenew(*f,state->nalloc);
    }
}
static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
                                       rvec *v,rvec *lv)
{
    gmx_domdec_master_t *ma;
    int  n,i,c,a,nalloc=0;
    rvec *buf=NULL;

    if (DDMASTER(dd))
    {
        ma = dd->ma;

        for(n=0; n<dd->nnodes; n++)
        {
            if (n != dd->rank)
            {
                if (ma->nat[n] > nalloc)
                {
                    nalloc = over_alloc_dd(ma->nat[n]);
                    srenew(buf,nalloc);
                }
                /* Use lv as a temporary buffer */
                a = 0;
                for(i=ma->index[n]; i<ma->index[n+1]; i++)
                {
                    for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
                    {
                        copy_rvec(v[c],buf[a++]);
                    }
                }
                if (a != ma->nat[n])
                {
                    gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
                              a,ma->nat[n]);
                }

#ifdef GMX_MPI
                MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
                         DDRANK(dd,n),n,dd->mpi_comm_all);
#endif
            }
        }
        sfree(buf);
        n = DDMASTERRANK(dd);
        a = 0;
        for(i=ma->index[n]; i<ma->index[n+1]; i++)
        {
            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
            {
                copy_rvec(v[c],lv[a++]);
            }
        }
    }
    else
    {
#ifdef GMX_MPI
        MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
                 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
#endif
    }
}
static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
                                       rvec *v,rvec *lv)
{
    gmx_domdec_master_t *ma;
    int  *scounts=NULL,*disps=NULL;
    int  n,i,c,a,nalloc=0;
    rvec *buf=NULL;

    if (DDMASTER(dd))
    {
        ma = dd->ma;

        get_commbuffer_counts(dd,&scounts,&disps);

        buf = ma->vbuf;
        a = 0;
        for(n=0; n<dd->nnodes; n++)
        {
            for(i=ma->index[n]; i<ma->index[n+1]; i++)
            {
                for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
                {
                    copy_rvec(v[c],buf[a++]);
                }
            }
        }
    }

    dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
}
static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
{
    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        dd_distribute_vec_sendrecv(dd,cgs,v,lv);
    }
    else
    {
        dd_distribute_vec_scatterv(dd,cgs,v,lv);
    }
}
static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
                                t_state *state,t_state *state_local,
                                rvec **f)
{
    int  i,j,ngtch,ngtcp,nh;

    nh = state->nhchainlength;

    if (DDMASTER(dd))
    {
        state_local->lambda = state->lambda;
        state_local->veta   = state->veta;
        state_local->vol0   = state->vol0;
        copy_mat(state->box,state_local->box);
        copy_mat(state->box_rel,state_local->box_rel);
        copy_mat(state->boxv,state_local->boxv);
        copy_mat(state->svir_prev,state_local->svir_prev);
        copy_mat(state->fvir_prev,state_local->fvir_prev);
        for(i=0; i<state_local->ngtc; i++)
        {
            for(j=0; j<nh; j++) {
                state_local->nosehoover_xi[i*nh+j]  = state->nosehoover_xi[i*nh+j];
                state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
            }
            state_local->therm_integral[i] = state->therm_integral[i];
        }
        for(i=0; i<state_local->nnhpres; i++)
        {
            for(j=0; j<nh; j++) {
                state_local->nhpres_xi[i*nh+j]  = state->nhpres_xi[i*nh+j];
                state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
            }
        }
    }
    dd_bcast(dd,sizeof(real),&state_local->lambda);
    dd_bcast(dd,sizeof(real),&state_local->veta);
    dd_bcast(dd,sizeof(real),&state_local->vol0);
    dd_bcast(dd,sizeof(state_local->box),state_local->box);
    dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
    dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
    dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
    dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
    dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);

    if (dd->nat_home > state_local->nalloc)
    {
        dd_realloc_state(state_local,f,dd->nat_home);
    }
    for(i=0; i<estNR; i++)
    {
        if (EST_DISTR(i) && state_local->flags & (1<<i))
        {
            switch (i) {
            case estX:
                dd_distribute_vec(dd,cgs,state->x,state_local->x);
                break;
            case estV:
                dd_distribute_vec(dd,cgs,state->v,state_local->v);
                break;
            case estSDX:
                dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
                break;
            case estCGP:
                dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
                break;
            case estLD_RNG:
                if (state->nrngi == 1)
                {
                    dd_bcastc(dd,
                              state_local->nrng*sizeof(state_local->ld_rng[0]),
                              state->ld_rng,state_local->ld_rng);
                }
                else
                {
                    dd_scatter(dd,
                               state_local->nrng*sizeof(state_local->ld_rng[0]),
                               state->ld_rng,state_local->ld_rng);
                }
                break;
            case estLD_RNGI:
                if (state->nrngi == 1)
                {
                    dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
                              state->ld_rngi,state_local->ld_rngi);
                }
                else
                {
                    dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
                               state->ld_rngi,state_local->ld_rngi);
                }
                break;
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            case estORIRE_DTAV:
                /* Not implemented yet */
                break;
            default:
                gmx_incons("Unknown state entry encountered in dd_distribute_state");
            }
        }
    }
}
static char dim2char(int dim)
{
    char c='?';

    switch (dim)
    {
    case XX: c = 'X'; break;
    case YY: c = 'Y'; break;
    case ZZ: c = 'Z'; break;
    default: gmx_fatal(FARGS,"Unknown dim %d",dim);
    }

    return c;
}
static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
                              gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
{
    rvec grid_s[2],*grid_r=NULL,cx,r;
    char fname[STRLEN],format[STRLEN],buf[22];
    FILE *out;
    int  a,i,d,z,y,x;
    matrix tric;
    real vol;

    copy_rvec(dd->comm->cell_x0,grid_s[0]);
    copy_rvec(dd->comm->cell_x1,grid_s[1]);

    if (DDMASTER(dd))
    {
        snew(grid_r,2*dd->nnodes);
    }

    dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);

    if (DDMASTER(dd))
    {
        for(d=0; d<DIM; d++)
        {
            for(i=0; i<DIM; i++)
            {
                if (d == i)
                {
                    tric[d][i] = 1;
                }
                else
                {
                    if (dd->nc[d] > 1 && d < ddbox->npbcdim)
                    {
                        tric[d][i] = box[i][d]/box[i][i];
                    }
                    else
                    {
                        tric[d][i] = 0;
                    }
                }
            }
        }
        sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
        sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
        out = gmx_fio_fopen(fname,"w");
        gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
        a = 1;
        for(i=0; i<dd->nnodes; i++)
        {
            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
            for(d=0; d<DIM; d++)
            {
                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
            }
            for(z=0; z<2; z++)
            {
                for(y=0; y<2; y++)
                {
                    for(x=0; x<2; x++)
                    {
                        cx[XX] = grid_r[i*2+x][XX];
                        cx[YY] = grid_r[i*2+y][YY];
                        cx[ZZ] = grid_r[i*2+z][ZZ];
                        mvmul(tric,cx,r);
                        fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
                                10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
                    }
                }
            }
            for(d=0; d<DIM; d++)
            {
                for(x=0; x<4; x++)
                {
                    switch(d)
                    {
                    case 0: y = 1 + i*8 + 2*x; break;
                    case 1: y = 1 + i*8 + 2*x - (x % 2); break;
                    case 2: y = 1 + i*8 + x; break;
                    }
                    fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
                }
            }
        }
        gmx_fio_fclose(out);
        sfree(grid_r);
    }
}
void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
                  gmx_mtop_t *mtop,t_commrec *cr,
                  int natoms,rvec x[],matrix box)
{
    char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
    FILE *out;
    int  i,ii,resnr,c;
    char *atomname,*resname;
    real b;
    gmx_domdec_t *dd;

    dd = cr->dd;
    if (natoms == -1)
    {
        natoms = dd->comm->nat[ddnatVSITE];
    }

    sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);

    sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
    sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");

    out = gmx_fio_fopen(fname,"w");

    fprintf(out,"TITLE     %s\n",title);
    gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
    for(i=0; i<natoms; i++)
    {
        ii = dd->gatindex[i];
        gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
        if (i < dd->comm->nat[ddnatZONE])
        {
            c = 0;
            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
            {
                c++;
            }
            b = c;
        }
        else if (i < dd->comm->nat[ddnatVSITE])
        {
            b = dd->comm->zones.n;
        }
        else
        {
            b = dd->comm->zones.n + 1;
        }
        fprintf(out,strlen(atomname)<4 ? format : format4,
                "ATOM",(ii+1)%100000,
                atomname,resname,' ',(resnr+1)%10000,' ',
                10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
    }
    fprintf(out,"TER\n");

    gmx_fio_fclose(out);
}
real dd_cutoff_mbody(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int  di;
    real r;

    comm = dd->comm;

    r = -1;
    if (comm->bInterCGBondeds)
    {
        if (comm->cutoff_mbody > 0)
        {
            r = comm->cutoff_mbody;
        }
        else
        {
            /* cutoff_mbody=0 means we do not have DLB */
            r = comm->cellsize_min[dd->dim[0]];
            for(di=1; di<dd->ndim; di++)
            {
                r = min(r,comm->cellsize_min[dd->dim[di]]);
            }
            if (comm->bBondComm)
            {
                r = max(r,comm->cutoff_mbody);
            }
            else
            {
                r = min(r,comm->cutoff);
            }
        }
    }

    return r;
}

real dd_cutoff_twobody(gmx_domdec_t *dd)
{
    real r_mb;

    r_mb = dd_cutoff_mbody(dd);

    return max(dd->comm->cutoff,r_mb);
}
static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
{
    int nc,ntot;

    nc   = dd->nc[dd->comm->cartpmedim];
    ntot = dd->comm->ntot[dd->comm->cartpmedim];
    copy_ivec(coord,coord_pme);
    coord_pme[dd->comm->cartpmedim] =
        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
}
static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
{
    /* Here we assign a PME node to communicate with this DD node
     * by assuming that the major index of both is x.
     * We add cr->npmenodes/2 to obtain an even distribution.
     */
    return (ddindex*npme + npme/2)/ndd;
}
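
/* Worked example (illustrative): with ndd = 8 DD nodes and npme = 3 PME
 * nodes, low_ddindex2pmeindex maps dd indices 0..7 via (i*3+1)/8 to
 * 0,0,0,1,1,2,2,2, i.e. an even distribution with the half-slab offset
 * applied.
 */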
static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
{
    return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
}

static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
{
    return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
}
static int *dd_pmenodes(t_commrec *cr)
{
    int *pmenodes;
    int n,i,p0,p1;

    snew(pmenodes,cr->npmenodes);
    n = 0;
    for(i=0; i<cr->dd->nnodes; i++) {
        p0 = cr_ddindex2pmeindex(cr,i);
        p1 = cr_ddindex2pmeindex(cr,i+1);
        if (i+1 == cr->dd->nnodes || p1 > p0) {
            if (debug)
                fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
            pmenodes[n] = i + 1 + n;
            n++;
        }
    }

    return pmenodes;
}
static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
{
    gmx_domdec_t *dd;
    ivec coords,coords_pme,nc;
    int  slab;

    dd = cr->dd;
    /*
      if (dd->comm->bCartesian) {
          gmx_ddindex2xyz(dd->nc,ddindex,coords);
          dd_coords2pmecoords(dd,coords,coords_pme);
          copy_ivec(dd->ntot,nc);
          nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
          coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];

          slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
      } else {
          slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
      }
    */
    coords[XX] = x;
    coords[YY] = y;
    coords[ZZ] = z;
    slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));

    return slab;
}
static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
{
    gmx_domdec_comm_t *comm;
    ivec coords;
    int  ddindex,nodeid=-1;

    comm = cr->dd->comm;

    coords[XX] = x;
    coords[YY] = y;
    coords[ZZ] = z;
    if (comm->bCartesianPP_PME)
    {
#ifdef GMX_MPI
        MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
#endif
    }
    else
    {
        ddindex = dd_index(cr->dd->nc,coords);
        if (comm->bCartesianPP)
        {
            nodeid = comm->ddindex2simnodeid[ddindex];
        }
        else
        {
            if (comm->pmenodes)
            {
                nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
            }
            else
            {
                nodeid = ddindex;
            }
        }
    }

    return nodeid;
}
static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    ivec coord,coord_pme;
    int  i;
    int  pmenode=-1;

    dd = cr->dd;
    comm = dd->comm;

    /* This assumes a uniform x domain decomposition grid cell size */
    if (comm->bCartesianPP_PME)
    {
#ifdef GMX_MPI
        MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            /* This is a PP node */
            dd_cart_coord2pmecoord(dd,coord,coord_pme);
            MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
        }
#endif
    }
    else if (comm->bCartesianPP)
    {
        if (sim_nodeid < dd->nnodes)
        {
            pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
        }
    }
    else
    {
        /* This assumes DD cells with identical x coordinates
         * are numbered sequentially.
         */
        if (dd->comm->pmenodes == NULL)
        {
            if (sim_nodeid < dd->nnodes)
            {
                /* The DD index equals the nodeid */
                pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
            }
        }
        else
        {
            i = 0;
            while (sim_nodeid > dd->comm->pmenodes[i])
            {
                i++;
            }
            if (sim_nodeid < dd->comm->pmenodes[i])
            {
                pmenode = dd->comm->pmenodes[i];
            }
        }
    }

    return pmenode;
}
bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
{
    bool bPMEOnlyNode;

    if (DOMAINDECOMP(cr))
    {
        bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
    }
    else
    {
        bPMEOnlyNode = FALSE;
    }

    return bPMEOnlyNode;
}
void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
                     int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
{
    gmx_domdec_t *dd;
    int x,y,z;
    ivec coord,coord_pme;

    dd = cr->dd;

    snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);

    *nmy_ddnodes = 0;
    for(x=0; x<dd->nc[XX]; x++)
    {
        for(y=0; y<dd->nc[YY]; y++)
        {
            for(z=0; z<dd->nc[ZZ]; z++)
            {
                if (dd->comm->bCartesianPP_PME)
                {
                    coord[XX] = x;
                    coord[YY] = y;
                    coord[ZZ] = z;
                    dd_cart_coord2pmecoord(dd,coord,coord_pme);
                    if (dd->ci[XX] == coord_pme[XX] &&
                        dd->ci[YY] == coord_pme[YY] &&
                        dd->ci[ZZ] == coord_pme[ZZ])
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
                }
                else
                {
                    /* The slab corresponds to the nodeid in the PME group */
                    if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
                    {
                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
                    }
                }
            }
        }
    }

    /* The last PP-only node is the peer node */
    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];

    if (debug)
    {
        fprintf(debug,"Receive coordinates from PP nodes:");
        for(x=0; x<*nmy_ddnodes; x++)
        {
            fprintf(debug," %d",(*my_ddnodes)[x]);
        }
        fprintf(debug,"\n");
    }
}
static bool receive_vir_ener(t_commrec *cr)
{
    gmx_domdec_comm_t *comm;
    int  pmenode,coords[DIM],rank;
    bool bReceive;

    bReceive = TRUE;
    if (cr->npmenodes < cr->dd->nnodes)
    {
        comm = cr->dd->comm;
        if (comm->bCartesianPP_PME)
        {
            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
#ifdef GMX_MPI
            MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
            coords[comm->cartpmedim]++;
            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
            {
                MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
                if (dd_simnode2pmenode(cr,rank) == pmenode)
                {
                    /* This is not the last PP node for pmenode */
                    bReceive = FALSE;
                }
            }
#endif
        }
        else
        {
            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
            if (cr->sim_nodeid+1 < cr->nnodes &&
                dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
            {
                /* This is not the last PP node for pmenode */
                bReceive = FALSE;
            }
        }
    }

    return bReceive;
}
static void set_zones_ncg_home(gmx_domdec_t *dd)
{
    gmx_domdec_zones_t *zones;
    int i;

    zones = &dd->comm->zones;

    zones->cg_range[0] = 0;
    for(i=1; i<zones->n+1; i++)
    {
        zones->cg_range[i] = dd->ncg_home;
    }
}
static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
{
    int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;

    ind      = state->cg_gl;
    dd_cg_gl = dd->index_gl;
    cgindex  = dd->cgindex;
    nat = 0;
    for(i=0; i<state->ncg_gl; i++)
    {
        cgindex[i] = nat;
        cg_gl = ind[i];
        dd_cg_gl[i] = cg_gl;
        nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
    }
    cgindex[i] = nat;

    dd->ncg_home = state->ncg_gl;
    dd->nat_home = nat;

    set_zones_ncg_home(dd);
}
static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
{
    while (cg >= cginfo_mb->cg_end)
    {
        cginfo_mb++;
    }

    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
}
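
/* Illustrative note (assumption): for topologies with repeated molecule
 * blocks, cginfo is stored once per block and reused modulo cg_mod, so a
 * charge group in any copy of a molecule maps back onto the first copy:
 *
 *     info = cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 */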
static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
                          t_forcerec *fr,char *bLocalCG)
{
    cginfo_mb_t *cginfo_mb;
    int *cginfo;
    int cg;

    if (fr != NULL)
    {
        cginfo_mb = fr->cginfo_mb;
        cginfo    = fr->cginfo;

        for(cg=cg0; cg<cg1; cg++)
        {
            cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
        }
    }

    if (bLocalCG != NULL)
    {
        for(cg=cg0; cg<cg1; cg++)
        {
            bLocalCG[index_gl[cg]] = TRUE;
        }
    }
}
static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
{
    int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
    int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
    char *bLocalCG;

    bLocalCG = dd->comm->bLocalCG;

    if (dd->nat_tot > dd->gatindex_nalloc)
    {
        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
        srenew(dd->gatindex,dd->gatindex_nalloc);
    }

    nzone     = dd->comm->zones.n;
    zone2cg   = dd->comm->zones.cg_range;
    zone_ncg1 = dd->comm->zone_ncg1;
    index_gl  = dd->index_gl;
    gatindex  = dd->gatindex;

    if (zone2cg[1] != dd->ncg_home)
    {
        gmx_incons("dd->ncg_zone is not up to date");
    }

    /* Make the local to global and global to local atom index */
    a = dd->cgindex[cg_start];
    for(zone=0; zone<nzone; zone++)
    {
        if (zone == 0)
        {
            cg0 = cg_start;
        }
        else
        {
            cg0 = zone2cg[zone];
        }
        for(cg=cg0; cg<zone2cg[zone+1]; cg++)
        {
            zone1 = zone;
            if (cg - cg0 >= zone_ncg1[zone])
            {
                /* Signal that this cg is from more than one zone away */
                zone1 += nzone;
            }
            cg_gl = index_gl[cg];
            for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
            {
                gatindex[a] = a_gl;
                ga2la_set(dd->ga2la,a_gl,a,zone1);
                a++;
            }
        }
    }
}
static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
                          const char *where)
{
    int i,ngl,nerr;

    nerr = 0;
    if (bLocalCG == NULL)
    {
        return nerr;
    }
    for(i=0; i<dd->ncg_tot; i++)
    {
        if (!bLocalCG[dd->index_gl[i]])
        {
            fprintf(stderr,
                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
            nerr++;
        }
    }
    ngl = 0;
    for(i=0; i<ncg_sys; i++)
    {
        if (bLocalCG[i])
        {
            ngl++;
        }
    }
    if (ngl != dd->ncg_tot)
    {
        fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
        nerr++;
    }

    return nerr;
}
static void check_index_consistency(gmx_domdec_t *dd,
                                    int natoms_sys,int ncg_sys,
                                    const char *where)
{
    int  nerr,ngl,i,a,cell;
    int  *have;

    nerr = 0;

    if (dd->comm->DD_debug > 1)
    {
        snew(have,natoms_sys);
        for(a=0; a<dd->nat_tot; a++)
        {
            if (have[dd->gatindex[a]] > 0)
            {
                fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
            }
            else
            {
                have[dd->gatindex[a]] = a + 1;
            }
        }
        sfree(have);
    }

    snew(have,dd->nat_tot);

    ngl = 0;
    for(i=0; i<natoms_sys; i++)
    {
        if (ga2la_get(dd->ga2la,i,&a,&cell))
        {
            if (a >= dd->nat_tot)
            {
                fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
                nerr++;
            }
            else
            {
                have[a] = 1;
                if (dd->gatindex[a] != i)
                {
                    fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
                    nerr++;
                }
            }
            ngl++;
        }
    }
    if (ngl != dd->nat_tot)
    {
        fprintf(stderr,
                "DD node %d, %s: %d global atom indices, %d local atoms\n",
                dd->rank,where,ngl,dd->nat_tot);
    }
    for(a=0; a<dd->nat_tot; a++)
    {
        if (have[a] == 0)
        {
            fprintf(stderr,
                    "DD node %d, %s: local atom %d, global %d has no global index\n",
                    dd->rank,where,a+1,dd->gatindex[a]+1);
        }
    }
    sfree(have);

    nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);

    if (nerr > 0)
    {
        gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
                  dd->rank,where,nerr);
    }
}
static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
{
    int  i;
    char *bLocalCG;

    if (a_start == 0)
    {
        /* Clear the whole list without searching */
        ga2la_clear(dd->ga2la);
    }
    else
    {
        for(i=a_start; i<dd->nat_tot; i++)
        {
            ga2la_del(dd->ga2la,dd->gatindex[i]);
        }
    }

    bLocalCG = dd->comm->bLocalCG;
    if (bLocalCG)
    {
        for(i=cg_start; i<dd->ncg_tot; i++)
        {
            bLocalCG[dd->index_gl[i]] = FALSE;
        }
    }

    dd_clear_local_vsite_indices(dd);

    if (dd->constraints)
    {
        dd_clear_local_constraint_indices(dd);
    }
}
static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
{
    real grid_jump_limit;

    /* The distance between the boundaries of cells at distance
     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
     * and by the fact that cells should not be shifted by more than
     * half their size, such that cg's only shift by one cell
     * at redecomposition.
     */
    grid_jump_limit = comm->cellsize_limit;
    if (!comm->bVacDLBNoLimit)
    {
        grid_jump_limit = max(grid_jump_limit,
                              comm->cutoff/comm->cd[dim_ind].np);
    }

    return grid_jump_limit;
}
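
/* Worked example (illustrative): with cellsize_limit = 0.9 nm, a cut-off of
 * 1.2 nm and two pulses along dim_ind (np = 2), the jump limit becomes
 * max(0.9, 1.2/2) = 0.9 nm; with a single pulse it would be 1.2 nm.
 */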
static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int  d,dim;
    real limit,bfac;
    char buf[22];

    comm = dd->comm;

    for(d=1; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        limit = grid_jump_limit(comm,d);
        bfac = ddbox->box_size[dim];
        if (ddbox->tric_dir[dim])
        {
            bfac *= ddbox->skew_fac[dim];
        }
        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
        {
            gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
                      gmx_step_str(step,buf),
                      dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
        }
    }
}
static int dd_load_count(gmx_domdec_comm_t *comm)
{
    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
}

static float dd_force_load(gmx_domdec_comm_t *comm)
{
    float load;

    if (comm->eFlop)
    {
        load = comm->flop;
        if (comm->eFlop > 1)
        {
            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
        }
    }
    else
    {
        load = comm->cycl[ddCyclF];
        if (comm->cycl_n[ddCyclF] > 1)
        {
            /* Subtract the maximum of the last n cycle counts
             * to get rid of possible high counts due to other sources,
             * for instance system activity, that would otherwise
             * affect the dynamic load balancing.
             */
            load -= comm->cycl_max[ddCyclF];
        }
    }

    return load;
}
static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dimind,real **dim_f)
{
    gmx_domdec_comm_t *comm;
    int i;

    comm = dd->comm;

    if (dd->dim[dimind] != dimind)
    {
        *dim_f = NULL;
        return;
    }

    snew(*dim_f,dd->nc[dimind]+1);
    (*dim_f)[0] = 0;
    for(i=1; i<dd->nc[dimind]; i++)
    {
        if (comm->slb_frac[dimind])
        {
            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dimind][i-1];
        }
        else
        {
            (*dim_f)[i] = (real)i/(real)dd->nc[dimind];
        }
    }
    (*dim_f)[dd->nc[dimind]] = 1;
}
static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
                       int dimind,int nslab)
{
    int  pmeindex,slab,nso,i;
    ivec xyz;

    ddpme->dimind = dimind;
    ddpme->nslab  = nslab;

    if (nslab <= 1)
    {
        return;
    }

    nso = dd->comm->npmenodes/nslab;
    /* Determine for each PME slab the PP location range for dimension dim */
    snew(ddpme->pp_min,nslab);
    snew(ddpme->pp_max,nslab);
    for(slab=0; slab<nslab; slab++) {
        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
        ddpme->pp_max[slab] = 0;
    }
    for(i=0; i<dd->nnodes; i++) {
        ddindex2xyz(dd->nc,i,xyz);
        /* For y only use our y/z slab.
         * This assumes that the PME x grid size matches the DD grid size.
         */
        if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
            pmeindex = ddindex2pmeindex(dd,i);
            if (dimind == 0) {
                slab = pmeindex/nso;
            } else {
                slab = pmeindex % nslab;
            }
            ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
            ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
        }
    }

    set_slb_pme_dim_f(dd,ddpme->dimind,&ddpme->slb_dim_f);
}
int dd_pme_maxshift0(gmx_domdec_t *dd)
{
    return dd->comm->ddpme[0].maxshift;
}

int dd_pme_maxshift1(gmx_domdec_t *dd)
{
    return dd->comm->ddpme[1].maxshift;
}
static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
                             bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
{
    gmx_domdec_comm_t *comm;
    int  dim,nc,ns,s;
    int  *xmin,*xmax;
    real range,pme_boundary;
    int  sh;

    comm = dd->comm;
    dim = ddpme->dimind;
    nc  = dd->nc[dim];
    ns  = ddpme->nslab;

    if (dd->dim[dim] != dim)
    {
        /* PP decomposition is not along dim: the worst situation */
        sh = ns/2;
    }
    else if (ns <= 3 || (bUniform && ns == nc))
    {
        /* The optimal situation */
        sh = 1;
    }
    else
    {
        /* We need to check for all pme nodes which nodes they
         * could possibly need to communicate with.
         */
        xmin = ddpme->pp_min;
        xmax = ddpme->pp_max;
        /* Allow for atoms to be maximally 2/3 times the cut-off
         * out of their DD cell. This is a reasonable balance between
         * performance and support for most charge-group/cut-off
         * combinations.
         */
        range = 2.0/3.0*comm->cutoff/ddbox->box_size[dim];
        /* Avoid extra communication when we are exactly at a boundary */
        range *= 0.999;

        sh = 1;
        for(s=0; s<ns; s++)
        {
            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
            pme_boundary = (real)s/ns;
            while (sh+1 < ns &&
                   ((s-(sh+1) >= 0 &&
                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
                    (s-(sh+1) <  0 &&
                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
            {
                sh++;
            }
            pme_boundary = (real)(s+1)/ns;
            while (sh+1 < ns &&
                   ((s+(sh+1) <  ns &&
                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
                    (s+(sh+1) >= ns &&
                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
            {
                sh++;
            }
        }
    }

    ddpme->maxshift = sh;

    if (debug)
    {
        fprintf(debug,"PME slab communication range for dimind %d is %d\n",
                ddpme->dimind,ddpme->maxshift);
    }
}
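
/* Illustrative note: maxshift bounds how many PME slabs away a PP cell's
 * atoms may fall; e.g. with ns = 8 uniform slabs and sh = 1, PME slab s only
 * needs x coordinates from PP cells overlapping slabs s-1, s and s+1.
 */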
static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
{
    int d,dim;

    for(d=0; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        if (dim < ddbox->nboundeddim &&
            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
        {
            gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
                      dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
                      dd->nc[dim],dd->comm->cellsize_limit);
        }
    }
}
static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
                                  bool bMaster,ivec npulse)
{
    gmx_domdec_comm_t *comm;
    int  d,j;
    rvec cellsize_min;
    real *cell_x,cell_dx,cellsize;

    comm = dd->comm;

    for(d=0; d<DIM; d++)
    {
        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
        npulse[d] = 1;
        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
        {
            /* Uniform grid */
            cell_dx = ddbox->box_size[d]/dd->nc[d];
            if (bMaster)
            {
                for(j=0; j<dd->nc[d]+1; j++)
                {
                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
                }
            }
            else
            {
                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
            }
            cellsize = cell_dx*ddbox->skew_fac[d];
            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
            {
                npulse[d]++;
            }
            cellsize_min[d] = cellsize;
        }
        else
        {
            /* Statically load balanced grid */
            /* Also when we are not doing a master distribution we determine
             * all cell borders in a loop to obtain identical values
             * to the master distribution case and to determine npulse.
             */
            if (bMaster)
            {
                cell_x = dd->ma->cell_x[d];
            }
            else
            {
                snew(cell_x,dd->nc[d]+1);
            }
            cell_x[0] = ddbox->box0[d];
            for(j=0; j<dd->nc[d]; j++)
            {
                cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
                cell_x[j+1] = cell_x[j] + cell_dx;
                cellsize = cell_dx*ddbox->skew_fac[d];
                while (cellsize*npulse[d] < comm->cutoff &&
                       npulse[d] < dd->nc[d]-1)
                {
                    npulse[d]++;
                }
                cellsize_min[d] = min(cellsize_min[d],cellsize);
            }
            if (!bMaster)
            {
                comm->cell_x0[d] = cell_x[dd->ci[d]];
                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
                sfree(cell_x);
            }
        }
        /* The following limitation is to avoid that a cell would receive
         * some of its own home charge groups back over the periodic boundary.
         * Double charge groups cause trouble with the global indices.
         */
        if (d < ddbox->npbcdim &&
            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
        {
            gmx_fatal_collective(FARGS,DDMASTER(dd),
                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
                                 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
                                 comm->cutoff,
                                 dd->nc[d],dd->nc[d],
                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
        }
    }

    if (!comm->bDynLoadBal)
    {
        copy_rvec(cellsize_min,comm->cellsize_min);
    }

    for(d=0; d<comm->npmedecompdim; d++)
    {
        set_pme_maxshift(dd,&comm->ddpme[d],
                         comm->slb_frac[dd->dim[d]]==NULL,ddbox,
                         comm->ddpme[d].slb_dim_f);
    }
}
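/*
 * Editor's illustration (not part of the original source): the statically
 * load balanced branch above simply accumulates cell boundaries from the
 * user-supplied relative cell sizes.  A self-contained sketch with plain
 * doubles, assuming a hypothetical slb_frac[] that sums to 1:
 */
#if 0   /* illustrative sketch only, never compiled */
static void sketch_slb_boundaries(double box0,double box_size,
                                  const double *slb_frac,int nc,
                                  double *cell_x) /* nc+1 entries */
{
    int j;

    cell_x[0] = box0;
    for(j=0; j<nc; j++)
    {
        /* each cell gets its fraction of the box length */
        cell_x[j+1] = cell_x[j] + box_size*slb_frac[j];
    }
}
#endif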
static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                                                  int d,int dim,gmx_domdec_root_t *root,
                                                  gmx_ddbox_t *ddbox,
                                                  bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
{
    gmx_domdec_comm_t *comm;
    int  ncd,i,j,nmin,nmin_old;
    bool bLimLo,bLimHi;
    real *cell_size;
    real fac,halfway,cellsize_limit_f_i,region_size;
    bool bPBC,bLastHi=FALSE;
    int  nrange[]={range[0],range[1]};
    char buf[22];

    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    if (debug)
    {
        fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
    }

    /* First we need to check if the scaling does not make cells
     * smaller than the smallest allowed size.
     * We need to do this iteratively, since if a cell is too small,
     * it needs to be enlarged, which makes all the other cells smaller,
     * which could in turn make another cell smaller than allowed.
     */
    for(i=range[0]; i<range[1]; i++)
    {
        root->bCellMin[i] = FALSE;
    }
    nmin = 0;
    do
    {
        nmin_old = nmin;
        /* We need the total for normalization */
        fac = 0;
        for(i=range[0]; i<range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                fac += cell_size[i];
            }
        }
        fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
        /* Determine the cell boundaries */
        for(i=range[0]; i<range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                cell_size[i] *= fac;
                if (!bPBC && (i == 0 || i == dd->nc[dim]-1))
                {
                    cellsize_limit_f_i = 0;
                }
                else
                {
                    cellsize_limit_f_i = cellsize_limit_f;
                }
                if (cell_size[i] < cellsize_limit_f_i)
                {
                    root->bCellMin[i] = TRUE;
                    cell_size[i] = cellsize_limit_f_i;
                    nmin++;
                }
            }
            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
        }
    }
    while (nmin > nmin_old);

    i = range[1]-1;
    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
    /* For this check we should not use DD_CELL_MARGIN,
     * but a slightly smaller factor,
     * since rounding could get us below the limit.
     */
    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
    {
        gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                  gmx_step_str(step,buf),
                  dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
                  ncd,comm->cellsize_min[dim]);
    }

    root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);

    if (!bUniform)
    {
        /* Check that the boundary did not displace more than halfway
         * into each of the cells it bounds, as this could cause problems,
         * especially when the differences between cell sizes are large.
         * If changes are applied, they will not make cells smaller
         * than the cut-off, as we check all the boundaries which
         * might be affected by a change and if the old state was ok,
         * the cells will at most be shrunk back to their old size.
         */
        for(i=range[0]+1; i<range[1]; i++)
        {
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
            if (root->cell_f[i] < halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for(j=i+1; j<range[1]; j++)
                {
                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
                    }
                }
            }
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
            if (root->cell_f[i] > halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for(j=i-1; j>=range[0]+1; j--)
                {
                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
                    }
                }
            }
        }
    }

    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
     * for a and b nrange is used */
    if (d > 0)
    {
        /* Take care of the staggering of the cell boundaries */
        if (bUniform)
        {
            for(i=range[0]; i<range[1]; i++)
            {
                root->cell_f_max0[i] = root->cell_f[i];
                root->cell_f_min1[i] = root->cell_f[i+1];
            }
        }
        else
        {
            for(i=range[0]+1; i<range[1]; i++)
            {
                bLimLo = (root->cell_f[i] < root->bound_min[i]);
                bLimHi = (root->cell_f[i] > root->bound_max[i]);
                if (bLimLo && bLimHi)
                {
                    /* Both limits violated, try the best we can */
                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
                    nrange[0] = range[0];
                    nrange[1] = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    nrange[0] = i;
                    nrange[1] = range[1];
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    return;
                }
                else if (bLimLo)
                {
                    /* root->cell_f[i] = root->bound_min[i]; */
                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with a higher index */
                    bLastHi = FALSE;
                }
                else if (bLimHi && !bLastHi)
                {
                    bLastHi = TRUE;
                    if (nrange[1] < range[1])   /* found a LimLo before */
                    {
                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                        nrange[0] = nrange[1];
                    }
                    root->cell_f[i] = root->bound_max[i];
                    nrange[1] = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                    nrange[0] = i;
                    nrange[1] = range[1];
                }
            }
            if (nrange[1] < range[1])   /* found last a LimLo */
            {
                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                nrange[0] = nrange[1];
                nrange[1] = range[1];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
            else if (nrange[0] > range[0]) /* found at least one LimHi */
            {
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
        }
    }
}
static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
                                       int d,int dim,gmx_domdec_root_t *root,
                                       gmx_ddbox_t *ddbox,bool bDynamicBox,
                                       bool bUniform,gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int  ncd,d1,i,pos;
    real *cell_size;
    real load_aver,load_i,imbalance,change,change_max,sc;
    real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
    real change_limit = 0.1;
    real relax = 0.5;
    bool bPBC;
    int  range[] = { 0, 0 };
    char buf[22];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    /* Store the original boundaries */
    for(i=0; i<ncd+1; i++)
    {
        root->old_cell_f[i] = root->cell_f[i];
    }
    if (bUniform)
    {
        for(i=0; i<ncd; i++)
        {
            cell_size[i] = 1.0/ncd;
        }
    }
    else if (dd_load_count(comm))
    {
        load_aver = comm->load[d].sum_m/ncd;
        change_max = 0;
        for(i=0; i<ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change = -relax*imbalance;
            change_max = max(change_max,max(change,-change));
        }
        /* Limit the amount of scaling.
         * We need to use the same rescaling for all cells in one row,
         * otherwise the load balancing might not converge.
         */
        sc = relax;
        if (change_max > change_limit)
        {
            sc *= change_limit/change_max;
        }
        for(i=0; i<ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change = -sc*imbalance;
            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
        }
    }

    cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
    cellsize_limit_f *= DD_CELL_MARGIN;
    dist_min_f_hard   = grid_jump_limit(comm,d)/ddbox->box_size[dim];
    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
    if (ddbox->tric_dir[dim])
    {
        cellsize_limit_f /= ddbox->skew_fac[dim];
        dist_min_f       /= ddbox->skew_fac[dim];
    }
    if (bDynamicBox && d > 0)
    {
        dist_min_f *= DD_PRES_SCALE_MARGIN;
    }
    if (d > 0 && !bUniform)
    {
        /* Make sure that the grid is not shifted too much */
        for(i=1; i<ncd; i++) {
            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
            {
                gmx_incons("Inconsistent DD boundary staggering limits!");
            }
            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
            space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
            if (space > 0)
            {
                root->bound_min[i] += 0.5*space;
            }
            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
            space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
            if (space < 0)
            {
                root->bound_max[i] += 0.5*space;
            }
            if (debug)
            {
                fprintf(debug,
                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
                        d,i,
                        root->cell_f_max0[i-1] + dist_min_f,
                        root->bound_min[i],root->cell_f[i],root->bound_max[i],
                        root->cell_f_min1[i] - dist_min_f);
            }
        }
    }
    range[1] = ncd;
    root->cell_f[0]   = 0;
    root->cell_f[ncd] = 1;
    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);

    /* After the checks above, the cells should obey the cut-off
     * restrictions, but it does not hurt to check.
     */
    for(i=0; i<ncd; i++)
    {
        if (debug)
        {
            fprintf(debug,"Relative bounds dim %d cell %d: %f %f\n",
                    dim,i,root->cell_f[i],root->cell_f[i+1]);
        }

        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
            root->cell_f[i+1] - root->cell_f[i] <
            cellsize_limit_f/DD_CELL_MARGIN)
        {
            fprintf(stderr,
                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
                    gmx_step_str(step,buf),dim2char(dim),i,
                    (root->cell_f[i+1] - root->cell_f[i])
                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
        }
    }

    pos = ncd + 1;
    /* Store the cell boundaries of the lower dimensions at the end */
    for(d1=0; d1<d; d1++)
    {
        root->cell_f[pos++] = comm->cell_f0[d1];
        root->cell_f[pos++] = comm->cell_f1[d1];
    }

    if (d < comm->npmedecompdim)
    {
        /* The master determines the maximum shift for
         * the coordinate communication between separate PME nodes.
         */
        set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
    }
    root->cell_f[pos++] = comm->ddpme[0].maxshift;
    if (d >= 1)
    {
        root->cell_f[pos++] = comm->ddpme[1].maxshift;
    }
}
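/*
 * Editor's illustration (not part of the original source): the load-driven
 * resizing above is plain underrelaxation: cell i is scaled by
 * (1 - relax*imbalance_i) with relax = 0.5, and the whole row shares one
 * clipped scaling factor so no cell changes by more than 10% per step.
 * A per-cell sketch of the same update (the real code clips through a
 * common row factor sc instead; names here are hypothetical):
 */
#if 0   /* illustrative sketch only, never compiled */
static double sketch_dlb_resize(double size,double load_i,double load_aver)
{
    const double relax = 0.5,change_limit = 0.1;
    double imbalance,change;

    imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
    change    = -relax*imbalance;
    if (change >  change_limit) { change =  change_limit; }
    if (change < -change_limit) { change = -change_limit; }

    /* e.g. a cell 20% over-loaded (imbalance 0.2) shrinks by 10% */
    return size*(1 + change);
}
#endif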
static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
                                             gmx_ddbox_t *ddbox,int dimind)
{
    gmx_domdec_comm_t *comm;
    int dim;

    comm = dd->comm;

    /* Set the cell dimensions */
    dim = dd->dim[dimind];
    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
    if (dim >= ddbox->nboundeddim)
    {
        comm->cell_x0[dim] += ddbox->box0[dim];
        comm->cell_x1[dim] += ddbox->box0[dim];
    }
}
static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
                                         int d,int dim,real *cell_f_row,
                                         gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int d1,pos;

    comm = dd->comm;

#ifdef GMX_MPI
    /* Each node would only need to know two fractions,
     * but it is probably cheaper to broadcast the whole array.
     */
    MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
              0,comm->mpi_comm_load[d]);
#endif
    /* Copy the fractions for this dimension from the buffer */
    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
    /* The whole array was communicated, so set the buffer position */
    pos = dd->nc[dim] + 1;
    for(d1=0; d1<=d; d1++)
    {
        if (d1 < d)
        {
            /* Copy the cell fractions of the lower dimensions */
            comm->cell_f0[d1] = cell_f_row[pos++];
            comm->cell_f1[d1] = cell_f_row[pos++];
        }
        relative_to_absolute_cell_bounds(dd,ddbox,d1);
    }
    /* Convert the communicated shift from float to int */
    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
    if (d >= 1)
    {
        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
    }
}
static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
                                         gmx_ddbox_t *ddbox,bool bDynamicBox,
                                         bool bUniform,gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int  d,dim,d1;
    bool bRowMember,bRowRoot;
    real *cell_f_row;

    comm = dd->comm;

    for(d=0; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        bRowMember = TRUE;
        bRowRoot   = TRUE;
        for(d1=d; d1<dd->ndim; d1++)
        {
            if (dd->ci[dd->dim[d1]] > 0)
            {
                if (d1 > d)
                {
                    bRowMember = FALSE;
                }
                bRowRoot = FALSE;
            }
        }
        if (bRowMember)
        {
            if (bRowRoot)
            {
                set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
                                           ddbox,bDynamicBox,bUniform,step);
                cell_f_row = comm->root[d]->cell_f;
            }
            else
            {
                cell_f_row = comm->cell_f_row;
            }
            distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
        }
    }
}
static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
{
    int d;

    /* This function assumes the box is static and should therefore
     * not be called when the box has changed since the last
     * call to dd_partition_system.
     */
    for(d=0; d<dd->ndim; d++)
    {
        relative_to_absolute_cell_bounds(dd,ddbox,d);
    }
}
static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox,bool bDynamicBox,
                                  bool bUniform,bool bDoDLB,gmx_large_int_t step,
                                  gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    int dim;

    comm = dd->comm;

    if (bDoDLB)
    {
        wallcycle_start(wcycle,ewcDDCOMMBOUND);
        set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
        wallcycle_stop(wcycle,ewcDDCOMMBOUND);
    }
    else if (bDynamicBox)
    {
        set_dd_cell_sizes_dlb_nochange(dd,ddbox);
    }

    /* Set the dimensions for which no DD is used */
    for(dim=0; dim<DIM; dim++) {
        if (dd->nc[dim] == 1) {
            comm->cell_x0[dim] = 0;
            comm->cell_x1[dim] = ddbox->box_size[dim];
            if (dim >= ddbox->nboundeddim)
            {
                comm->cell_x0[dim] += ddbox->box0[dim];
                comm->cell_x1[dim] += ddbox->box0[dim];
            }
        }
    }
}
static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
{
    int d,np,i;
    gmx_domdec_comm_dim_t *cd;

    for(d=0; d<dd->ndim; d++)
    {
        cd = &dd->comm->cd[d];
        np = npulse[dd->dim[d]];
        if (np > cd->np_nalloc)
        {
            if (debug)
            {
                fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
                        dim2char(dd->dim[d]),np);
            }
            if (DDMASTER(dd) && cd->np_nalloc > 0)
            {
                fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
            }
            srenew(cd->ind,np);
            for(i=cd->np_nalloc; i<np; i++)
            {
                cd->ind[i].index  = NULL;
                cd->ind[i].nalloc = 0;
            }
            cd->np_nalloc = np;
        }
        cd->np = np;
    }
}
static void set_dd_cell_sizes(gmx_domdec_t *dd,
                              gmx_ddbox_t *ddbox,bool bDynamicBox,
                              bool bUniform,bool bDoDLB,gmx_large_int_t step,
                              gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    int  d;
    ivec npulse;

    comm = dd->comm;

    /* Copy the old cell boundaries for the cg displacement check */
    copy_rvec(comm->cell_x0,comm->old_cell_x0);
    copy_rvec(comm->cell_x1,comm->old_cell_x1);

    if (comm->bDynLoadBal)
    {
        if (DDMASTER(dd))
        {
            check_box_size(dd,ddbox);
        }
        set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
    }
    else
    {
        set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
        realloc_comm_ind(dd,npulse);
    }

    if (debug)
    {
        for(d=0; d<DIM; d++)
        {
            fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
                    d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
        }
    }
}
static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
                                  gmx_ddbox_t *ddbox,
                                  rvec cell_ns_x0,rvec cell_ns_x1,
                                  gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int  dim_ind,dim;
    char buf[22];

    comm = dd->comm;

    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];

        /* Without PBC we don't have restrictions on the outer cells */
        if (!(dim >= ddbox->npbcdim &&
              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
            comm->bDynLoadBal &&
            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
            comm->cellsize_min[dim])
        {
            gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
                      gmx_step_str(step,buf),dim2char(dim),
                      comm->cell_x1[dim] - comm->cell_x0[dim],
                      ddbox->skew_fac[dim],
                      dd->comm->cellsize_min[dim],
                      dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
        }
    }

    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
    {
        /* Communicate the boundaries and update cell_ns_x0/1 */
        dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
        if (dd->bGridJump && dd->ndim > 1)
        {
            check_grid_jump(step,dd,ddbox);
        }
    }
}
static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
{
    if (YY < npbcdim)
    {
        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
    }
    else
    {
        tcm[YY][XX] = 0;
    }
    if (ZZ < npbcdim)
    {
        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
    }
    else
    {
        tcm[ZZ][XX] = 0;
        tcm[ZZ][YY] = 0;
    }
}
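/*
 * Editor's worked example (not part of the original source): tcm converts
 * a Cartesian position into the lattice coordinate of dimension d via
 * pos_d = x[d] + sum_{j>d} x[j]*tcm[j][d].  For a box with
 * box[YY][XX] = 2 and box[YY][YY] = 4, tcm[YY][XX] = -2/4 = -0.5, so an
 * atom at y = 4 contributes -2 to its effective x, exactly undoing the
 * skew of 2 accumulated over the full y extent of the cell.
 */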
static void check_screw_box(matrix box)
{
    /* Mathematical limitation */
    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
    {
        gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
    }

    /* Limitation due to the asymmetry of the eighth shell method */
    if (box[ZZ][YY] != 0)
    {
        gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
    }
}
static void distribute_cg(FILE *fplog,gmx_large_int_t step,
                          matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
                          gmx_domdec_t *dd)
{
    gmx_domdec_master_t *ma;
    int  **tmp_ind=NULL,*tmp_nalloc=NULL;
    int  i,icg,j,k,k0,k1,d,npbcdim;
    matrix tcm;
    rvec box_size,cg_cm;
    ivec ind;
    real nrcg,inv_ncg,pos_d;
    int  *cgindex;
    bool bUnbounded,bScrew;
    char buf[22];

    ma = dd->ma;

    if (tmp_ind == NULL)
    {
        snew(tmp_nalloc,dd->nnodes);
        snew(tmp_ind,dd->nnodes);
        for(i=0; i<dd->nnodes; i++)
        {
            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
            snew(tmp_ind[i],tmp_nalloc[i]);
        }
    }

    /* Clear the count */
    for(i=0; i<dd->nnodes; i++)
    {
        ma->ncg[i] = 0;
        ma->nat[i] = 0;
    }

    make_tric_corr_matrix(dd->npbcdim,box,tcm);

    cgindex = cgs->index;

    /* Compute the center of geometry for all charge groups */
    for(icg=0; icg<cgs->nr; icg++)
    {
        k0   = cgindex[icg];
        k1   = cgindex[icg+1];
        nrcg = k1 - k0;
        if (nrcg == 1)
        {
            copy_rvec(pos[k0],cg_cm);
        }
        else
        {
            inv_ncg = 1.0/nrcg;

            clear_rvec(cg_cm);
            for(k=k0; (k<k1); k++)
            {
                rvec_inc(cg_cm,pos[k]);
            }
            for(d=0; (d<DIM); d++)
            {
                cg_cm[d] *= inv_ncg;
            }
        }
        /* Put the charge group in the box and determine the cell index */
        for(d=DIM-1; d>=0; d--) {
            pos_d = cg_cm[d];
            if (d < dd->npbcdim)
            {
                bScrew = (dd->bScrewPBC && d == XX);
                if (tric_dir[d] && dd->nc[d] > 1)
                {
                    /* Use triclinic coordinates for this dimension */
                    for(j=d+1; j<DIM; j++)
                    {
                        pos_d += cg_cm[j]*tcm[j][d];
                    }
                }
                while(pos_d >= box[d][d])
                {
                    pos_d -= box[d][d];
                    rvec_dec(cg_cm,box[d]);
                    if (bScrew)
                    {
                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
                    }
                    for(k=k0; (k<k1); k++)
                    {
                        rvec_dec(pos[k],box[d]);
                        if (bScrew)
                        {
                            pos[k][YY] = box[YY][YY] - pos[k][YY];
                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
                        }
                    }
                }
                while(pos_d < 0)
                {
                    pos_d += box[d][d];
                    rvec_inc(cg_cm,box[d]);
                    if (bScrew)
                    {
                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
                    }
                    for(k=k0; (k<k1); k++)
                    {
                        rvec_inc(pos[k],box[d]);
                        if (bScrew)
                        {
                            pos[k][YY] = box[YY][YY] - pos[k][YY];
                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
                        }
                    }
                }
            }
            /* This could be done more efficiently */
            ind[d] = 0;
            while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
            {
                ind[d]++;
            }
        }
        i = dd_index(dd->nc,ind);
        if (ma->ncg[i] == tmp_nalloc[i])
        {
            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
            srenew(tmp_ind[i],tmp_nalloc[i]);
        }
        tmp_ind[i][ma->ncg[i]] = icg;
        ma->ncg[i]++;
        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
    }

    k1 = 0;
    for(i=0; i<dd->nnodes; i++)
    {
        ma->index[i] = k1;
        for(k=0; k<ma->ncg[i]; k++)
        {
            ma->cg[k1++] = tmp_ind[i][k];
        }
    }
    ma->index[dd->nnodes] = k1;

    for(i=0; i<dd->nnodes; i++)
    {
        sfree(tmp_ind[i]);
    }
    sfree(tmp_ind);
    sfree(tmp_nalloc);

    if (fplog)
    {
        fprintf(fplog,"Charge group distribution at step %s:",
                gmx_step_str(step,buf));
        for(i=0; i<dd->nnodes; i++)
        {
            fprintf(fplog," %d",ma->ncg[i]);
        }
        fprintf(fplog,"\n");
    }
}
static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
                                t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
                                rvec pos[])
{
    gmx_domdec_master_t *ma=NULL;
    ivec npulse;
    int  i,cg_gl;
    int  *ibuf,buf2[2] = { 0, 0 };

    if (DDMASTER(dd))
    {
        ma = dd->ma;

        if (dd->bScrewPBC)
        {
            check_screw_box(box);
        }

        set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);

        distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
        for(i=0; i<dd->nnodes; i++)
        {
            ma->ibuf[2*i]   = ma->ncg[i];
            ma->ibuf[2*i+1] = ma->nat[i];
        }
        ibuf = ma->ibuf;
    }
    else
    {
        ibuf = NULL;
    }
    dd_scatter(dd,2*sizeof(int),ibuf,buf2);

    dd->ncg_home = buf2[0];
    dd->nat_home = buf2[1];
    dd->ncg_tot  = dd->ncg_home;
    dd->nat_tot  = dd->nat_home;
    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
    {
        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
        srenew(dd->index_gl,dd->cg_nalloc);
        srenew(dd->cgindex,dd->cg_nalloc+1);
    }
    if (DDMASTER(dd))
    {
        for(i=0; i<dd->nnodes; i++)
        {
            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
        }
    }

    dd_scatterv(dd,
                DDMASTER(dd) ? ma->ibuf : NULL,
                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
                DDMASTER(dd) ? ma->cg : NULL,
                dd->ncg_home*sizeof(int),dd->index_gl);

    /* Determine the home charge group sizes */
    dd->cgindex[0] = 0;
    for(i=0; i<dd->ncg_home; i++)
    {
        cg_gl = dd->index_gl[i];
        dd->cgindex[i+1] =
            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
    }

    if (debug)
    {
        fprintf(debug,"Home charge groups:\n");
        for(i=0; i<dd->ncg_home; i++)
        {
            fprintf(debug," %d",dd->index_gl[i]);
            if (i % 10 == 9)
            {
                fprintf(debug,"\n");
            }
        }
        fprintf(debug,"\n");
    }
}
static int compact_and_copy_vec_at(int ncg,int *move,
                                   int *cgindex,
                                   int nvec,int vec,
                                   rvec *src,gmx_domdec_comm_t *comm,
                                   bool bCompact)
{
    int m,icg,i,i0,i1,nrcg;
    int home_pos;
    int pos_vec[DIM*2];

    home_pos = 0;

    for(m=0; m<DIM*2; m++)
    {
        pos_vec[m] = 0;
    }

    i0 = 0;
    for(icg=0; icg<ncg; icg++)
    {
        i1 = cgindex[icg+1];
        m  = move[icg];
        if (m == -1)
        {
            if (bCompact)
            {
                /* Compact the home array in place */
                for(i=i0; i<i1; i++)
                {
                    copy_rvec(src[i],src[home_pos++]);
                }
            }
        }
        else
        {
            /* Copy to the communication buffer */
            nrcg = i1 - i0;
            pos_vec[m] += 1 + vec*nrcg;
            for(i=i0; i<i1; i++)
            {
                copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
            }
            pos_vec[m] += (nvec - vec - 1)*nrcg;
        }
        if (!bCompact)
        {
            home_pos += i1 - i0;
        }
        i0 = i1;
    }

    return home_pos;
}
static int compact_and_copy_vec_cg(int ncg,int *move,
                                   int *cgindex,
                                   int nvec,rvec *src,gmx_domdec_comm_t *comm,
                                   bool bCompact)
{
    int m,icg,i0,i1,nrcg;
    int home_pos;
    int pos_vec[DIM*2];

    home_pos = 0;

    for(m=0; m<DIM*2; m++)
    {
        pos_vec[m] = 0;
    }

    i0 = 0;
    for(icg=0; icg<ncg; icg++)
    {
        i1 = cgindex[icg+1];
        m  = move[icg];
        if (m == -1)
        {
            if (bCompact)
            {
                /* Compact the home array in place */
                copy_rvec(src[icg],src[home_pos++]);
            }
        }
        else
        {
            nrcg = i1 - i0;
            /* Copy to the communication buffer */
            copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
            pos_vec[m] += 1 + nrcg*nvec;
        }
        i0 = i1;
    }
    if (!bCompact)
    {
        home_pos = ncg;
    }

    return home_pos;
}
static int compact_ind(int ncg,int *move,
                       int *index_gl,int *cgindex,
                       int *gatindex,
                       gmx_ga2la_t ga2la,char *bLocalCG,
                       int *cginfo)
{
    int cg,nat,a0,a1,a,a_gl;
    int home_pos;

    home_pos = 0;
    nat = 0;
    for(cg=0; cg<ncg; cg++)
    {
        a0 = cgindex[cg];
        a1 = cgindex[cg+1];
        if (move[cg] == -1)
        {
            /* Compact the home arrays in place.
             * Anything that can be done here avoids access to global arrays.
             */
            cgindex[home_pos] = nat;
            for(a=a0; a<a1; a++)
            {
                a_gl = gatindex[a];
                gatindex[nat] = a_gl;
                /* The cell number stays 0, so we don't need to set it */
                ga2la_change_la(ga2la,a_gl,nat);
                nat++;
            }
            index_gl[home_pos] = index_gl[cg];
            cginfo[home_pos]   = cginfo[cg];
            /* The charge group remains local, so bLocalCG does not change */
            home_pos++;
        }
        else
        {
            /* Clear the global indices */
            for(a=a0; a<a1; a++)
            {
                ga2la_del(ga2la,gatindex[a]);
            }
            if (bLocalCG)
            {
                bLocalCG[index_gl[cg]] = FALSE;
            }
        }
    }
    cgindex[home_pos] = nat;

    return home_pos;
}
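/*
 * Editor's illustration (not part of the original source): stripped of the
 * global-index bookkeeping, the compaction above is the classic stable
 * two-pointer filter; kept entries slide down to the write position:
 */
#if 0   /* illustrative sketch only, never compiled */
static int sketch_compact(int *a,int n,const int *keep)
{
    int i,home_pos = 0;

    for(i=0; i<n; i++)
    {
        if (keep[i])
        {
            a[home_pos++] = a[i];
        }
    }

    return home_pos; /* the new length */
}
#endif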
static void clear_and_mark_ind(int ncg,int *move,
                               int *index_gl,int *cgindex,int *gatindex,
                               gmx_ga2la_t ga2la,char *bLocalCG,
                               int *cell_index)
{
    int cg,a0,a1,a;

    for(cg=0; cg<ncg; cg++)
    {
        if (move[cg] >= 0)
        {
            a0 = cgindex[cg];
            a1 = cgindex[cg+1];
            /* Clear the global indices */
            for(a=a0; a<a1; a++)
            {
                ga2la_del(ga2la,gatindex[a]);
            }
            if (bLocalCG)
            {
                bLocalCG[index_gl[cg]] = FALSE;
            }
            /* Signal that this cg has moved using the ns cell index.
             * Here we set it to -1.
             * fill_grid will change it from -1 to 4*grid->ncells.
             */
            cell_index[cg] = -1;
        }
    }
}
static void print_cg_move(FILE *fplog,
                          gmx_domdec_t *dd,
                          gmx_large_int_t step,int cg,int dim,int dir,
                          real limitd,
                          rvec cm_old,rvec cm_new,real pos_d)
{
    gmx_domdec_comm_t *comm;
    char buf[22];

    comm = dd->comm;

    fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
    fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
            ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
    fprintf(fplog,"distance out of cell %f\n",
            dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
    fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
            cm_old[XX],cm_old[YY],cm_old[ZZ]);
    fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
            cm_new[XX],cm_new[YY],cm_new[ZZ]);
    fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
            dim2char(dim),
            comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
    fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
            dim2char(dim),
            comm->cell_x0[dim],comm->cell_x1[dim]);
}
static void cg_move_error(FILE *fplog,
                          gmx_domdec_t *dd,
                          gmx_large_int_t step,int cg,int dim,int dir,
                          real limitd,
                          rvec cm_old,rvec cm_new,real pos_d)
{
    if (fplog)
    {
        print_cg_move(fplog, dd,step,cg,dim,dir,limitd,cm_old,cm_new,pos_d);
    }
    print_cg_move(stderr,dd,step,cg,dim,dir,limitd,cm_old,cm_new,pos_d);
    gmx_fatal(FARGS,
              "A charge group moved too far between two domain decomposition steps\n"
              "This usually means that your system is not well equilibrated");
}
static void rotate_state_atom(t_state *state,int a)
{
    int est;

    for(est=0; est<estNR; est++)
    {
        if (EST_DISTR(est) && state->flags & (1<<est)) {
            switch (est) {
            case estX:
                /* Rotate the complete state; for a rectangular box only */
                state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
                state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
                break;
            case estV:
                state->v[a][YY] = -state->v[a][YY];
                state->v[a][ZZ] = -state->v[a][ZZ];
                break;
            case estSDX:
                state->sd_X[a][YY] = -state->sd_X[a][YY];
                state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
                break;
            case estCGP:
                state->cg_p[a][YY] = -state->cg_p[a][YY];
                state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
                break;
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            case estORIRE_DTAV:
                /* These are distances, so not affected by rotation */
                break;
            default:
                gmx_incons("Unknown state entry encountered in rotate_state_atom");
            }
        }
    }
}
static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
                              gmx_domdec_t *dd,ivec tric_dir,
                              t_state *state,rvec **f,
                              t_forcerec *fr,t_mdatoms *md,
                              bool bCompact,
                              t_nrnb *nrnb)
{
    int  *move;
    int  npbcdim;
    int  ncg[DIM*2],nat[DIM*2];
    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
    int  sbuf[2],rbuf[2];
    int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
    int  flag;
    bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
    bool bScrew;
    ivec dev;
    real inv_ncg,pos_d;
    matrix tcm;
    rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
    int  *cgindex;
    cginfo_mb_t *cginfo_mb;
    gmx_domdec_comm_t *comm;

    if (dd->bScrewPBC)
    {
        check_screw_box(state->box);
    }

    comm  = dd->comm;
    cg_cm = fr->cg_cm;

    for(i=0; i<estNR; i++)
    {
        if (EST_DISTR(i))
        {
            switch (i)
            {
            case estX:   /* Always present */            break;
            case estV:   bV   = (state->flags & (1<<i)); break;
            case estSDX: bSDX = (state->flags & (1<<i)); break;
            case estCGP: bCGP = (state->flags & (1<<i)); break;
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
            case estORIRE_DTAV:
                /* No processing required */
                break;
            default:
                gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
            }
        }
    }

    if (dd->ncg_tot > comm->nalloc_int)
    {
        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
        srenew(comm->buf_int,comm->nalloc_int);
    }
    move = comm->buf_int;

    /* Clear the count */
    for(c=0; c<dd->ndim*2; c++)
    {
        ncg[c] = 0;
        nat[c] = 0;
    }

    npbcdim = dd->npbcdim;

    for(d=0; (d<DIM); d++)
    {
        limitd[d] = dd->comm->cellsize_min[d];
        if (d >= npbcdim && dd->ci[d] == 0)
        {
            cell_x0[d] = -GMX_FLOAT_MAX;
        }
        else
        {
            cell_x0[d] = comm->cell_x0[d];
        }
        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
        {
            cell_x1[d] = GMX_FLOAT_MAX;
        }
        else
        {
            cell_x1[d] = comm->cell_x1[d];
        }
        limit0[d] = comm->old_cell_x0[d] - limitd[d];
        limit1[d] = comm->old_cell_x1[d] + limitd[d];
    }

    make_tric_corr_matrix(npbcdim,state->box,tcm);

    cgindex = dd->cgindex;

    /* Compute the center of geometry for all home charge groups
     * and put them in the box and determine where they should go.
     */
    for(cg=0; cg<dd->ncg_home; cg++)
    {
        k0   = cgindex[cg];
        k1   = cgindex[cg+1];
        nrcg = k1 - k0;
        if (nrcg == 1)
        {
            copy_rvec(state->x[k0],cm_new);
        }
        else
        {
            inv_ncg = 1.0/nrcg;

            clear_rvec(cm_new);
            for(k=k0; (k<k1); k++)
            {
                rvec_inc(cm_new,state->x[k]);
            }
            for(d=0; (d<DIM); d++)
            {
                cm_new[d] = inv_ncg*cm_new[d];
            }
        }

        clear_ivec(dev);
        /* Do pbc and check DD cell boundary crossings */
        for(d=DIM-1; d>=0; d--)
        {
            if (dd->nc[d] > 1)
            {
                bScrew = (dd->bScrewPBC && d == XX);
                /* Determine the location of this cg in lattice coordinates */
                pos_d = cm_new[d];
                if (tric_dir[d])
                {
                    for(d2=d+1; d2<DIM; d2++)
                    {
                        pos_d += cm_new[d2]*tcm[d2][d];
                    }
                }
                /* Put the charge group in the triclinic unit-cell */
                if (pos_d >= cell_x1[d])
                {
                    if (pos_d >= limit1[d])
                    {
                        cg_move_error(fplog,dd,step,cg,d,1,limitd[d],
                                      cg_cm[cg],cm_new,pos_d);
                    }
                    dev[d] = 1;
                    if (dd->ci[d] == dd->nc[d] - 1)
                    {
                        rvec_dec(cm_new,state->box[d]);
                        if (bScrew)
                        {
                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
                        }
                        for(k=k0; (k<k1); k++)
                        {
                            rvec_dec(state->x[k],state->box[d]);
                            if (bScrew)
                            {
                                rotate_state_atom(state,k);
                            }
                        }
                    }
                }
                else if (pos_d < cell_x0[d])
                {
                    if (pos_d < limit0[d])
                    {
                        cg_move_error(fplog,dd,step,cg,d,-1,limitd[d],
                                      cg_cm[cg],cm_new,pos_d);
                    }
                    dev[d] = -1;
                    if (dd->ci[d] == 0)
                    {
                        rvec_inc(cm_new,state->box[d]);
                        if (bScrew)
                        {
                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
                        }
                        for(k=k0; (k<k1); k++)
                        {
                            rvec_inc(state->x[k],state->box[d]);
                            if (bScrew)
                            {
                                rotate_state_atom(state,k);
                            }
                        }
                    }
                }
            }
            else if (d < npbcdim)
            {
                /* Put the charge group in the rectangular unit-cell */
                while (cm_new[d] >= state->box[d][d])
                {
                    rvec_dec(cm_new,state->box[d]);
                    for(k=k0; (k<k1); k++)
                    {
                        rvec_dec(state->x[k],state->box[d]);
                    }
                }
                while (cm_new[d] < 0)
                {
                    rvec_inc(cm_new,state->box[d]);
                    for(k=k0; (k<k1); k++)
                    {
                        rvec_inc(state->x[k],state->box[d]);
                    }
                }
            }
        }

        copy_rvec(cm_new,cg_cm[cg]);

        /* Determine where this cg should go */
        flag = 0;
        mc   = -1;
        for(d=0; d<dd->ndim; d++)
        {
            dim = dd->dim[d];
            if (dev[dim] == 1)
            {
                flag |= DD_FLAG_FW(d);
                if (mc == -1)
                {
                    mc = d*2;
                }
            }
            else if (dev[dim] == -1)
            {
                flag |= DD_FLAG_BW(d);
                if (mc == -1)
                {
                    if (dd->nc[dim] > 2)
                    {
                        mc = d*2 + 1;
                    }
                    else
                    {
                        mc = d*2;
                    }
                }
            }
        }
        move[cg] = mc;
        if (mc >= 0)
        {
            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
            {
                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
                srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
            }
            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
            /* We store the cg size in the lower 16 bits
             * and the place where the charge group should go
             * in the next 6 bits. This saves some communication volume.
             */
            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
            ncg[mc] += 1;
            nat[mc] += nrcg;
        }
    }

    inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
    inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);

    nvec = 1;
    if (bV)
    {
        nvec++;
    }
    if (bSDX)
    {
        nvec++;
    }
    if (bCGP)
    {
        nvec++;
    }

    /* Make sure the communication buffers are large enough */
    for(mc=0; mc<dd->ndim*2; mc++)
    {
        nvr = ncg[mc] + nat[mc]*nvec;
        if (nvr > comm->cgcm_state_nalloc[mc])
        {
            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
            srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
        }
    }

    /* Recalculating cg_cm might be cheaper than communicating,
     * but that could give rise to rounding issues.
     */
    home_pos_cg =
        compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
                                nvec,cg_cm,comm,bCompact);

    vec = 0;
    home_pos_at =
        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
                                nvec,vec++,state->x,comm,bCompact);
    if (bV)
    {
        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
                                nvec,vec++,state->v,comm,bCompact);
    }
    if (bSDX)
    {
        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
                                nvec,vec++,state->sd_X,comm,bCompact);
    }
    if (bCGP)
    {
        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
                                nvec,vec++,state->cg_p,comm,bCompact);
    }

    if (bCompact)
    {
        compact_ind(dd->ncg_home,move,
                    dd->index_gl,dd->cgindex,dd->gatindex,
                    dd->ga2la,comm->bLocalCG,
                    fr->cginfo);
    }
    else
    {
        clear_and_mark_ind(dd->ncg_home,move,
                           dd->index_gl,dd->cgindex,dd->gatindex,
                           dd->ga2la,comm->bLocalCG,
                           fr->ns.grid->cell_index);
    }

    cginfo_mb = fr->cginfo_mb;

    ncg_stay_home = home_pos_cg;
    for(d=0; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        ncg_recv = 0;
        nat_recv = 0;
        nvr      = 0;
        for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
        {
            cdd = d*2 + dir;
            /* Communicate the cg and atom counts */
            sbuf[0] = ncg[cdd];
            sbuf[1] = nat[cdd];
            if (debug)
            {
                fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
                        d,dir,sbuf[0],sbuf[1]);
            }
            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);

            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
            {
                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
                srenew(comm->buf_int,comm->nalloc_int);
            }

            /* Communicate the charge group indices, sizes and flags */
            dd_sendrecv_int(dd, d, dir,
                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);

            nvs = ncg[cdd] + nat[cdd]*nvec;
            i   = rbuf[0]  + rbuf[1] *nvec;
            vec_rvec_check_alloc(&comm->vbuf,nvr+i);

            /* Communicate cgcm and state */
            dd_sendrecv_rvec(dd, d, dir,
                             comm->cgcm_state[cdd], nvs,
                             comm->vbuf.v+nvr, i);
            ncg_recv += rbuf[0];
            nat_recv += rbuf[1];
            nvr      += i;
        }

        /* Process the received charge groups */
        buf_pos = 0;
        for(cg=0; cg<ncg_recv; cg++)
        {
            flag = comm->buf_int[cg*DD_CGIBS+1];
            mc = -1;
            if (d < dd->ndim-1)
            {
                /* Check which direction this cg should go */
                for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
                {
                    if (dd->bGridJump)
                    {
                        /* The cell boundaries for dimension d2 are not equal
                         * for each cell row of the lower dimension(s),
                         * therefore we might need to redetermine where
                         * this cg should go.
                         */
                        dim2 = dd->dim[d2];
                        /* If this cg crosses the box boundary in dimension d2
                         * we can use the communicated flag, so we do not
                         * have to worry about pbc.
                         */
                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
                               (flag & DD_FLAG_FW(d2))) ||
                              (dd->ci[dim2] == 0 &&
                               (flag & DD_FLAG_BW(d2)))))
                        {
                            /* Clear the two flags for this dimension */
                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
                            /* Determine the location of this cg
                             * in lattice coordinates
                             */
                            pos_d = comm->vbuf.v[buf_pos][dim2];
                            if (tric_dir[dim2])
                            {
                                for(d3=dim2+1; d3<DIM; d3++)
                                {
                                    pos_d +=
                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
                                }
                            }
                            /* Check if we are not at the box edge.
                             * pbc is only handled in the first step above,
                             * but this check could move over pbc while
                             * the first step did not due to different rounding.
                             */
                            if (pos_d >= cell_x1[dim2] &&
                                dd->ci[dim2] != dd->nc[dim2]-1)
                            {
                                flag |= DD_FLAG_FW(d2);
                            }
                            else if (pos_d < cell_x0[dim2] &&
                                     dd->ci[dim2] != 0)
                            {
                                flag |= DD_FLAG_BW(d2);
                            }
                            comm->buf_int[cg*DD_CGIBS+1] = flag;
                        }
                    }
                    /* Set to which neighboring cell this cg should go */
                    if (flag & DD_FLAG_FW(d2))
                    {
                        mc = d2*2;
                    }
                    else if (flag & DD_FLAG_BW(d2))
                    {
                        if (dd->nc[dd->dim[d2]] > 2)
                        {
                            mc = d2*2+1;
                        }
                        else
                        {
                            mc = d2*2;
                        }
                    }
                }
            }

            nrcg = flag & DD_FLAG_NRCG;
            if (mc == -1)
            {
                if (home_pos_cg+1 > dd->cg_nalloc)
                {
                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
                    srenew(dd->index_gl,dd->cg_nalloc);
                    srenew(dd->cgindex,dd->cg_nalloc+1);
                }
                /* Set the global charge group index and size */
                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
                /* Copy the state from the buffer */
                if (home_pos_cg >= fr->cg_nalloc)
                {
                    dd_realloc_fr_cg(fr,home_pos_cg+1);
                    cg_cm = fr->cg_cm;
                }
                copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
                /* Set the cginfo */
                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
                                                   dd->index_gl[home_pos_cg]);
                if (comm->bLocalCG)
                {
                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
                }

                if (home_pos_at+nrcg > state->nalloc)
                {
                    dd_realloc_state(state,f,home_pos_at+nrcg);
                }
                for(i=0; i<nrcg; i++)
                {
                    copy_rvec(comm->vbuf.v[buf_pos++],
                              state->x[home_pos_at+i]);
                }
                if (bV)
                {
                    for(i=0; i<nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->v[home_pos_at+i]);
                    }
                }
                if (bSDX)
                {
                    for(i=0; i<nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->sd_X[home_pos_at+i]);
                    }
                }
                if (bCGP)
                {
                    for(i=0; i<nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->cg_p[home_pos_at+i]);
                    }
                }
                home_pos_cg += 1;
                home_pos_at += nrcg;
            }
            else
            {
                /* Reallocate the buffers if necessary */
                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
                {
                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
                    srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
                }
                nvr = ncg[mc] + nat[mc]*nvec;
                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
                {
                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
                    srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
                }
                /* Copy from the receive to the send buffers */
                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
                       comm->buf_int + cg*DD_CGIBS,
                       DD_CGIBS*sizeof(int));
                memcpy(comm->cgcm_state[mc][nvr],
                       comm->vbuf.v[buf_pos],
                       (1+nrcg*nvec)*sizeof(rvec));
                buf_pos += 1 + nrcg*nvec;
                ncg[mc] += 1;
                nat[mc] += nrcg;
            }
        }
    }

    /* With sorting (!bCompact) the indices are now only partially up to date
     * and ncg_home and nat_home are not the real count, since there are
     * "holes" in the arrays for the charge groups that moved to neighbors.
     */
    dd->ncg_home = home_pos_cg;
    dd->nat_home = home_pos_at;

    if (debug)
    {
        fprintf(debug,"Finished repartitioning\n");
    }

    return ncg_stay_home;
}
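/*
 * Editor's illustration (not part of the original source): the second int
 * per charge group in cggl_flag packs the group size in the low 16 bits
 * and the move flags above them, as described at the packing site above.
 * Decoding with the DD_FLAG_* macros used in this file:
 */
#if 0   /* illustrative sketch only, never compiled */
static void sketch_decode_cggl_flag(int word,int d,
                                    int *nrcg,int *fw,int *bw)
{
    *nrcg = word & DD_FLAG_NRCG;         /* low 16 bits: #atoms in the cg */
    *fw   = (word & DD_FLAG_FW(d)) != 0; /* move forward in DD dim d?  */
    *bw   = (word & DD_FLAG_BW(d)) != 0; /* move backward in DD dim d? */
}
#endif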
void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
{
    dd->comm->cycl[ddCycl] += cycles;
    dd->comm->cycl_n[ddCycl]++;
    if (cycles > dd->comm->cycl_max[ddCycl])
    {
        dd->comm->cycl_max[ddCycl] = cycles;
    }
}
static double force_flop_count(t_nrnb *nrnb)
{
    int         i;
    double      sum;
    const char *name;

    sum = 0;
    for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
    {
        /* To get closer to the real timings, we halve the count
         * for the normal loops and again halve it for water loops.
         */
        name = nrnb_str(i);
        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
        {
            sum += nrnb->n[i]*0.25*cost_nrnb(i);
        }
        else
        {
            sum += nrnb->n[i]*0.50*cost_nrnb(i);
        }
    }
    for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
    {
        name = nrnb_str(i);
        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
        {
            sum += nrnb->n[i]*cost_nrnb(i);
        }
    }
    for(i=eNR_BONDS; i<=eNR_WALLS; i++)
    {
        sum += nrnb->n[i]*cost_nrnb(i);
    }

    return sum;
}
void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
{
    if (dd->comm->eFlop)
    {
        dd->comm->flop -= force_flop_count(nrnb);
    }
}

void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
{
    if (dd->comm->eFlop)
    {
        dd->comm->flop += force_flop_count(nrnb);
        dd->comm->flop_n++;
    }
}
static void clear_dd_cycle_counts(gmx_domdec_t *dd)
{
    int i;

    for(i=0; i<ddCyclNr; i++)
    {
        dd->comm->cycl[i]     = 0;
        dd->comm->cycl_n[i]   = 0;
        dd->comm->cycl_max[i] = 0;
    }
    dd->comm->flop   = 0;
    dd->comm->flop_n = 0;
}
static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    gmx_domdec_load_t *load;
    gmx_domdec_root_t *root=NULL;
    int  d,dim,cid,i,pos;
    float cell_frac=0,sbuf[DD_NLOAD_MAX];
    bool bSepPME;

    if (debug)
    {
        fprintf(debug,"get_load_distribution start\n");
    }

    wallcycle_start(wcycle,ewcDDCOMMLOAD);

    comm = dd->comm;

    bSepPME = (dd->pme_nodeid >= 0);

    for(d=dd->ndim-1; d>=0; d--)
    {
        dim = dd->dim[d];
        /* Check if we participate in the communication in this dimension */
        if (d == dd->ndim-1 ||
            (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
        {
            load = &comm->load[d];
            if (dd->bGridJump)
            {
                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
            }
            pos = 0;
            if (d == dd->ndim-1)
            {
                sbuf[pos++] = dd_force_load(comm);
                sbuf[pos++] = sbuf[0];
                if (dd->bGridJump)
                {
                    sbuf[pos++] = sbuf[0];
                    sbuf[pos++] = cell_frac;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
                    sbuf[pos++] = comm->cycl[ddCyclPME];
                }
            }
            else
            {
                sbuf[pos++] = comm->load[d+1].sum;
                sbuf[pos++] = comm->load[d+1].max;
                if (dd->bGridJump)
                {
                    sbuf[pos++] = comm->load[d+1].sum_m;
                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
                    sbuf[pos++] = comm->load[d+1].flags;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->load[d+1].mdf;
                    sbuf[pos++] = comm->load[d+1].pme;
                }
            }
            load->nload = pos;
            /* Communicate a row in DD direction d.
             * The communicators are setup such that the root always has rank 0.
             */
#ifdef GMX_MPI
            MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
                       load->load,load->nload*sizeof(float),MPI_BYTE,
                       0,comm->mpi_comm_load[d]);
#endif
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* We are the root, process this row */
                if (comm->bDynLoadBal)
                {
                    root = comm->root[d];
                }
                load->sum      = 0;
                load->max      = 0;
                load->sum_m    = 0;
                load->cvol_min = 1;
                load->flags    = 0;
                load->mdf      = 0;
                load->pme      = 0;
                pos = 0;
                for(i=0; i<dd->nc[dim]; i++)
                {
                    load->sum += load->load[pos++];
                    load->max = max(load->max,load->load[pos]);
                    pos++;
                    if (dd->bGridJump)
                    {
                        if (root->bLimited)
                        {
                            /* This direction could not be load balanced properly,
                             * therefore we need to use the maximum instead of
                             * the average load.
                             */
                            load->sum_m = max(load->sum_m,load->load[pos]);
                        }
                        else
                        {
                            load->sum_m += load->load[pos];
                        }
                        pos++;
                        load->cvol_min = min(load->cvol_min,load->load[pos]);
                        pos++;
                        if (d < dd->ndim-1)
                        {
                            load->flags = (int)(load->load[pos++] + 0.5);
                        }
                        if (d > 0)
                        {
                            root->cell_f_max0[i] = load->load[pos++];
                            root->cell_f_min1[i] = load->load[pos++];
                        }
                    }
                    if (bSepPME)
                    {
                        load->mdf = max(load->mdf,load->load[pos]);
                        pos++;
                        load->pme = max(load->pme,load->load[pos]);
                        pos++;
                    }
                }
                if (comm->bDynLoadBal && root->bLimited)
                {
                    load->sum_m *= dd->nc[dim];
                    load->flags |= (1<<d);
                }
            }
        }
    }

    if (DDMASTER(dd))
    {
        comm->nload     += dd_load_count(comm);
        comm->load_step += comm->cycl[ddCyclStep];
        comm->load_sum  += comm->load[0].sum;
        comm->load_max  += comm->load[0].max;
        if (comm->bDynLoadBal)
        {
            for(d=0; d<dd->ndim; d++)
            {
                if (comm->load[0].flags & (1<<d))
                {
                    comm->load_lim[d]++;
                }
            }
        }
        if (bSepPME)
        {
            comm->load_mdf += comm->load[0].mdf;
            comm->load_pme += comm->load[0].pme;
        }
    }

    wallcycle_stop(wcycle,ewcDDCOMMLOAD);

    if (debug)
    {
        fprintf(debug,"get_load_distribution finished\n");
    }
}
static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
{
    /* Return the relative performance loss on the total run time
     * due to the force calculation load imbalance.
     */
    if (dd->comm->nload > 0)
    {
        return
            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
            (dd->comm->load_step*dd->nnodes);
    }
    else
    {
        return 0;
    }
}
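/*
 * Editor's worked example (not part of the original source): with 4 PP
 * nodes, accumulated per-step maximum load load_max = 1.2, accumulated
 * total load load_sum = 4.0 and accumulated step time load_step = 1.25,
 * the loss is (1.2*4 - 4.0)/(1.25*4) = 0.16, i.e. 16% of the run time is
 * spent waiting for the most loaded node.
 */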
static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
{
    char  buf[STRLEN];
    int   npp,npme,nnodes,d,limp;
    float imbal,pme_f_ratio,lossf,lossp=0;
    bool  bLim;
    gmx_domdec_comm_t *comm;

    comm = dd->comm;
    if (DDMASTER(dd) && comm->nload > 0)
    {
        npp    = dd->nnodes;
        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
        nnodes = npp + npme;
        imbal  = comm->load_max*npp/comm->load_sum - 1;
        lossf  = dd_force_imb_perf_loss(dd);
        sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
        fprintf(fplog,"%s",buf);
        fprintf(stderr,"\n");
        fprintf(stderr,"%s",buf);
        sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
        fprintf(fplog,"%s",buf);
        fprintf(stderr,"%s",buf);
        bLim = FALSE;
        if (comm->bDynLoadBal)
        {
            sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
            for(d=0; d<dd->ndim; d++)
            {
                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
                sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
                if (limp >= 50)
                {
                    bLim = TRUE;
                }
            }
            sprintf(buf+strlen(buf),"\n");
            fprintf(fplog,"%s",buf);
            fprintf(stderr,"%s",buf);
        }
        if (npme > 0)
        {
            pme_f_ratio = comm->load_pme/comm->load_mdf;
            lossp = (comm->load_pme - comm->load_mdf)/comm->load_step;
            if (lossp <= 0)
            {
                lossp *= (float)npme/(float)nnodes;
            }
            else
            {
                lossp *= (float)npp/(float)nnodes;
            }
            sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
            fprintf(fplog,"%s",buf);
            fprintf(stderr,"%s",buf);
            sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
            fprintf(fplog,"%s",buf);
            fprintf(stderr,"%s",buf);
        }
        fprintf(fplog,"\n");
        fprintf(stderr,"\n");

        if (lossf >= DD_PERF_LOSS)
        {
            sprintf(buf,
                    "NOTE: %.1f %% performance was lost due to load imbalance\n"
                    "      in the domain decomposition.\n",lossf*100);
            if (!comm->bDynLoadBal)
            {
                sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
            }
            else if (bLim)
            {
                sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
            }
            fprintf(fplog,"%s\n",buf);
            fprintf(stderr,"%s\n",buf);
        }
        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
        {
            sprintf(buf,
                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
                    "      had %s work to do than the PP nodes.\n"
                    "      You might want to %s the number of PME nodes\n"
                    "      or %s the cut-off and the grid spacing.\n",
                    fabs(lossp*100),
                    (lossp < 0) ? "less" : "more",
                    (lossp < 0) ? "decrease" : "increase",
                    (lossp < 0) ? "decrease" : "increase");
            fprintf(fplog,"%s\n",buf);
            fprintf(stderr,"%s\n",buf);
        }
    }
}
static float dd_vol_min(gmx_domdec_t *dd)
{
    return dd->comm->load[0].cvol_min*dd->nnodes;
}

static bool dd_load_flags(gmx_domdec_t *dd)
{
    return dd->comm->load[0].flags;
}

static float dd_f_imbal(gmx_domdec_t *dd)
{
    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
}

static float dd_pme_f_ratio(gmx_domdec_t *dd)
{
    return dd->comm->load[0].pme/dd->comm->load[0].mdf;
}
static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
{
    int  flags,d;
    char buf[22];

    flags = dd_load_flags(dd);
    if (flags)
    {
        fprintf(fplog,
                "DD load balancing is limited by minimum cell size in dimension");
        for(d=0; d<dd->ndim; d++)
        {
            if (flags & (1<<d))
            {
                fprintf(fplog," %c",dim2char(dd->dim[d]));
            }
        }
        fprintf(fplog,"\n");
    }
    fprintf(fplog,"DD step %s",gmx_step_str(step,buf));
    if (dd->comm->bDynLoadBal)
    {
        fprintf(fplog," vol min/aver %5.3f%c",
                dd_vol_min(dd),flags ? '!' : ' ');
    }
    fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(fplog," pme mesh/force %5.3f",dd_pme_f_ratio(dd));
    }
    fprintf(fplog,"\n\n");
}
static void dd_print_load_verbose(gmx_domdec_t *dd)
{
    if (dd->comm->bDynLoadBal)
    {
        fprintf(stderr,"vol %4.2f%c ",
                dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
    }
    fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
    if (dd->comm->cycl_n[ddCyclPME])
    {
        fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
    }
}
#ifdef GMX_MPI
static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
                                   int dim_ind,ivec loc)
{
    MPI_Group g_row;
    MPI_Comm  c_row;
    int  dim,i,*rank;
    ivec loc_c;
    gmx_domdec_root_t *root;

    dim = dd->dim[dim_ind];
    copy_ivec(loc,loc_c);
    snew(rank,dd->nc[dim]);
    for(i=0; i<dd->nc[dim]; i++)
    {
        loc_c[dim] = i;
        rank[i] = dd_index(dd->nc,loc_c);
    }
    /* Here we create a new group, that does not necessarily
     * include our process. But MPI_Comm_create needs to be
     * called by all the processes in the original communicator.
     * Calling MPI_Group_free afterwards gives errors, so I assume
     * also the group is needed by all processes. (B. Hess)
     */
    MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
    MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
    if (c_row != MPI_COMM_NULL)
    {
        /* This process is part of the group */
        dd->comm->mpi_comm_load[dim_ind] = c_row;
        if (dd->comm->eDLB != edlbNO)
        {
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* This is the root process of this row */
                snew(dd->comm->root[dim_ind],1);
                root = dd->comm->root[dim_ind];
                snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
                snew(root->old_cell_f,dd->nc[dim]+1);
                snew(root->bCellMin,dd->nc[dim]);
                if (dim_ind > 0)
                {
                    snew(root->cell_f_max0,dd->nc[dim]);
                    snew(root->cell_f_min1,dd->nc[dim]);
                    snew(root->bound_min,dd->nc[dim]);
                    snew(root->bound_max,dd->nc[dim]);
                }
                snew(root->buf_ncd,dd->nc[dim]);
            }
            else
            {
                /* This is not a root process, we only need to receive cell_f */
                snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
            }
        }
        if (dd->ci[dim] == dd->master_ci[dim])
        {
            snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
        }
    }
    sfree(rank);
}
#endif
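/*
 * Editor's illustration (not part of the original source): the
 * group-then-communicator pattern above, in its minimal MPI form.
 * MPI_Comm_create is collective over the parent communicator and returns
 * MPI_COMM_NULL on ranks that are not in the group:
 */
#if 0   /* illustrative sketch only, never compiled */
static MPI_Comm sketch_row_comm(MPI_Comm parent,const int *ranks,int n)
{
    MPI_Group g_all,g_row;
    MPI_Comm  c_row;

    MPI_Comm_group(parent,&g_all);
    MPI_Group_incl(g_all,n,ranks,&g_row);
    MPI_Comm_create(parent,g_row,&c_row);

    return c_row;
}
#endif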
static void make_load_communicators(gmx_domdec_t *dd)
{
#ifdef GMX_MPI
    MPI_Group g_all;
    int  dim0,dim1,i,j;
    ivec loc;

    if (debug)
    {
        fprintf(debug,"Making load communicators\n");
    }

    MPI_Comm_group(dd->mpi_comm_all,&g_all);

    snew(dd->comm->load,dd->ndim);
    snew(dd->comm->mpi_comm_load,dd->ndim);

    clear_ivec(loc);
    make_load_communicator(dd,g_all,0,loc);
    if (dd->ndim > 1) {
        dim0 = dd->dim[0];
        for(i=0; i<dd->nc[dim0]; i++) {
            loc[dim0] = i;
            make_load_communicator(dd,g_all,1,loc);
        }
    }
    if (dd->ndim > 2) {
        dim0 = dd->dim[0];
        for(i=0; i<dd->nc[dim0]; i++) {
            loc[dim0] = i;
            dim1 = dd->dim[1];
            for(j=0; j<dd->nc[dim1]; j++) {
                loc[dim1] = j;
                make_load_communicator(dd,g_all,2,loc);
            }
        }
    }

    MPI_Group_free(&g_all);
#endif

    if (debug)
    {
        fprintf(debug,"Finished making load communicators\n");
    }
}
void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
{
    int  d,dim,i,j,m;
    ivec tmp,s;
    int  nzone,nzonep;
    ivec dd_zp[DD_MAXIZONE];
    gmx_domdec_zones_t *zones;
    gmx_domdec_ns_ranges_t *izone;
    int  shift_diff;

    for(d=0; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        copy_ivec(dd->ci,tmp);
        tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
        dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
        copy_ivec(dd->ci,tmp);
        tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
        dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
        if (debug)
        {
            fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
                    dd->rank,d,
                    dd->neighbor[d][0],
                    dd->neighbor[d][1]);
        }
    }

    if (DDMASTER(dd))
    {
        fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
                dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
    }
    if (fplog)
    {
        fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
                dd->ndim,
                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
                dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
    }
    switch (dd->ndim)
    {
    case 3:
        nzone  = dd_z3n;
        nzonep = dd_zp3n;
        for(i=0; i<nzonep; i++)
        {
            copy_ivec(dd_zp3[i],dd_zp[i]);
        }
        break;
    case 2:
        nzone  = dd_z2n;
        nzonep = dd_zp2n;
        for(i=0; i<nzonep; i++)
        {
            copy_ivec(dd_zp2[i],dd_zp[i]);
        }
        break;
    case 1:
        nzone  = dd_z1n;
        nzonep = dd_zp1n;
        for(i=0; i<nzonep; i++)
        {
            copy_ivec(dd_zp1[i],dd_zp[i]);
        }
        break;
    default:
        gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
        nzone  = 0;
        nzonep = 0;
    }

    zones = &dd->comm->zones;

    for(i=0; i<nzone; i++)
    {
        m = 0;
        clear_ivec(zones->shift[i]);
        for(d=0; d<dd->ndim; d++)
        {
            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
        }
    }

    zones->n = nzone;
    for(i=0; i<nzone; i++)
    {
        for(d=0; d<DIM; d++)
        {
            s[d] = dd->ci[d] - zones->shift[i][d];
            if (s[d] < 0)
            {
                s[d] += dd->nc[d];
            }
            else if (s[d] >= dd->nc[d])
            {
                s[d] -= dd->nc[d];
            }
        }
    }
    zones->nizone = nzonep;
    for(i=0; i<zones->nizone; i++)
    {
        if (dd_zp[i][0] != i)
        {
            gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
        }
        izone = &zones->izone[i];
        izone->j0 = dd_zp[i][1];
        izone->j1 = dd_zp[i][2];
        for(dim=0; dim<DIM; dim++)
        {
            if (dd->nc[dim] == 1)
            {
                /* All shifts should be allowed */
                izone->shift0[dim] = -1;
                izone->shift1[dim] = 1;
            }
            else
            {
                /*
                  izone->shift0[d] = 0;
                  izone->shift1[d] = 0;
                  for(j=izone->j0; j<izone->j1; j++) {
                      if (dd->shift[j][d] > dd->shift[i][d])
                          izone->shift0[d] = -1;
                      if (dd->shift[j][d] < dd->shift[i][d])
                          izone->shift1[d] = 1;
                  }
                */

                /* Assume the shifts are not more than 1 cell */
                izone->shift0[dim] = 1;
                izone->shift1[dim] = -1;
                for(j=izone->j0; j<izone->j1; j++)
                {
                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
                    if (shift_diff < izone->shift0[dim])
                    {
                        izone->shift0[dim] = shift_diff;
                    }
                    if (shift_diff > izone->shift1[dim])
                    {
                        izone->shift1[dim] = shift_diff;
                    }
                }
            }
        }
    }

    if (dd->comm->eDLB != edlbNO)
    {
        snew(dd->comm->root,dd->ndim);
    }

    if (dd->comm->bRecordLoad)
    {
        make_load_communicators(dd);
    }
}
static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int  i,rank,*buf;
    ivec periods;
#ifdef GMX_MPI
    MPI_Comm comm_cart;
#endif

    dd = cr->dd;
    comm = dd->comm;

#ifdef GMX_MPI
    if (comm->bCartesianPP)
    {
        /* Set up cartesian communication for the particle-particle part */
        if (fplog)
        {
            fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
                    dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
        }

        for(i=0; i<DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
                        &comm_cart);
        /* We overwrite the old communicator with the new cartesian one */
        cr->mpi_comm_mygroup = comm_cart;
    }

    dd->mpi_comm_all = cr->mpi_comm_mygroup;
    MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);

    if (comm->bCartesianPP_PME)
    {
        /* Since we want to use the original cartesian setup for sim,
         * and not the one after split, we need to make an index.
         */
        snew(comm->ddindex2ddnodeid,dd->nnodes);
        comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
        gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
        /* Get the rank of the DD master,
         * above we made sure that the master node is a PP node.
         */
        if (MASTER(cr))
        {
            rank = dd->rank;
        }
        else
        {
            rank = 0;
        }
        MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
    }
    else if (comm->bCartesianPP)
    {
        if (cr->npmenodes == 0)
        {
            /* The PP communicator is also
             * the communicator for this simulation
             */
            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
        }
        cr->nodeid = dd->rank;

        MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);

        /* We need to make an index to go from the coordinates
         * to the nodeid of this simulation.
         */
        snew(comm->ddindex2simnodeid,dd->nnodes);
        snew(buf,dd->nnodes);
        if (cr->duty & DUTY_PP)
        {
            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);

        /* Determine the master coordinates and rank.
         * The DD master should be the same node as the master of this sim.
         */
        for(i=0; i<dd->nnodes; i++)
        {
            if (comm->ddindex2simnodeid[i] == 0)
            {
                ddindex2xyz(dd->nc,i,dd->master_ci);
                MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
            }
        }
        if (debug)
        {
            fprintf(debug,"The master rank is %d\n",dd->masterrank);
        }
    }
    else
    {
        /* No Cartesian communicators */
        /* We use the rank in dd->comm->all as DD index */
        ddindex2xyz(dd->nc,dd->rank,dd->ci);
        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
        dd->masterrank = 0;
        clear_ivec(dd->master_ci);
    }
#endif

    if (fplog)
    {
        fprintf(fplog,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
    }
    if (debug)
    {
        fprintf(debug,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
    }
}
static void receive_ddindex2simnodeid(t_commrec *cr)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int  *buf;

    dd = cr->dd;
    comm = dd->comm;

#ifdef GMX_MPI
    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
    {
        snew(comm->ddindex2simnodeid,dd->nnodes);
        snew(buf,dd->nnodes);
        if (cr->duty & DUTY_PP)
        {
            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);
    }
#endif
}
static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
                                                     int ncg,int natoms)
{
    gmx_domdec_master_t *ma;
    int i;

    snew(ma,1);

    snew(ma->ncg,dd->nnodes);
    snew(ma->index,dd->nnodes+1);
    snew(ma->cg,ncg);
    snew(ma->nat,dd->nnodes);
    snew(ma->ibuf,dd->nnodes*2);
    snew(ma->cell_x,DIM);
    for(i=0; i<DIM; i++)
    {
        snew(ma->cell_x[i],dd->nc[i]+1);
    }

    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
    {
        ma->vbuf = NULL;
    }
    else
    {
        snew(ma->vbuf,natoms);
    }

    return ma;
}
static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
                               bool reorder)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int i, rank;
    bool bDiv[DIM];
    ivec periods;
    MPI_Comm comm_cart;

    dd   = cr->dd;
    comm = dd->comm;

    if (comm->bCartesianPP)
    {
        for (i = 1; i < DIM; i++)
        {
            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
        }
        if (bDiv[YY] || bDiv[ZZ])
        {
            comm->bCartesianPP_PME = TRUE;
            /* If we have 2D PME decomposition, which is always in x+y,
             * we stack the PME only nodes in z.
             * Otherwise we choose the direction that provides the thinnest slab
             * of PME only nodes as this will have the least effect
             * on the PP communication.
             * But for the PME communication the opposite might be better.
             */
            if (bDiv[ZZ] && (comm->npmenodes_minor > 1 ||
                             dd->nc[YY] > dd->nc[ZZ]))
            {
                comm->cartpmedim = ZZ;
            }
            else
            {
                comm->cartpmedim = YY;
            }
            comm->ntot[comm->cartpmedim]
                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
        }
        else if (fplog)
        {
            fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
            fprintf(fplog,
                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
        }
    }

    if (comm->bCartesianPP_PME)
    {
        if (fplog)
        {
            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
        }

        for (i = 0; i < DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
                        &comm_cart);

        MPI_Comm_rank(comm_cart, &rank);
        if (MASTERNODE(cr) && rank != 0)
        {
            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
        }

        /* With this assignment we lose the link to the original communicator,
         * which will usually be MPI_COMM_WORLD, unless we have multisim.
         */
        cr->mpi_comm_mysim = comm_cart;
        cr->sim_nodeid     = rank;

        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);

        if (fplog)
        {
            fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }

        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PP;
        }
        if (cr->npmenodes == 0 ||
            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PME;
        }

        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       dd_index(comm->ntot, dd->ci),
                       &cr->mpi_comm_mygroup);
    }
    else
    {
        switch (dd_node_order)
        {
        case ddnoPP_PME:
            if (fplog)
            {
                fprintf(fplog, "Order of the nodes: PP first, PME last\n");
            }
            break;
        case ddnoINTERLEAVE:
            /* Interleave the PP-only and PME-only nodes,
             * as on clusters with dual-core machines this will double
             * the communication bandwidth of the PME processes
             * and thus speed up the PP <-> PME and inter PME communication.
             */
            if (fplog)
            {
                fprintf(fplog, "Interleaving PP and PME nodes\n");
            }
            comm->pmenodes = dd_pmenodes(cr);
            break;
        default:
            gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
        }

        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
        {
            cr->duty = DUTY_PME;
        }
        else
        {
            cr->duty = DUTY_PP;
        }

        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       cr->nodeid,
                       &cr->mpi_comm_mygroup);
        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
    }

    if (fplog)
    {
        fprintf(fplog, "This is a %s only node\n\n",
                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
    }
}
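
/* Worked example for the PME-node stacking above (illustrative numbers):
 * with dd->nc = {4,4,2} (32 PP nodes) and cr->npmenodes = 8,
 * bDiv[YY] = (8*4 % 32 == 0) = TRUE and bDiv[ZZ] = (8*2 % 32 == 0) = FALSE,
 * so cartpmedim = YY and ntot[YY] += 8*4/32 = 1, giving a 4 x 5 x 2
 * Cartesian grid in which the extra y-slab holds the PME-only nodes.
 */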
void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    bool CartReorder;

    dd   = cr->dd;
    comm = dd->comm;

    copy_ivec(dd->nc, comm->ntot);

    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
    comm->bCartesianPP_PME = FALSE;

    /* Reorder the nodes by default. This might change the MPI ranks.
     * Real reordering is only supported on very few architectures,
     * Blue Gene is one of them.
     */
    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);

    if (cr->npmenodes > 0)
    {
        /* Split the communicator into a PP and PME part */
        split_communicator(fplog, cr, dd_node_order, CartReorder);
        if (comm->bCartesianPP_PME)
        {
            /* We (possibly) reordered the nodes in split_communicator,
             * so it is no longer required in make_pp_communicator.
             */
            CartReorder = FALSE;
        }
    }
    else
    {
        /* All nodes do PP and PME */
        /* We do not require separate communicators */
        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
    }

    if (cr->duty & DUTY_PP)
    {
        /* Copy or make a new PP communicator */
        make_pp_communicator(fplog, cr, CartReorder);
    }
    else
    {
        receive_ddindex2simnodeid(cr);
    }

    if (!(cr->duty & DUTY_PME))
    {
        /* Set up the communication to our PME node */
        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
        dd->pme_receive_vir_ener = receive_vir_ener(cr);
        if (debug)
        {
            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
                    dd->pme_nodeid, dd->pme_receive_vir_ener);
        }
    }
    else
    {
        dd->pme_nodeid = -1;
    }

    dd->ma = init_gmx_domdec_master_t(dd,
                                      comm->cgs_gl.nr,
                                      comm->cgs_gl.index[comm->cgs_gl.nr]);
}
static real *get_slb_frac(FILE *fplog, const char *dir, int nc,
                          const char *size_string)
{
    real *slb_frac, tot;
    int i, n;
    double dbl;

    slb_frac = NULL;
    if (nc > 1 && size_string != NULL)
    {
        if (fplog)
        {
            fprintf(fplog, "Using static load balancing for the %s direction\n",
                    dir);
        }
        snew(slb_frac, nc);
        tot = 0;
        for (i = 0; i < nc; i++)
        {
            dbl = 0;
            sscanf(size_string, "%lf%n", &dbl, &n);
            if (dbl == 0)
            {
                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
            }
            slb_frac[i]  = dbl;
            size_string += n;
            tot         += slb_frac[i];
        }
        /* Normalize */
        if (fplog)
        {
            fprintf(fplog, "Relative cell sizes:");
        }
        for (i = 0; i < nc; i++)
        {
            slb_frac[i] /= tot;
            if (fplog)
            {
                fprintf(fplog, " %5.3f", slb_frac[i]);
            }
        }
        if (fplog)
        {
            fprintf(fplog, "\n");
        }
    }

    return slb_frac;
}
static int multi_body_bondeds_count(gmx_mtop_t *mtop)
{
    int n, nmol, ftype;
    gmx_mtop_ilistloop_t iloop;
    t_ilist *il;

    n = 0;
    iloop = gmx_mtop_ilistloop_init(mtop);
    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
    {
        for (ftype = 0; ftype < F_NRE; ftype++)
        {
            if ((interaction_function[ftype].flags & IF_BOND) &&
                NRAL(ftype) > 2)
            {
                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
            }
        }
    }

    return n;
}
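
/* Each t_ilist entry stores one interaction type index followed by the
 * NRAL(ftype) atom indices of that interaction, so il[ftype].nr/(1+NRAL(ftype))
 * is the number of interactions of this type per molecule; multiplying by
 * nmol gives the system-wide count of multi-body (more than two atoms)
 * bonded interactions.
 */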
static int dd_nst_env(FILE *fplog, const char *env_var, int def)
{
    char *val;
    int nst;

    nst = def;
    val = getenv(env_var);
    if (val)
    {
        if (sscanf(val, "%d", &nst) <= 0)
        {
            nst = 1;
        }
        if (fplog)
        {
            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                    env_var, val, nst);
        }
    }

    return nst;
}
static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
{
    if (MASTER(cr))
    {
        fprintf(stderr, "\n%s\n", warn_string);
    }
    if (fplog)
    {
        fprintf(fplog, "\n%s\n", warn_string);
    }
}
static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
                                  t_inputrec *ir, FILE *fplog)
{
    if (ir->ePBC == epbcSCREW &&
        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
    {
        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
    }

    if (ir->ns_type == ensSIMPLE)
    {
        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
    }

    if (ir->nstlist == 0)
    {
        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
    }

    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
    {
        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
    }
}
static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
{
    int di, d;
    real r;

    r = ddbox->box_size[XX];
    for (di = 0; di < dd->ndim; di++)
    {
        d = dd->dim[di];
        /* Check using the initial average cell size */
        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
    }

    return r;
}
static int check_dlb_support(FILE *fplog, t_commrec *cr,
                             const char *dlb_opt, bool bRecordLoad,
                             unsigned long Flags, t_inputrec *ir)
{
    int eDLB = -1;
    char buf[STRLEN];

    switch (dlb_opt[0])
    {
    case 'a': eDLB = edlbAUTO; break;
    case 'n': eDLB = edlbNO;   break;
    case 'y': eDLB = edlbYES;  break;
    default:  gmx_incons("Unknown dlb_opt");
    }

    if (Flags & MD_RERUN)
    {
        return edlbNO;
    }

    if (!EI_DYNAMICS(ir->eI))
    {
        if (eDLB == edlbYES)
        {
            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
            dd_warning(cr, fplog, buf);
        }
        return edlbNO;
    }

    if (!bRecordLoad)
    {
        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
        return edlbNO;
    }

    if (Flags & MD_REPRODUCIBLE)
    {
        switch (eDLB)
        {
        case edlbNO:
            break;
        case edlbAUTO:
            dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
            eDLB = edlbNO;
            break;
        case edlbYES:
            dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
            break;
        default:
            gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
        }
    }

    return eDLB;
}
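
/* Summary of the checks above: DLB is disabled for reruns, for non-dynamical
 * integrators, and when cycle counting is unavailable; with -reprod it is
 * downgraded from "auto" to "no", and merely warned about when forced on
 * with "yes", since dynamic re-partitioning breaks binary reproducibility.
 */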
static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
{
    int dim;

    dd->ndim = 0;
    if (getenv("GMX_DD_ORDER_ZYX"))
    {
        /* Decomposition order z,y,x */
        if (fplog)
        {
            fprintf(fplog, "Using domain decomposition order z, y, x\n");
        }
        for (dim = DIM-1; dim >= 0; dim--)
        {
            if (dd->nc[dim] > 1)
            {
                dd->dim[dd->ndim++] = dim;
            }
        }
    }
    else
    {
        /* Decomposition order x,y,z */
        for (dim = 0; dim < DIM; dim++)
        {
            if (dd->nc[dim] > 1)
            {
                dd->dim[dd->ndim++] = dim;
            }
        }
    }
}
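
/* Example: dd->nc = {1,4,2} gives ndim = 2 with dim = {YY,ZZ} in the default
 * x,y,z order, or dim = {ZZ,YY} when GMX_DD_ORDER_ZYX is set; dimensions
 * with a single cell never appear in dd->dim.
 */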
static gmx_domdec_comm_t *init_dd_comm()
{
    gmx_domdec_comm_t *comm;
    int i;

    snew(comm, 1);
    snew(comm->cggl_flag, DIM*2);
    snew(comm->cgcm_state, DIM*2);
    for (i = 0; i < DIM*2; i++)
    {
        comm->cggl_flag_nalloc[i]  = 0;
        comm->cgcm_state_nalloc[i] = 0;
    }

    comm->nalloc_int = 0;
    comm->buf_int    = NULL;

    vec_rvec_init(&comm->vbuf);

    comm->n_load_have    = 0;
    comm->n_load_collect = 0;

    for (i = 0; i < ddnatNR-ddnatZONE; i++)
    {
        comm->sum_nat[i] = 0;
    }
    comm->load_step = 0;
    clear_ivec(comm->load_lim);

    return comm;
}
gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
                                        unsigned long Flags,
                                        ivec nc,
                                        real comm_distance_min, real rconstr,
                                        const char *dlb_opt, real dlb_scale,
                                        const char *sizex, const char *sizey, const char *sizez,
                                        gmx_mtop_t *mtop, t_inputrec *ir,
                                        matrix box, rvec *x,
                                        gmx_ddbox_t *ddbox,
                                        int *npme_major, int *npme_minor)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int recload;
    int d, i, j;
    real r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
    bool bC;
    char buf[STRLEN];

    if (fplog)
    {
        fprintf(fplog,
                "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
    }

    snew(dd, 1);
    dd->comm = init_dd_comm();
    comm = dd->comm;
    snew(comm->cggl_flag, DIM*2);
    snew(comm->cgcm_state, DIM*2);

    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
    dd->bScrewPBC = (ir->ePBC == epbcSCREW);

    dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
    comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
    recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
    comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
    comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
    comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
    comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);

    dd->pme_recv_f_alloc = 0;
    dd->pme_recv_f_buf   = NULL;

    if (dd->bSendRecv2 && fplog)
    {
        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
    }
    if (comm->eFlop)
    {
        if (fplog)
        {
            fprintf(fplog, "Will load balance based on FLOP count\n");
        }
        if (comm->eFlop > 1)
        {
            srand(1+cr->nodeid);
        }
        comm->bRecordLoad = TRUE;
    }
    else
    {
        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
    }

    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);

    comm->bDynLoadBal = (comm->eDLB == edlbYES);
    if (fplog)
    {
        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
    }
    dd->bGridJump = comm->bDynLoadBal;

    if (comm->nstSortCG)
    {
        if (fplog)
        {
            if (comm->nstSortCG == 1)
            {
                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
            }
            else
            {
                fprintf(fplog, "Will sort the charge groups every %d steps\n",
                        comm->nstSortCG);
            }
        }
    }
    else
    {
        if (fplog)
        {
            fprintf(fplog, "Will not sort the charge groups\n");
        }
    }

    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
    if (comm->bInterCGBondeds)
    {
        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
    }
    else
    {
        comm->bInterCGMultiBody = FALSE;
    }

    dd->bInterCGcons = inter_charge_group_constraints(mtop);

    if (ir->rlistlong == 0)
    {
        /* Set the cut-off to some very large value,
         * so we don't need if statements everywhere in the code.
         * We use sqrt, since the cut-off is squared in some places.
         */
        comm->cutoff = GMX_CUTOFF_INF;
    }
    else
    {
        comm->cutoff = ir->rlistlong;
    }
    comm->cutoff_mbody = 0;

    comm->cellsize_limit = 0;
    comm->bBondComm      = FALSE;

    if (comm->bInterCGBondeds)
    {
        if (comm_distance_min > 0)
        {
            comm->cutoff_mbody = comm_distance_min;
            if (Flags & MD_DDBONDCOMM)
            {
                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
            }
            else
            {
                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
            }
            r_bonded_limit = comm->cutoff_mbody;
        }
        else if (ir->bPeriodicMols)
        {
            /* Cannot easily determine the required cut-off */
            dd_warning(cr, fplog, "NOTE: Periodic molecules: can not easily determine the required minimum bonded cut-off, using half the non-bonded cut-off\n");
            comm->cutoff_mbody = comm->cutoff/2;
            r_bonded_limit     = comm->cutoff_mbody;
        }
        else
        {
            dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
                                  Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
            gmx_bcast(sizeof(r_2b), &r_2b, cr);
            gmx_bcast(sizeof(r_mb), &r_mb, cr);

            /* We use an initial margin of 10% for the minimum cell size,
             * except when we are just below the non-bonded cut-off.
             */
            if (Flags & MD_DDBONDCOMM)
            {
                if (max(r_2b, r_mb) > comm->cutoff)
                {
                    r_bonded        = max(r_2b, r_mb);
                    r_bonded_limit  = 1.1*r_bonded;
                    comm->bBondComm = TRUE;
                }
                else
                {
                    r_bonded       = r_mb;
                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
                }
                /* We determine cutoff_mbody later */
            }
            else
            {
                /* No special bonded communication,
                 * simply increase the DD cut-off.
                 */
                r_bonded_limit     = 1.1*max(r_2b, r_mb);
                comm->cutoff_mbody = r_bonded_limit;
                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
            }
        }
        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
        if (fplog)
        {
            fprintf(fplog,
                    "Minimum cell size due to bonded interactions: %.3f nm\n",
                    comm->cellsize_limit);
        }
    }
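
    /* Worked example of the 10% margin logic above (illustrative numbers):
     * with r_2b = 1.0 nm, r_mb = 1.3 nm and a 1.2 nm non-bonded cut-off under
     * -ddbondcomm, max(r_2b,r_mb) exceeds the cut-off, so bBondComm is
     * enabled and r_bonded_limit = 1.1*1.3 = 1.43 nm becomes the minimum
     * cell size due to bonded interactions.
     */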
    if (dd->bInterCGcons && rconstr <= 0)
    {
        /* There is a cell size limit due to the constraints (P-LINCS) */
        rconstr = constr_r_max(fplog, mtop, ir);
        if (fplog)
        {
            fprintf(fplog,
                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
                    rconstr);
            if (rconstr > comm->cellsize_limit)
            {
                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
            }
        }
    }
    else if (rconstr > 0 && fplog)
    {
        /* Here we do not check for dd->bInterCGcons,
         * because one can also set a cell size limit for virtual sites only
         * and at this point we don't know yet if there are intercg v-sites.
         */
        fprintf(fplog,
                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
                rconstr);
    }
    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);

    comm->cgs_gl = gmx_mtop_global_cgs(mtop);

    if (nc[XX] > 0)
    {
        copy_ivec(nc, dd->nc);
        set_dd_dim(fplog, dd);
        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);

        if (cr->npmenodes == -1)
        {
            cr->npmenodes = 0;
        }
        acs = average_cellsize_min(dd, ddbox);
        if (acs < comm->cellsize_limit)
        {
            if (fplog)
            {
                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
            }
            gmx_fatal_collective(FARGS, MASTER(cr),
                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
                                 acs, comm->cellsize_limit);
        }
    }
    else
    {
        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);

        /* We need to choose the optimal DD grid and possibly PME nodes */
        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
                               comm->eDLB != edlbNO, dlb_scale,
                               comm->cellsize_limit, comm->cutoff,
                               comm->bInterCGBondeds, comm->bInterCGMultiBody);

        if (dd->nc[XX] == 0)
        {
            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
            sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
                    !bC ? "-rdd" : "-rcon",
                    comm->eDLB != edlbNO ? " or -dds" : "",
                    bC ? " or your LINCS settings" : "");
            gmx_fatal_collective(FARGS, MASTER(cr),
                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
                                 "%s\n"
                                 "Look in the log file for details on the domain decomposition",
                                 cr->nnodes - cr->npmenodes, limit, buf);
        }
        set_dd_dim(fplog, dd);
    }

    if (fplog)
    {
        fprintf(fplog,
                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
    }

    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
    if (cr->nnodes - dd->nnodes != cr->npmenodes)
    {
        gmx_fatal(FARGS, "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
                  dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
    }
    if (cr->npmenodes > dd->nnodes)
    {
        gmx_fatal(FARGS, "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
    }
    if (cr->npmenodes > 0)
    {
        comm->npmenodes = cr->npmenodes;
    }
    else
    {
        comm->npmenodes = dd->nnodes;
    }

    if (EEL_PME(ir->coulombtype))
    {
        if (comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
            getenv("GMX_PMEONEDD") == NULL)
        {
            comm->npmedecompdim   = 2;
            comm->npmenodes_major = dd->nc[XX];
            comm->npmenodes_minor = comm->npmenodes/comm->npmenodes_major;
        }
        else
        {
            comm->npmedecompdim   = 1;
            comm->npmenodes_major = comm->npmenodes;
            comm->npmenodes_minor = comm->npmenodes/comm->npmenodes_major;
        }
        if (fplog)
        {
            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
                    comm->npmenodes_major, comm->npmenodes_minor, 1);
        }
    }
    else
    {
        comm->npmedecompdim   = 0;
        comm->npmenodes_major = 0;
        comm->npmenodes_minor = 0;
    }

    /* Technically we don't need both of these, but it simplifies code not
     * having to recalculate it.
     */
    *npme_major = comm->npmenodes_major;
    *npme_minor = comm->npmenodes_minor;

    snew(comm->slb_frac, DIM);
    if (comm->eDLB == edlbNO)
    {
        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
    }

    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
    {
        if (comm->bBondComm || comm->eDLB != edlbNO)
        {
            /* Set the bonded communication distance to halfway
             * the minimum and the maximum,
             * since the extra communication cost is nearly zero.
             */
            acs = average_cellsize_min(dd, ddbox);
            comm->cutoff_mbody = 0.5*(r_bonded + acs);
            if (comm->eDLB != edlbNO)
            {
                /* Check if this does not limit the scaling */
                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
            }
            if (!comm->bBondComm)
            {
                /* Without bBondComm do not go beyond the n.b. cut-off */
                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
                if (comm->cellsize_limit >= comm->cutoff)
                {
                    /* We don't lose much efficiency
                     * when increasing it to the n.b. cut-off.
                     * It can even be slightly faster, because we need
                     * fewer checks for the communication setup.
                     */
                    comm->cutoff_mbody = comm->cutoff;
                }
            }
            /* Check if we did not end up below our original limit */
            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);

            if (comm->cutoff_mbody > comm->cellsize_limit)
            {
                comm->cellsize_limit = comm->cutoff_mbody;
            }
        }
        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
    }
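
    /* Worked example of the halfway rule above (illustrative numbers): with
     * r_bonded = 1.0 nm and an average minimum cell size acs = 2.0 nm, the
     * bonded communication distance becomes 0.5*(1.0 + 2.0) = 1.5 nm; it is
     * then clipped by dlb_scale*acs (e.g. 0.8*2.0 = 1.6 nm, no change here)
     * and, without bBondComm, by the non-bonded cut-off.
     */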
    if (debug)
    {
        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
                "cellsize limit %f\n",
                comm->bBondComm, comm->cellsize_limit);
    }

    if (MASTER(cr))
    {
        check_dd_restrictions(cr, dd, ir, fplog);
    }

    comm->partition_step = INT_MIN;
    dd->ddp_count = 0;

    clear_dd_cycle_counts(dd);

    return dd;
}
static void set_dlb_limits(gmx_domdec_t *dd)
{
    int d;

    for (d = 0; d < dd->ndim; d++)
    {
        dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
        dd->comm->cellsize_min[dd->dim[d]] =
            dd->comm->cellsize_min_dlb[dd->dim[d]];
    }
}
static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    real cellsize_min;
    int d, nc, i;
    char buf[STRLEN];

    dd   = cr->dd;
    comm = dd->comm;

    if (fplog)
    {
        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
    }

    cellsize_min = comm->cellsize_min[dd->dim[0]];
    for (d = 1; d < dd->ndim; d++)
    {
        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
    }

    if (cellsize_min < comm->cellsize_limit*1.05)
    {
        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
        /* Change DLB from "auto" to "no". */
        comm->eDLB = edlbNO;
        return;
    }

    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
    comm->bDynLoadBal = TRUE;
    dd->bGridJump     = TRUE;

    set_dlb_limits(dd);

    /* We can set the required cell size info here,
     * so we do not need to communicate this.
     * The grid is completely uniform.
     */
    for (d = 0; d < dd->ndim; d++)
    {
        comm->load[d].sum_m = comm->load[d].sum;

        nc = dd->nc[dd->dim[d]];
        for (i = 0; i < nc; i++)
        {
            comm->root[d]->cell_f[i] = i/(real)nc;
            if (d > 0)
            {
                comm->root[d]->cell_f_max0[i] = i/(real)nc;
                comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
            }
        }
        comm->root[d]->cell_f[nc] = 1.0;
    }
}
static char *init_bLocalCG(gmx_mtop_t *mtop)
{
    int ncg, cg;
    char *bLocalCG;

    ncg = ncg_mtop(mtop);
    snew(bLocalCG, ncg);
    for (cg = 0; cg < ncg; cg++)
    {
        bLocalCG[cg] = FALSE;
    }

    return bLocalCG;
}
void dd_init_bondeds(FILE *fplog,
                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
                     gmx_vsite_t *vsite, gmx_constr_t constr,
                     t_inputrec *ir, bool bBCheck, cginfo_mb_t *cginfo_mb)
{
    gmx_domdec_comm_t *comm;

    dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);

    comm = dd->comm;

    if (comm->bBondComm)
    {
        /* Communicate atoms beyond the cut-off for bonded interactions */
        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);

        comm->bLocalCG = init_bLocalCG(mtop);
    }
    else
    {
        /* Only communicate atoms based on cut-off */
        comm->cglink   = NULL;
        comm->bLocalCG = NULL;
    }
}
static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
                              t_inputrec *ir,
                              bool bDynLoadBal, real dlb_scale,
                              gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int d;
    ivec np;
    real limit, shrink;
    char buf[64];

    if (fplog == NULL)
    {
        return;
    }

    comm = dd->comm;

    if (bDynLoadBal)
    {
        fprintf(fplog, "The maximum number of communication pulses is:");
        for (d = 0; d < dd->ndim; d++)
        {
            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
        }
        fprintf(fplog, "\n");
        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
        for (d = 0; d < DIM; d++)
        {
            if (dd->nc[d] > 1)
            {
                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
                {
                    shrink = 0;
                }
                else
                {
                    shrink =
                        comm->cellsize_min_dlb[d]/
                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
                }
                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
            }
        }
        fprintf(fplog, "\n");
    }
    else
    {
        set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
        fprintf(fplog, "The initial number of communication pulses is:");
        for (d = 0; d < dd->ndim; d++)
        {
            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
        }
        fprintf(fplog, "\n");
        fprintf(fplog, "The initial domain decomposition cell size is:");
        for (d = 0; d < DIM; d++)
        {
            if (dd->nc[d] > 1)
            {
                fprintf(fplog, " %c %.2f nm",
                        dim2char(d), dd->comm->cellsize_min[d]);
            }
        }
        fprintf(fplog, "\n\n");
    }

    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
    {
        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
        fprintf(fplog, "%40s %-7s %6.3f nm\n",
                "non-bonded interactions", "", comm->cutoff);

        if (bDynLoadBal)
        {
            limit = dd->comm->cellsize_limit;
        }
        else
        {
            if (dynamic_dd_box(ddbox, ir))
            {
                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
            }
            limit = dd->comm->cellsize_min[XX];
            for (d = 1; d < DIM; d++)
            {
                limit = min(limit, dd->comm->cellsize_min[d]);
            }
        }

        if (comm->bInterCGBondeds)
        {
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    "two-body bonded interactions", "(-rdd)",
                    max(comm->cutoff, comm->cutoff_mbody));
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    "multi-body bonded interactions", "(-rdd)",
                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
        }
        if (dd->vsite_comm)
        {
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    "virtual site constructions", "(-rcon)", limit);
        }
        if (dd->constraint_comm)
        {
            sprintf(buf, "atoms separated by up to %d constraints",
                    1+ir->nProjOrder);
            fprintf(fplog, "%40s %-7s %6.3f nm\n",
                    buf, "(-rcon)", limit);
        }
        fprintf(fplog, "\n");
    }
}
void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
                       t_inputrec *ir, t_forcerec *fr,
                       gmx_ddbox_t *ddbox)
{
    gmx_domdec_comm_t *comm;
    int d, dim, npulse, npulse_d_max, npulse_d;
    bool bNoCutOff;
    real vol_frac;
    int natoms_tot;

    comm = dd->comm;

    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);

    if (EEL_PME(ir->coulombtype))
    {
        init_ddpme(dd, &comm->ddpme[0], 0, comm->npmenodes_major);
        if (comm->npmedecompdim >= 2)
        {
            init_ddpme(dd, &comm->ddpme[1], 1,
                       comm->npmenodes/comm->npmenodes_major);
        }
    }
    else
    {
        comm->npmenodes = 0;
        if (dd->pme_nodeid >= 0)
        {
            gmx_fatal(FARGS, "Can not have separate PME nodes without PME electrostatics");
        }
    }

    /* If each molecule is a single charge group
     * or we use domain decomposition for each periodic dimension,
     * we do not need to take pbc into account for the bonded interactions.
     */
    if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
        (dd->nc[XX] > 1 && dd->nc[YY] > 1 && (dd->nc[ZZ] > 1 || fr->ePBC == epbcXY)))
    {
        fr->bMolPBC = FALSE;
    }

    if (debug)
    {
        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
    }
    if (comm->eDLB != edlbNO)
    {
        /* Determine the maximum number of comm. pulses in one dimension */

        comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);

        /* Determine the maximum required number of grid pulses */
        if (comm->cellsize_limit >= comm->cutoff)
        {
            /* Only a single pulse is required */
            npulse = 1;
        }
        else if (!bNoCutOff && comm->cellsize_limit > 0)
        {
            /* We round down slightly here to avoid overhead due to the latency
             * of extra communication calls when the cut-off
             * would be only slightly longer than the cell size.
             * Later cellsize_limit is redetermined,
             * so we can not miss interactions due to this rounding.
             */
            npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
        }
        else
        {
            /* There is no cell size limit */
            npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
        }

        if (!bNoCutOff && npulse > 1)
        {
            /* See if we can do with less pulses, based on dlb_scale */
            npulse_d_max = 0;
            for (d = 0; d < dd->ndim; d++)
            {
                dim = dd->dim[d];
                npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
                                 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
                npulse_d_max = max(npulse_d_max, npulse_d);
            }
            npulse = min(npulse, npulse_d_max);
        }

        /* This env var can override npulse */
        d = dd_nst_env(fplog, "GMX_DD_NPULSE", 0);
        if (d > 0)
        {
            npulse = d;
        }

        comm->maxpulse = 1;
        comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
        for (d = 0; d < dd->ndim; d++)
        {
            comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
            comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
            snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
            comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
            if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
            {
                comm->bVacDLBNoLimit = FALSE;
            }
        }

        /* cellsize_limit is set for LINCS in init_domain_decomposition */
        if (!comm->bVacDLBNoLimit)
        {
            comm->cellsize_limit = max(comm->cellsize_limit,
                                       comm->cutoff/comm->maxpulse);
        }
        comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
        /* Set the minimum cell size for each DD dimension */
        for (d = 0; d < dd->ndim; d++)
        {
            if (comm->bVacDLBNoLimit ||
                comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
            {
                comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
            }
            else
            {
                comm->cellsize_min_dlb[dd->dim[d]] =
                    comm->cutoff/comm->cd[d].np_dlb;
            }
        }
        if (comm->cutoff_mbody <= 0)
        {
            comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
        }
        if (comm->bDynLoadBal)
        {
            set_dlb_limits(dd);
        }
    }

    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
    if (comm->eDLB == edlbAUTO)
    {
        if (fplog)
        {
            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
        }
        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
    }

    vol_frac = 1/(real)dd->nnodes + comm_box_frac(dd->nc, comm->cutoff, ddbox);
    if (debug)
    {
        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
    }
    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];

    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
}
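
/* Sizing note: ga2la_init() is given the global atom count and an estimate
 * of the local+communicated atom count, vol_frac*natoms_tot, where vol_frac
 * is the home cell volume fraction plus the estimated communication volume
 * from comm_box_frac(); this presumably sizes the global-to-local lookup so
 * that frequent rehashing is avoided.
 */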
static void merge_cg_buffers(int ncell,
                             gmx_domdec_comm_dim_t *cd, int pulse,
                             int *ncg_cell,
                             int *index_gl, int *recv_i,
                             rvec *cg_cm, rvec *recv_vr,
                             int *cgindex,
                             cginfo_mb_t *cginfo_mb, int *cginfo)
{
    gmx_domdec_ind_t *ind, *ind_p;
    int p, cell, c, cg, cg0, cg1, cg_gl, nat;
    int shift, shift_at;

    ind = &cd->ind[pulse];

    /* First correct the already stored data */
    shift = ind->nrecv[ncell];
    for (cell = ncell-1; cell >= 0; cell--)
    {
        shift -= ind->nrecv[cell];
        if (shift > 0)
        {
            /* Move the cg's present from previous grid pulses */
            cg0 = ncg_cell[ncell+cell];
            cg1 = ncg_cell[ncell+cell+1];
            cgindex[cg1+shift] = cgindex[cg1];
            for (cg = cg1-1; cg >= cg0; cg--)
            {
                index_gl[cg+shift] = index_gl[cg];
                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
                cgindex[cg+shift] = cgindex[cg];
                cginfo[cg+shift]  = cginfo[cg];
            }
            /* Correct the already stored send indices for the shift */
            for (p = 1; p <= pulse; p++)
            {
                ind_p = &cd->ind[p];
                cg0 = 0;
                for (c = 0; c < cell; c++)
                {
                    cg0 += ind_p->nsend[c];
                }
                cg1 = cg0 + ind_p->nsend[cell];
                for (cg = cg0; cg < cg1; cg++)
                {
                    ind_p->index[cg] += shift;
                }
            }
        }
    }

    /* Merge in the communicated buffers */
    shift    = 0;
    shift_at = 0;
    cg0      = 0;
    for (cell = 0; cell < ncell; cell++)
    {
        cg1 = ncg_cell[ncell+cell+1] + shift;
        if (shift_at > 0)
        {
            /* Correct the old cg indices */
            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
            {
                cgindex[cg+1] += shift_at;
            }
        }
        for (cg = 0; cg < ind->nrecv[cell]; cg++)
        {
            /* Copy this charge group from the buffer */
            index_gl[cg1] = recv_i[cg0];
            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
            /* Add it to the cgindex */
            cg_gl = index_gl[cg1];
            cginfo[cg1] = ddcginfo(cginfo_mb, cg_gl);
            nat = GET_CGINFO_NATOMS(cginfo[cg1]);
            cgindex[cg1+1] = cgindex[cg1] + nat;
            cg0++;
            cg1++;
            shift_at += nat;
        }
        shift += ind->nrecv[cell];
        ncg_cell[ncell+cell+1] = cg1;
    }
}
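
/* The two phases above: first, charge groups already received in earlier
 * pulses are shifted towards the end of each zone range to open a gap equal
 * to the newly received count, and previously stored send indices are
 * corrected by the same shift; then the freshly communicated buffers are
 * copied into the gaps and the zone ranges are updated.
 */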
static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
                               int nzone, int cg0, const int *cgindex)
{
    int cg, zone, p;

    /* Store the atom block boundaries for easy copying of communication
     * buffers
     */
    cg = cg0;
    for (zone = 0; zone < nzone; zone++)
    {
        for (p = 0; p < cd->np; p++)
        {
            cd->ind[p].cell2at0[zone] = cgindex[cg];
            cg += cd->ind[p].nrecv[zone];
            cd->ind[p].cell2at1[zone] = cgindex[cg];
        }
    }
}
static bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
{
    int i;
    bool bMiss;

    bMiss = FALSE;
    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
    {
        if (!bLocalCG[link->a[i]])
        {
            bMiss = TRUE;
        }
    }

    return bMiss;
}
static void setup_dd_communication(gmx_domdec_t *dd,
                                   matrix box, gmx_ddbox_t *ddbox, t_forcerec *fr)
{
    int dim_ind, dim, dim0, dim1 = -1, dim2 = -1, dimd, p, nat_tot;
    int nzone, nzone_send, zone, zonei, cg0, cg1;
    int c, i, j, cg, cg_gl, nrcg;
    int *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
    gmx_domdec_comm_t *comm;
    gmx_domdec_zones_t *zones;
    gmx_domdec_comm_dim_t *cd;
    gmx_domdec_ind_t *ind;
    cginfo_mb_t *cginfo_mb;
    bool bBondComm, bDist2B, bDistMB, bDistMB_pulse, bDistBonded, bScrew;
    real r_mb, r_comm2, r_scomm2, r_bcomm2, r, r_0, r_1, r2, rb2, r2inc, inv_ncg, tric_sh;
    real corner[DIM][4], corner_round_0 = 0, corner_round_1[4];
    real bcorner[DIM], bcorner_round_1 = 0;
    ivec tric_dist;
    rvec rn, rb;
    rvec *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
    real skew_fac2_d, skew_fac_01;
    real sf2_round[DIM];
    int nsend, nat;

    if (debug)
    {
        fprintf(debug, "Setting up DD communication\n");
    }

    comm  = dd->comm;
    cg_cm = fr->cg_cm;

    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];

        /* Check if we need to use triclinic distances */
        tric_dist[dim_ind] = 0;
        for (i = 0; i <= dim_ind; i++)
        {
            if (ddbox->tric_dir[dd->dim[i]])
            {
                tric_dist[dim_ind] = 1;
            }
        }
    }

    bBondComm = comm->bBondComm;

    /* Do we need to determine extra distances for multi-body bondeds? */
    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);

    /* Do we need to determine extra distances for only two-body bondeds? */
    bDist2B = (bBondComm && !bDistMB);

    r_comm2  = sqr(comm->cutoff);
    r_bcomm2 = sqr(comm->cutoff_mbody);

    if (debug)
    {
        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
    }

    zones = &comm->zones;

    dim0 = dd->dim[0];
    /* The first dimension is equal for all cells */
    corner[0][0] = comm->cell_x0[dim0];
    if (bDistMB)
    {
        bcorner[0] = corner[0][0];
    }
    if (dd->ndim >= 2)
    {
        dim1 = dd->dim[1];
        /* This cell row is only seen from the first row */
        corner[1][0] = comm->cell_x0[dim1];
        /* All rows can see this row */
        corner[1][1] = comm->cell_x0[dim1];
        if (dd->bGridJump)
        {
            corner[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
            if (bDistMB)
            {
                /* For the multi-body distance we need the maximum */
                bcorner[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
            }
        }
        /* Set the upper-right corner for rounding */
        corner_round_0 = comm->cell_x1[dim0];

        if (dd->ndim >= 3)
        {
            dim2 = dd->dim[2];
            for (j = 0; j < 4; j++)
            {
                corner[2][j] = comm->cell_x0[dim2];
            }
            if (dd->bGridJump)
            {
                /* Use the maximum of the i-cells that see a j-cell */
                for (i = 0; i < zones->nizone; i++)
                {
                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
                    {
                        if (j >= 4)
                        {
                            corner[2][j-4] =
                                max(corner[2][j-4],
                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
                        }
                    }
                }
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    bcorner[2] = comm->cell_x0[dim2];
                    for (i = 0; i < 2; i++)
                    {
                        for (j = 0; j < 2; j++)
                        {
                            bcorner[2] = max(bcorner[2],
                                             comm->zone_d2[i][j].p1_0);
                        }
                    }
                }
            }

            /* Set the upper-right corner for rounding */
            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
             * Only cell (0,0,0) can see cell 7 (1,1,1)
             */
            corner_round_1[0] = comm->cell_x1[dim1];
            corner_round_1[3] = comm->cell_x1[dim1];
            if (dd->bGridJump)
            {
                corner_round_1[0] = max(comm->cell_x1[dim1],
                                        comm->zone_d1[1].mch1);
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    bcorner_round_1 = max(comm->cell_x1[dim1],
                                          comm->zone_d1[1].p1_1);
                }
            }
        }
    }
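
    /* Summary of the corner bookkeeping above: corner[dim_ind][zone] holds
     * the lower cell boundary against which communication distances are
     * measured for each zone, bcorner[] the corresponding (maximal)
     * boundaries for multi-body bondeds, and corner_round_0/corner_round_1
     * the upper boundaries used for the rounding optimization between
     * decomposition dimensions.
     */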
    /* Triclinic stuff */
    normal = ddbox->normal;
    skew_fac_01 = 0;
    if (dd->ndim >= 2)
    {
        v_0 = ddbox->v[dim0];
        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
        {
            /* Determine the coupling coefficient for the distances
             * to the cell planes along dim0 and dim1 through dim2.
             * This is required for correct rounding.
             */
            skew_fac_01 =
                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
            if (debug)
            {
                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
            }
        }
    }
    if (dd->ndim >= 3)
    {
        v_1 = ddbox->v[dim1];
    }

    zone_cg_range = zones->cg_range;
    index_gl  = dd->index_gl;
    cgindex   = dd->cgindex;
    cginfo_mb = fr->cginfo_mb;

    zone_cg_range[0]   = 0;
    zone_cg_range[1]   = dd->ncg_home;
    comm->zone_ncg1[0] = dd->ncg_home;
    pos_cg             = dd->ncg_home;

    nat_tot = dd->nat_home;
    nzone = 1;
    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
    {
        dim = dd->dim[dim_ind];
        cd  = &comm->cd[dim_ind];

        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
        {
            /* No pbc in this dimension, the first node should not comm. */
            nzone_send = 0;
        }
        else
        {
            nzone_send = nzone;
        }

        bScrew = (dd->bScrewPBC && dim == XX);

        v_d = ddbox->v[dim];
        skew_fac2_d = sqr(ddbox->skew_fac[dim]);

        cd->bInPlace = TRUE;
        for (p = 0; p < cd->np; p++)
        {
            /* Only atoms communicated in the first pulse are used
             * for multi-body bonded interactions or for bBondComm.
             */
            bDistBonded   = ((bDistMB || bDist2B) && p == 0);
            bDistMB_pulse = (bDistMB && bDistBonded);

            ind   = &cd->ind[p];
            nsend = 0;
            nat   = 0;
            for (zone = 0; zone < nzone_send; zone++)
            {
                if (tric_dist[dim_ind] && dim_ind > 0)
                {
                    /* Determine slightly more optimized skew_fac's
                     * for the communication.
                     * This reduces the number of communicated atoms
                     * by about 10% for 3D DD of rhombic dodecahedra.
                     */
                    for (dimd = 0; dimd < dim; dimd++)
                    {
                        sf2_round[dimd] = 1;
                        if (ddbox->tric_dir[dimd])
                        {
                            for (i = dd->dim[dimd]+1; i < DIM; i++)
                            {
                                /* If we are shifted in dimension i
                                 * and the cell plane is tilted forward
                                 * in dimension i, skip this coupling.
                                 */
                                if (!(zones->shift[nzone+zone][i] &&
                                      ddbox->v[dimd][i][dimd] >= 0))
                                {
                                    sf2_round[dimd] +=
                                        sqr(ddbox->v[dimd][i][dimd]);
                                }
                            }
                            sf2_round[dimd] = 1/sf2_round[dimd];
                        }
                    }
                }

                zonei = zone_perm[dim_ind][zone];
                if (p == 0)
                {
                    /* Here we permute the zones to obtain a convenient order
                     * for neighbor searching
                     */
                    cg0 = zone_cg_range[zonei];
                    cg1 = zone_cg_range[zonei+1];
                }
                else
                {
                    /* Look only at the cg's received in the previous grid
                     * pulse
                     */
                    cg1 = zone_cg_range[nzone+zone+1];
                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
                }
                ind->nsend[zone] = 0;
                for (cg = cg0; cg < cg1; cg++)
                {
                    r2  = 0;
                    rb2 = 0;
                    if (tric_dist[dim_ind] == 0)
                    {
                        /* Rectangular direction, easy */
                        r = cg_cm[cg][dim] - corner[dim_ind][zone];
                        if (r > 0)
                        {
                            r2 += r*r;
                        }
                        if (bDistMB_pulse)
                        {
                            r = cg_cm[cg][dim] - bcorner[dim_ind];
                            if (r > 0)
                            {
                                rb2 += r*r;
                            }
                        }
                        /* Rounding gives at most a 16% reduction
                         * in communicated atoms
                         */
                        if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
                        {
                            r = cg_cm[cg][dim0] - corner_round_0;
                            /* This is the first dimension, so always r >= 0 */
                            r2 += r*r;
                            if (bDistMB_pulse)
                            {
                                rb2 += r*r;
                            }
                        }
                        if (dim_ind == 2 && (zonei == 2 || zonei == 3))
                        {
                            r = cg_cm[cg][dim1] - corner_round_1[zone];
                            if (r > 0)
                            {
                                r2 += r*r;
                            }
                            if (bDistMB_pulse)
                            {
                                r = cg_cm[cg][dim1] - bcorner_round_1;
                                if (r > 0)
                                {
                                    rb2 += r*r;
                                }
                            }
                        }
                    }
                    else
                    {
                        /* Triclinic direction, more complicated */
                        clear_rvec(rn);
                        clear_rvec(rb);
                        /* Rounding, conservative as the skew_fac multiplication
                         * will slightly underestimate the distance.
                         */
                        if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
                        {
                            rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
                            for (i = dim0+1; i < DIM; i++)
                            {
                                rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                            }
                            r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
                            if (bDistMB_pulse)
                            {
                                rb[dim0] = rn[dim0];
                                rb2      = r2;
                            }
                            /* Take care that the cell planes along dim0 might not
                             * be orthogonal to those along dim1 and dim2.
                             */
                            for (i = 1; i <= dim_ind; i++)
                            {
                                dimd = dd->dim[i];
                                if (normal[dim0][dimd] > 0)
                                {
                                    rn[dimd] -= rn[dim0]*normal[dim0][dimd];
                                    if (bDistMB_pulse)
                                    {
                                        rb[dimd] -= rb[dim0]*normal[dim0][dimd];
                                    }
                                }
                            }
                        }
                        if (dim_ind == 2 && (zonei == 2 || zonei == 3))
                        {
                            rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
                            tric_sh = 0;
                            for (i = dim1+1; i < DIM; i++)
                            {
                                tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                            }
                            rn[dim1] += tric_sh;
                            if (rn[dim1] > 0)
                            {
                                r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
                                /* Take care of coupling of the distances
                                 * to the planes along dim0 and dim1 through dim2.
                                 */
                                r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
                                /* Take care that the cell planes along dim1
                                 * might not be orthogonal to that along dim2.
                                 */
                                if (normal[dim1][dim2] > 0)
                                {
                                    rn[dim2] -= rn[dim1]*normal[dim1][dim2];
                                }
                            }
                            if (bDistMB_pulse)
                            {
                                rb[dim1] +=
                                    cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
                                if (rb[dim1] > 0)
                                {
                                    rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
                                    /* Take care of coupling of the distances
                                     * to the planes along dim0 and dim1 through dim2.
                                     */
                                    rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
                                    /* Take care that the cell planes along dim1
                                     * might not be orthogonal to that along dim2.
                                     */
                                    if (normal[dim1][dim2] > 0)
                                    {
                                        rb[dim2] -= rb[dim1]*normal[dim1][dim2];
                                    }
                                }
                            }
                        }
                        /* The distance along the communication direction */
                        rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
                        tric_sh = 0;
                        for (i = dim+1; i < DIM; i++)
                        {
                            tric_sh -= cg_cm[cg][i]*v_d[i][dim];
                        }
                        rn[dim] += tric_sh;
                        if (rn[dim] > 0)
                        {
                            r2 += rn[dim]*rn[dim]*skew_fac2_d;
                            /* Take care of coupling of the distances
                             * to the planes along dim0 and dim1 through dim2.
                             */
                            if (dim_ind == 1 && zonei == 1)
                            {
                                r2 -= rn[dim0]*rn[dim]*skew_fac_01;
                            }
                        }
                        if (bDistMB_pulse)
                        {
                            rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
                            if (rb[dim] > 0)
                            {
                                rb2 += rb[dim]*rb[dim]*skew_fac2_d;
                                /* Take care of coupling of the distances
                                 * to the planes along dim0 and dim1 through dim2.
                                 */
                                if (dim_ind == 1 && zonei == 1)
                                {
                                    rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
                                }
                            }
                        }
                    }

                    if (r2 < r_comm2 ||
                        (bDistBonded &&
                         ((bDistMB && rb2 < r_bcomm2) ||
                          (bDist2B && r2 < r_bcomm2)) &&
                         (!bBondComm ||
                          (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
                           missing_link(comm->cglink, index_gl[cg],
                                        comm->bLocalCG)))))
                    {
                        /* Make an index to the local charge groups */
                        if (nsend+1 > ind->nalloc)
                        {
                            ind->nalloc = over_alloc_large(nsend+1);
                            srenew(ind->index, ind->nalloc);
                        }
                        if (nsend+1 > comm->nalloc_int)
                        {
                            comm->nalloc_int = over_alloc_large(nsend+1);
                            srenew(comm->buf_int, comm->nalloc_int);
                        }
                        ind->index[nsend]    = cg;
                        comm->buf_int[nsend] = index_gl[cg];
                        ind->nsend[zone]++;
                        vec_rvec_check_alloc(&comm->vbuf, nsend+1);

                        if (dd->ci[dim] == 0)
                        {
                            /* Correct cg_cm for pbc */
                            rvec_add(cg_cm[cg], box[dim], comm->vbuf.v[nsend]);
                            if (bScrew)
                            {
                                comm->vbuf.v[nsend][YY] =
                                    box[YY][YY]-comm->vbuf.v[nsend][YY];
                                comm->vbuf.v[nsend][ZZ] =
                                    box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
                            }
                        }
                        else
                        {
                            copy_rvec(cg_cm[cg], comm->vbuf.v[nsend]);
                        }
                        nsend++;
                        nat += cgindex[cg+1] - cgindex[cg];
                    }
                }
            }
7499 for(zone
=nzone_send
; zone
<nzone
; zone
++)
7501 ind
->nsend
[zone
] = 0;
7503 ind
->nsend
[nzone
] = nsend
;
7504 ind
->nsend
[nzone
+1] = nat
;
7505 /* Communicate the number of cg's and atoms to receive */
7506 dd_sendrecv_int(dd
, dim_ind
, dddirBackward
,
7507 ind
->nsend
, nzone
+2,
7508 ind
->nrecv
, nzone
+2);
7510 /* The rvec buffer is also required for atom buffers of size nsend
7511 * in dd_move_x and dd_move_f.
7513 vec_rvec_check_alloc(&comm
->vbuf
,ind
->nsend
[nzone
+1]);
7517 /* We can receive in place if only the last zone is not empty */
7518 for(zone
=0; zone
<nzone
-1; zone
++)
7520 if (ind
->nrecv
[zone
] > 0)
7522 cd
->bInPlace
= FALSE
;
7527 /* The int buffer is only required here for the cg indices */
7528 if (ind
->nrecv
[nzone
] > comm
->nalloc_int2
)
7530 comm
->nalloc_int2
= over_alloc_dd(ind
->nrecv
[nzone
]);
7531 srenew(comm
->buf_int2
,comm
->nalloc_int2
);
7533 /* The rvec buffer is also required for atom buffers
7534 * of size nrecv in dd_move_x and dd_move_f.
7536 i
= max(cd
->ind
[0].nrecv
[nzone
+1],ind
->nrecv
[nzone
+1]);
7537 vec_rvec_check_alloc(&comm
->vbuf2
,i
);
7541 /* Make space for the global cg indices */
7542 if (pos_cg
+ ind
->nrecv
[nzone
] > dd
->cg_nalloc
7543 || dd
->cg_nalloc
== 0)
7545 dd
->cg_nalloc
= over_alloc_dd(pos_cg
+ ind
->nrecv
[nzone
]);
7546 srenew(index_gl
,dd
->cg_nalloc
);
7547 srenew(cgindex
,dd
->cg_nalloc
+1);
7549 /* Communicate the global cg indices */
7552 recv_i
= index_gl
+ pos_cg
;
7556 recv_i
= comm
->buf_int2
;
7558 dd_sendrecv_int(dd
, dim_ind
, dddirBackward
,
7559 comm
->buf_int
, nsend
,
7560 recv_i
, ind
->nrecv
[nzone
]);
7562 /* Make space for cg_cm */
7563 if (pos_cg
+ ind
->nrecv
[nzone
] > fr
->cg_nalloc
)
7565 dd_realloc_fr_cg(fr
,pos_cg
+ ind
->nrecv
[nzone
]);
7568 /* Communicate cg_cm */
7571 recv_vr
= cg_cm
+ pos_cg
;
7575 recv_vr
= comm
->vbuf2
.v
;
7577 dd_sendrecv_rvec(dd
, dim_ind
, dddirBackward
,
7578 comm
->vbuf
.v
, nsend
,
7579 recv_vr
, ind
->nrecv
[nzone
]);
7581 /* Make the charge group index */
7584 zone
= (p
== 0 ? 0 : nzone
- 1);
7585 while (zone
< nzone
)
7587 for(cg
=0; cg
<ind
->nrecv
[zone
]; cg
++)
7589 cg_gl
= index_gl
[pos_cg
];
7590 fr
->cginfo
[pos_cg
] = ddcginfo(cginfo_mb
,cg_gl
);
7591 nrcg
= GET_CGINFO_NATOMS(fr
->cginfo
[pos_cg
]);
7592 cgindex
[pos_cg
+1] = cgindex
[pos_cg
] + nrcg
;
7595 /* Update the charge group presence,
7596 * so we can use it in the next pass of the loop.
7598 comm
->bLocalCG
[cg_gl
] = TRUE
;
7604 comm
->zone_ncg1
[nzone
+zone
] = ind
->nrecv
[zone
];
7607 zone_cg_range
[nzone
+zone
] = pos_cg
;
7612 /* This part of the code is never executed with bBondComm. */
7613 merge_cg_buffers(nzone
,cd
,p
,zone_cg_range
,
7614 index_gl
,recv_i
,cg_cm
,recv_vr
,
7615 cgindex
,fr
->cginfo_mb
,fr
->cginfo
);
7616 pos_cg
+= ind
->nrecv
[nzone
];
7618 nat_tot
+= ind
->nrecv
[nzone
+1];
7622 /* Store the atom block for easy copying of communication buffers */
7623 make_cell2at_index(cd
,nzone
,zone_cg_range
[nzone
],cgindex
);
7627 dd
->index_gl
= index_gl
;
7628 dd
->cgindex
= cgindex
;
7630 dd
->ncg_tot
= zone_cg_range
[zones
->n
];
7631 dd
->nat_tot
= nat_tot
;
7632 comm
->nat
[ddnatHOME
] = dd
->nat_home
;
7633 for(i
=ddnatZONE
; i
<ddnatNR
; i
++)
7635 comm
->nat
[i
] = dd
->nat_tot
;
7640 /* We don't need to update cginfo, since that was alrady done above.
7641 * So we pass NULL for the forcerec.
7643 dd_set_cginfo(dd
->index_gl
,dd
->ncg_home
,dd
->ncg_tot
,
7644 NULL
,comm
->bLocalCG
);
7649 fprintf(debug
,"Finished setting up DD communication, zones:");
7650 for(c
=0; c
<zones
->n
; c
++)
7652 fprintf(debug
," %d",zones
->cg_range
[c
+1]-zones
->cg_range
[c
]);
7654 fprintf(debug
,"\n");
static void set_cg_boundaries(gmx_domdec_zones_t *zones)
{
    int c;

    for (c = 0; c < zones->nizone; c++)
    {
        zones->izone[c].cg1  = zones->cg_range[c+1];
        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
    }
}
static int comp_cgsort(const void *a, const void *b)
{
    int comp;

    gmx_cgsort_t *cga, *cgb;
    cga = (gmx_cgsort_t *)a;
    cgb = (gmx_cgsort_t *)b;

    comp = cga->nsc - cgb->nsc;
    if (comp == 0)
    {
        comp = cga->ind_gl - cgb->ind_gl;
    }

    return comp;
}
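
/* The comparator sorts primarily on the ns grid cell index and breaks ties
 * with the global charge group index, so the resulting order is unique and
 * independent of the previous ordering, which keeps runs reproducible.
 */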
static void order_int_cg(int n, gmx_cgsort_t *sort,
                         int *a, int *buf)
{
    int i;

    /* Order the data */
    for (i = 0; i < n; i++)
    {
        buf[i] = a[sort[i].ind];
    }

    /* Copy back to the original array */
    for (i = 0; i < n; i++)
    {
        a[i] = buf[i];
    }
}
static void order_vec_cg(int n, gmx_cgsort_t *sort,
                         rvec *v, rvec *buf)
{
    int i;

    /* Order the data */
    for (i = 0; i < n; i++)
    {
        copy_rvec(v[sort[i].ind], buf[i]);
    }

    /* Copy back to the original array */
    for (i = 0; i < n; i++)
    {
        copy_rvec(buf[i], v[i]);
    }
}
static void order_vec_atom(int ncg, int *cgindex, gmx_cgsort_t *sort,
                           rvec *v, rvec *buf)
{
    int a, atot, cg, cg0, cg1, i;

    /* Order the data */
    a = 0;
    for (cg = 0; cg < ncg; cg++)
    {
        cg0 = cgindex[sort[cg].ind];
        cg1 = cgindex[sort[cg].ind+1];
        for (i = cg0; i < cg1; i++)
        {
            copy_rvec(v[i], buf[a]);
            a++;
        }
    }
    atot = a;

    /* Copy back to the original array */
    for (a = 0; a < atot; a++)
    {
        copy_rvec(buf[a], v[a]);
    }
}
static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
                         int nsort_new, gmx_cgsort_t *sort_new,
                         gmx_cgsort_t *sort1)
{
    int i1, i2, i_new;

    /* The new indices are not very ordered, so we qsort them */
    qsort(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);

    /* sort2 is already ordered, so now we can merge the two arrays */
    i1    = 0;
    i2    = 0;
    i_new = 0;
    while (i2 < nsort2 || i_new < nsort_new)
    {
        if (i2 == nsort2)
        {
            sort1[i1++] = sort_new[i_new++];
        }
        else if (i_new == nsort_new)
        {
            sort1[i1++] = sort2[i2++];
        }
        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
                 (sort2[i2].nsc == sort_new[i_new].nsc &&
                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
        {
            sort1[i1++] = sort2[i2++];
        }
        else
        {
            sort1[i1++] = sort_new[i_new++];
        }
    }
}
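
/* Complexity note: only the k newly moved entries are qsorted (O(k log k));
 * the merge with the already ordered list is linear in the total number of
 * charge groups, which is why this path is cheaper than re-sorting everything
 * when most charge groups stayed in their ns grid cell.
 */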
static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
                          rvec *cgcm, t_forcerec *fr, t_state *state,
                          int ncg_home_old)
{
    gmx_domdec_sort_t *sort;
    gmx_cgsort_t *cgsort, *sort_i;
    int ncg_new, nsort2, nsort_new, i, cell_index, *ibuf, cgsize;
    rvec *vbuf;

    sort = dd->comm->sort;

    if (dd->ncg_home > sort->sort_nalloc)
    {
        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
        srenew(sort->sort1, sort->sort_nalloc);
        srenew(sort->sort2, sort->sort_nalloc);
    }

    if (ncg_home_old >= 0)
    {
        /* The charge groups that remained in the same ns grid cell
         * are completely ordered. So we can sort efficiently by sorting
         * the charge groups that did move into the stationary list.
         */
        ncg_new   = 0;
        nsort2    = 0;
        nsort_new = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            /* Check if this cg did not move to another node */
            cell_index = fr->ns.grid->cell_index[i];
            if (cell_index != 4*fr->ns.grid->ncells)
            {
                if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
                {
                    /* This cg is new on this node or moved ns grid cell */
                    if (nsort_new >= sort->sort_new_nalloc)
                    {
                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
                        srenew(sort->sort_new, sort->sort_new_nalloc);
                    }
                    sort_i = &(sort->sort_new[nsort_new++]);
                }
                else
                {
                    /* This cg did not move */
                    sort_i = &(sort->sort2[nsort2++]);
                }
                /* Sort on the ns grid cell indices
                 * and the global topology index
                 */
                sort_i->nsc    = cell_index;
                sort_i->ind_gl = dd->index_gl[i];
                sort_i->ind    = i;
                ncg_new++;
            }
        }
        if (debug)
        {
            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
                    nsort2, nsort_new);
        }
        /* Sort efficiently */
        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
                     sort->sort1);
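
        /* The incremental path above keeps two lists: sort2 for charge groups
         * whose ns cell index is unchanged (already in sorted order) and
         * sort_new for the ones that moved or just arrived; ordered_sort()
         * then only has to qsort the short sort_new list before merging.
         */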
    }
    else
    {
        cgsort = sort->sort1;
        ncg_new = 0;
        for (i = 0; i < dd->ncg_home; i++)
        {
            /* Sort on the ns grid cell indices
             * and the global topology index
             */
            cgsort[i].nsc    = fr->ns.grid->cell_index[i];
            cgsort[i].ind_gl = dd->index_gl[i];
            cgsort[i].ind    = i;
            if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
            {
                ncg_new++;
            }
        }
        if (debug)
        {
            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
        }
        /* Determine the order of the charge groups using qsort */
        qsort(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
    }
    cgsort = sort->sort1;

    /* We alloc with the old size, since cgindex is still old */
    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
    vbuf = dd->comm->vbuf.v;

    /* Remove the charge groups which are no longer at home here */
    dd->ncg_home = ncg_new;

    /* Reorder the state */
    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i) && state->flags & (1<<i))
        {
            switch (i)
            {
            case estX:
                order_vec_atom(dd->ncg_home, dd->cgindex, cgsort, state->x, vbuf);
                break;
            case estV:
                order_vec_atom(dd->ncg_home, dd->cgindex, cgsort, state->v, vbuf);
                break;
            case estSDX:
                order_vec_atom(dd->ncg_home, dd->cgindex, cgsort, state->sd_X, vbuf);
                break;
            case estCGP:
                order_vec_atom(dd->ncg_home, dd->cgindex, cgsort, state->cg_p, vbuf);
                break;
            case estDISRE_INITF:
            case estDISRE_RM3TAV:
            case estORIRE_INITF:
                /* No ordering required */
                break;
            default:
                gmx_incons("Unknown state entry encountered in dd_sort_state");
                break;
            }
        }
    }

    /* Reorder cgcm */
    order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);

    if (dd->ncg_home+1 > sort->ibuf_nalloc)
    {
        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
        srenew(sort->ibuf, sort->ibuf_nalloc);
    }
    ibuf = sort->ibuf;
    /* Reorder the global cg index */
    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
    /* Reorder the cginfo */
    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
    /* Rebuild the local cg index */
    ibuf[0] = 0;
    for (i = 0; i < dd->ncg_home; i++)
    {
        cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
        ibuf[i+1] = ibuf[i] + cgsize;
    }
    for (i = 0; i < dd->ncg_home+1; i++)
    {
        dd->cgindex[i] = ibuf[i];
    }
    /* Set the home atom number */
    dd->nat_home = dd->cgindex[dd->ncg_home];

    /* Copy the sorted ns cell indices back to the ns grid struct */
    for (i = 0; i < dd->ncg_home; i++)
    {
        fr->ns.grid->cell_index[i] = cgsort[i].nsc;
    }
    fr->ns.grid->nr = dd->ncg_home;
}
static void add_dd_statistics(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int ddnat;

    comm = dd->comm;

    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        comm->sum_nat[ddnat-ddnatZONE] +=
            comm->nat[ddnat] - comm->nat[ddnat-1];
    }
    comm->ndecomp++;
}
void reset_dd_statistics_counters(gmx_domdec_t *dd)
{
    gmx_domdec_comm_t *comm;
    int ddnat;

    comm = dd->comm;

    /* Reset all the statistics and counters for total run counting */
    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        comm->sum_nat[ddnat-ddnatZONE] = 0;
    }
    comm->ndecomp   = 0;
    comm->load_step = 0;
    clear_ivec(comm->load_lim);
}
void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
{
    gmx_domdec_comm_t *comm;
    int ddnat;
    double av;

    comm = cr->dd->comm;

    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);

    if (fplog == NULL)
    {
        return;
    }

    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");

    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
    {
        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
        switch (ddnat)
        {
        case ddnatZONE:
            fprintf(fplog,
                    " av. #atoms communicated per step for force:  %d x %.1f\n",
                    2, av);
            break;
        case ddnatVSITE:
            if (cr->dd->vsite_comm)
            {
                fprintf(fplog,
                        " av. #atoms communicated per step for vsites: %d x %.1f\n",
                        (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
                        av);
            }
            break;
        case ddnatCON:
            if (cr->dd->constraint_comm)
            {
                fprintf(fplog,
                        " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
                        1 + ir->nLincsIter, av);
            }
            break;
        default:
            gmx_incons(" Unknown type for DD statistics");
        }
    }
    fprintf(fplog, "\n");

    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
    {
        print_dd_load_av(fplog, cr->dd);
    }
}
void dd_partition_system(FILE *fplog,
                         gmx_large_int_t step,
                         t_commrec *cr,
                         bool bMasterState,
                         int nstglobalcomm,
                         t_state *state_global,
                         gmx_mtop_t *top_global,
                         t_inputrec *ir,
                         t_state *state_local,
                         rvec **f,
                         t_mdatoms *mdatoms,
                         gmx_localtop_t *top_local,
                         t_forcerec *fr,
                         gmx_vsite_t *vsite,
                         gmx_shellfc_t shellfc,
                         gmx_constr_t constr,
                         t_nrnb *nrnb,
                         gmx_wallcycle_t wcycle,
                         bool bVerbose)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    gmx_ddbox_t ddbox;
    t_block *cgs_gl;
    gmx_large_int_t step_pcoupl;
    rvec cell_ns_x0, cell_ns_x1;
    int i, j, n, cg0 = 0, ncg_home_old = -1, nat_f_novirsum;
    bool bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
    bool bRedist, bSortCG, bResortAll;
    ivec ncells_old, np;
    real grid_density;
    char sbuf[STRLEN];

    dd   = cr->dd;
    comm = dd->comm;

    bBoxChanged = (bMasterState || DEFORM(*ir));
    if (ir->epc != epcNO)
    {
        /* With nstcalcenergy > 1, pressure coupling happens
         * one step after calculating the energies.
         * Box scaling happens at the end of the MD step,
         * after the DD partitioning.
         * We therefore have to do DLB in the first partitioning
         * after an MD step where P-coupling occurred.
         * We need to determine the last step in which p-coupling occurred.
         * MRS -- need to validate this for vv?
         */
        n = ir->nstcalcenergy;
        if (n == 1)
        {
            step_pcoupl = step - 1;
        }
        else
        {
            step_pcoupl = ((step - 1)/n)*n + 1;
        }
        if (step_pcoupl >= comm->partition_step)
        {
            bBoxChanged = TRUE;
        }
    }
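
    /* Worked example of the last-P-coupling-step formula above (illustrative
     * numbers): with nstcalcenergy n = 10 and step = 25,
     * step_pcoupl = ((25-1)/10)*10 + 1 = 21, the most recent step at which
     * pressure coupling scaled the box, so a repartitioning after step 21
     * must still treat the box as changed.
     */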

    bNStGlobalComm = (step >= comm->partition_step + nstglobalcomm);

    if (!comm->bDynLoadBal)
    {
        bDoDLB = FALSE;
    }
    else
    {
        /* Should we do dynamic load balancing this step?
         * Since it requires (possibly expensive) global communication,
         * we might want to do DLB less frequently.
         */
        if (bBoxChanged || ir->epc != epcNO)
        {
            bDoDLB = bBoxChanged;
        }
        else
        {
            bDoDLB = bNStGlobalComm;
        }
    }

    /* Check if we have recorded loads on the nodes */
    if (comm->bRecordLoad && dd_load_count(comm))
    {
        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
        {
            /* Check if we should use DLB at the second partitioning
             * and every 100 partitionings,
             * so the extra communication cost is negligible.
             */
            n = max(100, nstglobalcomm);
            bCheckDLB = (comm->n_load_collect == 0 ||
                         comm->n_load_have % n == n-1);
        }
        else
        {
            bCheckDLB = FALSE;
        }

        /* Print load every nstlog, first and last step to the log file */
        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
                    comm->n_load_collect == 0 ||
                    (step + ir->nstlist > ir->init_step + ir->nsteps));

        /* Avoid extra communication due to verbose screen output
         * when nstglobalcomm is set.
         */
        if (bDoDLB || bLogLoad || bCheckDLB ||
            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
        {
            get_load_distribution(dd, wcycle);
            if (DDMASTER(dd))
            {
                if (bLogLoad)
                {
                    dd_print_load(fplog, dd, step-1);
                }
                if (bVerbose)
                {
                    dd_print_load_verbose(dd);
                }
            }
            comm->n_load_collect++;

            if (bCheckDLB)
            {
                /* Since the timings are node dependent, the master decides */
                if (DDMASTER(dd))
                {
                    bTurnOnDLB =
                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
                    if (debug)
                    {
                        fprintf(debug, "step %s, imb loss %f\n",
                                gmx_step_str(step, sbuf),
                                dd_force_imb_perf_loss(dd));
                    }
                }
                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
                if (bTurnOnDLB)
                {
                    turn_on_dlb(fplog, cr, step);
                }
            }
        }
        comm->n_load_have++;
    }
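
    /* Cadence note for the check above: with eDLB == edlbAUTO the imbalance
     * test runs at the second partitioning (n_load_collect == 0) and then
     * once every max(100, nstglobalcomm) recorded loads, so the extra global
     * communication stays negligible; DD_PERF_LOSS is the force-imbalance
     * loss fraction that triggers turning DLB on.
     */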
8196 cgs_gl
= &comm
->cgs_gl
;
8201 /* Clear the old state */
8202 clear_dd_indices(dd
,0,0);
8204 set_ddbox(dd
,bMasterState
,cr
,ir
,state_global
->box
,
8205 TRUE
,cgs_gl
,state_global
->x
,&ddbox
);
8207 get_cg_distribution(fplog
,step
,dd
,cgs_gl
,
8208 state_global
->box
,&ddbox
,state_global
->x
);
8210 dd_distribute_state(dd
,cgs_gl
,
8211 state_global
,state_local
,f
);
8213 dd_make_local_cgs(dd
,&top_local
->cgs
);
8215 if (dd
->ncg_home
> fr
->cg_nalloc
)
8217 dd_realloc_fr_cg(fr
,dd
->ncg_home
);
8219 calc_cgcm(fplog
,0,dd
->ncg_home
,
8220 &top_local
->cgs
,state_local
->x
,fr
->cg_cm
);
8222 inc_nrnb(nrnb
,eNR_CGCM
,dd
->nat_home
);
8224 dd_set_cginfo(dd
->index_gl
,0,dd
->ncg_home
,fr
,comm
->bLocalCG
);
8228 else if (state_local
->ddp_count
!= dd
->ddp_count
)
8230 if (state_local
->ddp_count
> dd
->ddp_count
)
8232 gmx_fatal(FARGS
,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local
->ddp_count
,dd
->ddp_count
);
8235 if (state_local
->ddp_count_cg_gl
!= state_local
->ddp_count
)
8237 gmx_fatal(FARGS
,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local
->ddp_count_cg_gl
,state_local
->ddp_count
);
8240 /* Clear the old state */
8241 clear_dd_indices(dd
,0,0);
8243 /* Build the new indices */
8244 rebuild_cgindex(dd
,cgs_gl
->index
,state_local
);
8245 make_dd_indices(dd
,cgs_gl
->index
,0);
8247 /* Redetermine the cg COMs */
8248 calc_cgcm(fplog
,0,dd
->ncg_home
,
8249 &top_local
->cgs
,state_local
->x
,fr
->cg_cm
);
8251 inc_nrnb(nrnb
,eNR_CGCM
,dd
->nat_home
);
8253 dd_set_cginfo(dd
->index_gl
,0,dd
->ncg_home
,fr
,comm
->bLocalCG
);
8255 set_ddbox(dd
,bMasterState
,cr
,ir
,state_local
->box
,
8256 TRUE
,&top_local
->cgs
,state_local
->x
,&ddbox
);
8258 bRedist
= comm
->bDynLoadBal
;
    else
    {
        /* We have the full state, only redistribute the cgs */

        /* Clear the non-home indices */
        clear_dd_indices(dd,dd->ncg_home,dd->nat_home);

        /* Avoid global communication for dim's without pbc and -gcom */
        if (!bNStGlobalComm)
        {
            copy_rvec(comm->box0,ddbox.box0);
            copy_rvec(comm->box_size,ddbox.box_size);
        }
        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
                  bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);

        bRedist = TRUE;
    }
    /* For dim's without pbc and -gcom */
    copy_rvec(ddbox.box0,comm->box0);
    copy_rvec(ddbox.box_size,comm->box_size);

    set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
                      step,wcycle);

    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
    {
        write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
    }

    /* Check if we should sort the charge groups */
    if (comm->nstSortCG > 0)
    {
        bSortCG = (bMasterState ||
                   (bRedist && (step % comm->nstSortCG == 0)));
    }
    else
    {
        bSortCG = FALSE;
    }

    ncg_home_old = dd->ncg_home;
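
    /* Redistribution sends the charge groups that have left the home
     * cell to their new home node; the returned cg0 is the first home
     * charge group for which the local indices still need rebuilding.
     */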

    if (bRedist)
    {
        cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
                                 state_local,f,fr,mdatoms,
                                 !bSortCG,nrnb);
    }

    get_nsgrid_boundaries(fr->ns.grid,dd,
                          state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
                          dd->ncg_home,fr->cg_cm,
                          cell_ns_x0,cell_ns_x1,&grid_density);
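
    /* When the box has changed, the neighbor-search cell sizes are
     * redetermined and communicated along the decomposition dimensions
     * before the grid is set up.
     */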
    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
    }

    copy_ivec(fr->ns.grid->n,ncells_old);
    grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
               state_local->box,cell_ns_x0,cell_ns_x1,
               fr->rlistlong,grid_density);
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir,comm->tric_dir);

    if (bSortCG)
    {
        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */

        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);
        fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
                  0,dd->ncg_home,fr->cg_cm);

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        bResortAll = (bMasterState ||
                      fr->ns.grid->n[XX] != ncells_old[XX] ||
                      fr->ns.grid->n[YY] != ncells_old[YY] ||
                      fr->ns.grid->n[ZZ] != ncells_old[ZZ]);

        if (debug)
        {
            fprintf(debug,"Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step,sbuf),dd->ncg_home);
        }
        dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
                      bResortAll ? -1 : ncg_home_old);
        /* Rebuild all the indices */
        cg0 = 0;
        ga2la_clear(dd->ga2la);
    }
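
    /* After sorting, the old global-to-local lookup is stale, so
     * cg0 = 0 and ga2la_clear above force a complete index rebuild
     * in make_dd_indices below.
     */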

    /* Set up the communication and communicate the coordinates */
    setup_dd_communication(dd,state_local->box,&ddbox,fr);

    /* Set the indices */
    make_dd_indices(dd,cgs_gl->index,cg0);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    /*
    write_dd_pdb("dd_home",step,"dump",top_global,cr,
                 -1,state_local->x,state_local->box);
    */
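
    /* The commented-out write_dd_pdb call above is a debugging hook:
     * when enabled it dumps the home atoms to a PDB file at every
     * partitioning.
     */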

    /* Extract a local topology from the global topology */
    for(i=0; i<dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
    dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
                      comm->cellsize_min,np,
                      fr,vsite,top_global,top_local);
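
    /* np holds the number of communication pulses per decomposition
     * dimension; together with comm->cellsize_min this presumably
     * allows dd_make_local_top to verify that bonded interactions do
     * not extend beyond the communicated zones.
     */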

    /* Set up the special atom communication */
    n = comm->nat[ddnatZONE];
    for(i=ddnatZONE+1; i<ddnatNR; i++)
    {
        switch(i)
        {
        case ddnatVSITE:
            if (vsite && vsite->n_intercg_vsite)
            {
                n = dd_make_local_vsites(dd,n,top_local->idef.il);
            }
            break;
        case ddnatCON:
            if (dd->bInterCGcons)
            {
                /* Special code is only needed for inter-cg constraints */
                n = dd_make_local_constraints(dd,n,top_global,
                                              constr,ir->nProjOrder,
                                              &top_local->idef.il[F_CONSTR]);
            }
            break;
        default:
            gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }
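
    /* comm->nat[] thus holds cumulative atom counts: each special atom
     * type appends the extra atoms it communicates on top of the zone
     * atoms, and comm->nat[ddnatNR-1] is the grand total.
     */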

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];
    if (state_local->natoms > state_local->nalloc)
    {
        dd_realloc_state(state_local,f,state_local->natoms);
    }

    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }
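
    /* nat_f_novirsum sizes the separate force buffer for contributions
     * that must not enter the virial summation; how far it extends
     * depends on whether inter-cg virtual sites or full electrostatics
     * with inter-cg exclusions are present.
     */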

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when using a twin-range setup
     * or with energy minimization. For simple simulations we could
     * avoid some allocation, zeroing and copying, but this is
     * probably not worth the complications and checking.
     */
    forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
                        comm->nat[ddnatCON],nat_f_novirsum);

    /* We make all mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global,ir,
             comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);

    /* Now that we have the charges, we can sort the FE interactions */
    dd_sort_local_top(dd,mdatoms,top_local);

    if (shellfc)
    {
        /* Make the local shell stuff, currently no communication is done */
        make_local_shells(cr,mdatoms,shellfc);
    }

    if (ir->implicit_solvent)
    {
        make_local_gb(cr,fr->born,ir->gb_algorithm);
    }

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges to our PME only node */
        gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
                       mdatoms->chargeA,mdatoms->chargeB,
                       comm->ddpme[0].maxshift,comm->ddpme[1].maxshift);
    }
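
    /* With separate PME nodes, each particle-particle node forwards
     * its (possibly perturbed) charges plus the ddpme maxshift values,
     * so the PME node can redistribute the charges consistently with
     * the new decomposition.
     */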

    if (constr)
    {
        set_constraints(constr,top_local,ir,mdatoms,cr);
    }

    if (ir->ePull != epullNO)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(dd,ir->pull,mdatoms);
    }

    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd,state_local->box,state_local->x);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd,state_local->box,state_local->x);
        write_dd_pdb("dd_dump",step,"dump",top_global,cr,
                     -1,state_local->x,state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;

    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
                                "after partitioning");
    }