/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "nbnxn_grid.h"

#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/gmxlib/gmx_omp_nthreads.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nb_verlet.h"
#include "gromacs/mdlib/nbnxn_atomdata.h"
#include "gromacs/mdlib/nbnxn_consts.h"
#include "gromacs/mdlib/nbnxn_internal.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_simd.h"
#include "gromacs/mdlib/nbnxn_util.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"
struct gmx_domdec_zones_t;

static void nbnxn_grid_init(nbnxn_grid_t *grid)
void nbnxn_grids_init(nbnxn_search_t nbs, int ngrid)
    snew(nbs->grid, nbs->ngrid);
    for (g = 0; g < nbs->ngrid; g++)
        nbnxn_grid_init(&nbs->grid[g]);
static real grid_atom_density(int n, rvec corner0, rvec corner1)
    /* To avoid zero density we use a minimum of 1 atom */
    rvec_sub(corner1, corner0, size);

    return n/(size[XX]*size[YY]*size[ZZ]);
static int set_grid_size_xy(const nbnxn_search_t nbs,
                            int n, rvec corner0, rvec corner1,
    real tlen, tlen_x, tlen_y;

    rvec_sub(corner1, corner0, size);

    assert(atom_density > 0);
    /* target cell length */
    /* To minimize the zero interactions, we should make
     * the largest of the i/j cell cubic.
     */
    na_c = std::max(grid->na_c, grid->na_cj);

    /* Approximately cubic cells */
    tlen   = std::pow(static_cast<real>(na_c/atom_density), static_cast<real>(1.0/3.0));

    /* Approximately cubic sub cells */
    tlen   = std::pow(static_cast<real>(grid->na_c/atom_density), static_cast<real>(1.0/3.0));
    tlen_x = tlen*GPU_NSUBCELL_X;
    tlen_y = tlen*GPU_NSUBCELL_Y;
    /* We round ncx and ncy down, because we get fewer cell pairs
     * in the pair list when the fixed cell dimensions (x,y) are
     * larger than the variable one (z) than the other way around.
     */
    grid->ncx = std::max(1, static_cast<int>(size[XX]/tlen_x));
    grid->ncy = std::max(1, static_cast<int>(size[YY]/tlen_y));

    grid->sx     = size[XX]/grid->ncx;
    grid->sy     = size[YY]/grid->ncy;
    grid->inv_sx = 1/grid->sx;
    grid->inv_sy = 1/grid->sy;
        /* This is a non-home zone, add an extra row of cells
         * for particles communicated for bonded interactions.
         * These can be beyond the cut-off. It doesn't matter where
         * they end up on the grid, but for performance it's better
         * if they don't end up in cells that can be within cut-off range.
         */

    /* We need one additional cell entry for particles moved by DD */
    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
        srenew(grid->cxy_na, grid->cxy_nalloc);
        srenew(grid->cxy_ind, grid->cxy_nalloc+1);

    for (int t = 0; t < nbs->nthread_max; t++)
        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
    /* Worst case scenario of 1 atom in each last cell */
    if (grid->na_cj <= grid->na_c)
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;

    if (nc_max > grid->nc_nalloc)
        grid->nc_nalloc = over_alloc_large(nc_max);
        srenew(grid->nsubc, grid->nc_nalloc);
        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
        sfree_aligned(grid->bb);
        /* This snew also zeros the contents, which avoids possible
         * floating exceptions in SIMD with the unused bb elements.
         */
        snew_aligned(grid->bb, grid->nc_nalloc, 16);

            pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
            snew_aligned(grid->pbb, pbb_nalloc, 16);

            snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);

        if (grid->na_cj == grid->na_c)
            grid->bbj = grid->bb;
            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);

        srenew(grid->flags, grid->nc_nalloc);
        srenew(grid->fep, grid->nc_nalloc*grid->na_sc/grid->na_c);

    copy_rvec(corner0, grid->c0);
    copy_rvec(corner1, grid->c1);
    copy_rvec(size, grid->size);
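/* Editorial note (illustrative, not part of the original file): the target
 * cell length used above is just the edge of a cube that holds one cluster
 * of na_c atoms at the given density, tlen = (na_c/atom_density)^(1/3).
 * A minimal numeric sketch, assuming a cluster of 4 atoms and a water-like
 * density of 100 atoms/nm^3 in a box with a 5 nm edge:
 *
 *   double na_c         = 4.0;
 *   double atom_density = 100.0;                        // atoms per nm^3
 *   double tlen         = std::cbrt(na_c/atom_density); // ~0.34 nm
 *   int    ncx          = std::max(1, static_cast<int>(5.0/tlen)); // ~14 cells
 */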
/* We need to sort particles in grid columns on the z-coordinate.
 * As particles are very often distributed homogeneously, we use a sorting
 * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 * by a factor, cast to an int and try to store in that hole. If the hole
 * is full, we move this or another particle. A second pass is needed to make
 * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 * 4 is the optimal value for homogeneous particle distribution and allows
 * for an O(#particles) sort up to distributions where all particles
 * are concentrated in 1/4 of the space. No NlogN fallback is implemented,
 * as it can be expensive to detect inhomogeneous particle distributions.
 * SGSF is the maximum ratio of holes used, in the worst case all particles
 * end up in the last hole and we need #particles extra holes at the end.
 */
#define SORT_GRID_OVERSIZE 4
#define SGSF (SORT_GRID_OVERSIZE + 1)
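/* Illustrative sketch (editorial addition, not part of the original file):
 * the hole-based sort described above, reduced to a plain std::vector of
 * indices sorted on a z array. The helper name and signature are
 * hypothetical; the real sort_atoms() below additionally handles backwards
 * ordering, DD zones and out-of-range particles. Assumes <vector> is
 * available and h0 <= z < h0 + 1/invh for all entries.
 */
#if 0
static void pigeonhole_sort_sketch(std::vector<int> &a, const std::vector<float> &z,
                                   float h0, float invh, int n_per_h)
{
    /* One hole per 1/(n_per_h*SORT_GRID_OVERSIZE) of the range, plus n spill slots */
    std::vector<int> holes(n_per_h*SORT_GRID_OVERSIZE + a.size(), -1);
    float            scale = invh*n_per_h*SORT_GRID_OVERSIZE;

    for (int ai : a)
    {
        int hole = static_cast<int>((z[ai] - h0)*scale);
        /* Skip occupants with smaller or equal z so each run of holes stays ordered */
        while (holes[hole] >= 0 && z[ai] >= z[holes[hole]])
        {
            hole++;
        }
        if (holes[hole] >= 0)
        {
            /* Shift the rest of the run up one slot to make room */
            int end = hole;
            while (holes[end] >= 0)
            {
                end++;
            }
            for (int k = end; k > hole; k--)
            {
                holes[k] = holes[k-1];
            }
        }
        holes[hole] = ai;
    }

    /* Second pass: compact the occupied holes back into a, now ordered in z */
    a.clear();
    for (int ai : holes)
    {
        if (ai >= 0)
        {
            a.push_back(ai);
        }
    }
}
#endif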
/* Sort particle index a on coordinates x along dim.
 * Backwards tells if we want decreasing instead of increasing coordinates.
 * h0 is the minimum of the coordinate range.
 * invh is the 1/length of the sorting range.
 * n_per_h (>=n) is the expected average number of particles per 1/invh
 * sort is the sorting work array.
 * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 * or easier, allocate at least n*SGSF elements.
 */
static void sort_atoms(int dim, gmx_bool Backwards,
                       int gmx_unused dd_zone,
                       int *a, int n, rvec *x,
                       real h0, real invh, int n_per_h,
        gmx_incons("n > n_per_h");
    /* Transform the inverse range height into the inverse hole height */
    invh *= n_per_h*SORT_GRID_OVERSIZE;

    /* Set nsort to the maximum possible number of holes used.
     * In worst case all n elements end up in the last bin.
     */
    int nsort = n_per_h*SORT_GRID_OVERSIZE + n;

    /* Determine the index range used, so we can limit it for the second pass */
    int zi_min = INT_MAX;

    /* Sort the particles using a simple index sort */
    for (int i = 0; i < n; i++)
        /* The cast takes care of floating-point rounding effects below zero.
         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
         * times the box height out of the box.
         */
        int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);

        /* As we can have rounding effects, we use > instead of >= here */
        if (zi < 0 || (dd_zone == 0 && zi > n_per_h*SORT_GRID_OVERSIZE))
            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
                      n_per_h, SORT_GRID_OVERSIZE);

            /* In a non-local domain, particles communicated for bonded interactions
             * can be far beyond the grid size, which is set by the non-bonded
             * cut-off distance. We sort such particles into the last cell.
             */
            if (zi > n_per_h*SORT_GRID_OVERSIZE)
                zi = n_per_h*SORT_GRID_OVERSIZE;
        /* Ideally this particle should go in sort cell zi,
         * but that might already be in use,
         * in that case find the first empty cell higher up
         */
            zi_min = std::min(zi_min, zi);
            zi_max = std::max(zi_max, zi);

            /* We have multiple atoms in the same sorting slot.
             * Sort on real z for minimal bounding box size.
             * There is an extra check for identical z to ensure
             * well-defined output order, independent of input order
             * to ensure binary reproducibility after restarts.
             */
            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
                                     (x[a[i]][dim] == x[sort[zi]][dim] &&

                /* Shift all elements by one slot until we find an empty slot */
                while (sort[zim] >= 0)
                zi_max = std::max(zi_max, zim);
            zi_max = std::max(zi_max, zi);

    for (int zi = 0; zi < nsort; zi++)

        for (int zi = zi_max; zi >= zi_min; zi--)

        gmx_incons("Lost particles while sorting");
#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
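/* Editorial note (illustrative, not part of the original file): R2F_D/R2F_U
 * convert a real to float while nudging the value down/up by one relative
 * epsilon, so bounding-box lower/upper corners built from them always
 * enclose the original (possibly double precision) coordinates. A minimal
 * sketch of the same idea with plain <cfloat> constants:
 *
 *   float lo = (xl >= 0 ? (1 - FLT_EPSILON)*xl : (1 + FLT_EPSILON)*xl);  // lo <= xl
 *   float hi = (xh >= 0 ? (1 + FLT_EPSILON)*xh : (1 - FLT_EPSILON)*xh);  // hi >= xh
 */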
/* Coordinate order x,y,z, bb order xyz0 */
static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[j+XX*PACK_X4]);
        xh = std::max(xh, x[j+XX*PACK_X4]);
        yl = std::min(yl, x[j+YY*PACK_X4]);
        yh = std::max(yh, x[j+YY*PACK_X4]);
        zl = std::min(zl, x[j+ZZ*PACK_X4]);
        zh = std::max(zh, x[j+ZZ*PACK_X4]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[j+XX*PACK_X8]);
        xh = std::max(xh, x[j+XX*PACK_X8]);
        yl = std::min(yl, x[j+YY*PACK_X8]);
        yh = std::max(yh, x[j+YY*PACK_X8]);
        zl = std::min(zl, x[j+ZZ*PACK_X8]);
        zh = std::max(zh, x[j+ZZ*PACK_X8]);

    /* Note: possible double to float conversion here */
    bb->lower[BB_X] = R2F_D(xl);
    bb->lower[BB_Y] = R2F_D(yl);
    bb->lower[BB_Z] = R2F_D(zl);
    bb->upper[BB_X] = R2F_U(xh);
    bb->upper[BB_Y] = R2F_U(yh);
    bb->upper[BB_Z] = R2F_U(zh);
/* Packed coordinates, bb order xyz0 */
static void calc_bounding_box_x_x4_halves(int na, const real *x,
                                          nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
    calc_bounding_box_x_x4(std::min(na, 2), x, bbj);

        calc_bounding_box_x_x4(std::min(na-2, 2), x+(PACK_X4>>1), bbj+1);

        /* Set the "empty" bounding box to the same as the first one,
         * so we don't need to treat special cases in the rest of the code.
         */
#ifdef NBNXN_SEARCH_BB_SIMD4
        gmx_simd4_store_f(&bbj[1].lower[0], gmx_simd4_load_f(&bbj[0].lower[0]));
        gmx_simd4_store_f(&bbj[1].upper[0], gmx_simd4_load_f(&bbj[0].upper[0]));

#ifdef NBNXN_SEARCH_BB_SIMD4
    gmx_simd4_store_f(&bb->lower[0],
                      gmx_simd4_min_f(gmx_simd4_load_f(&bbj[0].lower[0]),
                                      gmx_simd4_load_f(&bbj[1].lower[0])));
    gmx_simd4_store_f(&bb->upper[0],
                      gmx_simd4_max_f(gmx_simd4_load_f(&bbj[0].upper[0]),
                                      gmx_simd4_load_f(&bbj[1].upper[0])));

    for (i = 0; i < NNBSBB_C; i++)
        bb->lower[i] = std::min(bbj[0].lower[i], bbj[1].lower[i]);
        bb->upper[i] = std::max(bbj[0].upper[i], bbj[1].upper[i]);
#ifdef NBNXN_SEARCH_BB_SIMD4

/* Coordinate order xyz, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
    real xl, xh, yl, yh, zl, zh;

    for (int j = 1; j < na; j++)
        xl = std::min(xl, x[i+XX]);
        xh = std::max(xh, x[i+XX]);
        yl = std::min(yl, x[i+YY]);
        yh = std::max(yh, x[i+YY]);
        zl = std::min(zl, x[i+ZZ]);
        zh = std::max(zh, x[i+ZZ]);

    /* Note: possible double to float conversion here */
    bb[0*STRIDE_PBB] = R2F_D(xl);
    bb[1*STRIDE_PBB] = R2F_D(yl);
    bb[2*STRIDE_PBB] = R2F_D(zl);
    bb[3*STRIDE_PBB] = R2F_U(xh);
    bb[4*STRIDE_PBB] = R2F_U(yh);
    bb[5*STRIDE_PBB] = R2F_U(zh);

#endif /* NBNXN_SEARCH_BB_SIMD4 */
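/* Editorial note (illustrative, not part of the original file): in the packed
 * "xxxxyyyyzzzz" layout produced above, STRIDE_PBB bounding boxes are stored
 * interleaved so a SIMD4 load reads the same component of consecutive boxes.
 * For box i within a packed group starting at float *p, component d
 * (0..2 = lower x,y,z; 3..5 = upper x,y,z) is found as:
 *
 *   float component = p[d*STRIDE_PBB + i];
 *
 * which matches the indexing in calc_bounding_box_xxxx() above and in
 * print_bbsizes_supersub() below.
 */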
#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB

/* Coordinate order xyz?, bb order xyz0 */
static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb)
    gmx_simd4_float_t bb_0_S, bb_1_S;
    gmx_simd4_float_t x_S;

    bb_0_S = gmx_simd4_load_f(x);

    for (int i = 1; i < na; i++)
        x_S    = gmx_simd4_load_f(x+i*NNBSBB_C);
        bb_0_S = gmx_simd4_min_f(bb_0_S, x_S);
        bb_1_S = gmx_simd4_max_f(bb_1_S, x_S);

    gmx_simd4_store_f(&bb->lower[0], bb_0_S);
    gmx_simd4_store_f(&bb->upper[0], bb_1_S);

/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
static void calc_bounding_box_xxxx_simd4(int na, const float *x,
                                         nbnxn_bb_t *bb_work_aligned,
    calc_bounding_box_simd4(na, x, bb_work_aligned);

    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];

#endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
/* Combines pairs of consecutive bounding boxes */
static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
    for (int i = 0; i < grid->ncx*grid->ncy; i++)
        /* Starting bb in a column is expected to be 2-aligned */
        int sc2 = grid->cxy_ind[i]>>1;
        /* For odd numbers skip the last bb here */
        int nc2 = (grid->cxy_na[i]+3)>>(2+1);

        for (c2 = sc2; c2 < sc2+nc2; c2++)
#ifdef NBNXN_SEARCH_BB_SIMD4
            gmx_simd4_float_t min_S, max_S;

            min_S = gmx_simd4_min_f(gmx_simd4_load_f(&bb[c2*2+0].lower[0]),
                                    gmx_simd4_load_f(&bb[c2*2+1].lower[0]));
            max_S = gmx_simd4_max_f(gmx_simd4_load_f(&bb[c2*2+0].upper[0]),
                                    gmx_simd4_load_f(&bb[c2*2+1].upper[0]));
            gmx_simd4_store_f(&grid->bbj[c2].lower[0], min_S);
            gmx_simd4_store_f(&grid->bbj[c2].upper[0], max_S);

            for (int j = 0; j < NNBSBB_C; j++)
                grid->bbj[c2].lower[j] = std::min(bb[c2*2+0].lower[j],
                                                  bb[c2*2+1].lower[j]);
                grid->bbj[c2].upper[j] = std::max(bb[c2*2+0].upper[j],
                                                  bb[c2*2+1].upper[j]);

        if (((grid->cxy_na[i]+3)>>2) & 1)
            /* The bb count in this column is odd: duplicate the last bb */
            for (int j = 0; j < NNBSBB_C; j++)
                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
/* Prints the average bb size, used for debug output */
static void print_bbsizes_simple(FILE               *fp,
                                 const nbnxn_grid_t *grid)
    for (int c = 0; c < grid->nc; c++)
        for (int d = 0; d < DIM; d++)
            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];

    dsvmul(1.0/grid->nc, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->atom_density > 0 ?
            grid->na_c/(grid->atom_density*grid->sx*grid->sy) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_c/(grid->atom_density*grid->sx*grid->sy)) : 0.0);
/* Prints the average bb size, used for debug output */
static void print_bbsizes_supersub(FILE               *fp,
                                   const nbnxn_grid_t *grid)
    for (int c = 0; c < grid->nc; c++)
        for (int s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
            int cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
            for (int i = 0; i < STRIDE_PBB; i++)
                for (int d = 0; d < DIM; d++)
                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
                        grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];

        for (int s = 0; s < grid->nsubc[c]; s++)
            int cs = c*GPU_NSUBCELL + s;
            for (int d = 0; d < DIM; d++)
                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];

        ns += grid->nsubc[c];

    dsvmul(1.0/ns, ba, ba);

    fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
            grid->sx/GPU_NSUBCELL_X,
            grid->sy/GPU_NSUBCELL_Y,
            grid->atom_density > 0 ?
            grid->na_sc/(grid->atom_density*grid->sx*grid->sy*GPU_NSUBCELL_Z) : 0.0,
            ba[XX], ba[YY], ba[ZZ],
            ba[XX]*GPU_NSUBCELL_X/grid->sx,
            ba[YY]*GPU_NSUBCELL_Y/grid->sy,
            grid->atom_density > 0 ?
            ba[ZZ]/(grid->na_sc/(grid->atom_density*grid->sx*grid->sy*GPU_NSUBCELL_Z)) : 0.0);
/* Set non-bonded interaction flags for the current cluster.
 * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 */
static void sort_cluster_on_flag(int na_c,
                                 int a0, int a1, const int *atinfo,
    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];

    for (int s = a0; s < a1; s += na_c)
        /* Make lists for this (sub-)cell on atoms with and without LJ */
        gmx_bool haveQ = FALSE;

        for (int a = s; a < std::min(s+na_c, a1); a++)
            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);

            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
                sort1[n1++] = order[a];
                sort2[n2++] = order[a];

        /* If we don't have atoms with LJ, there's nothing to sort */
            *flags |= NBNXN_CI_DO_LJ(subc);

                /* Only sort when strictly necessary. Ordering particles
                 * can lead to less accurate summation due to rounding,
                 * both for LJ and Coulomb interactions.
                 */
                if (2*(a_lj_max - s) >= na_c)
                    for (int i = 0; i < n1; i++)
                        order[a0+i] = sort1[i];
                    for (int j = 0; j < n2; j++)
                        order[a0+n1+j] = sort2[j];

                    *flags |= NBNXN_CI_HALF_LJ(subc);
            *flags |= NBNXN_CI_DO_COUL(subc);
/* Fill a pair search cell with atoms.
 * Potentially sorts atoms and sets the interaction flags.
 */
static void fill_cell(const nbnxn_search_t nbs,
                      nbnxn_atomdata_t *nbat,
                      int sx, int sy, int sz,
                      nbnxn_bb_t gmx_unused *bb_work_aligned)
        /* Note that non-local grids are already sorted.
         * Then sort_cluster_on_flag will only set the flags and the sorting
         * will not affect the atom order.
         */
        sort_cluster_on_flag(grid->na_c, a0, a1, atinfo, nbs->a,
                             grid->flags+(a0>>grid->na_c_2log)-grid->cell0);

        /* Set the fep flag for perturbed atoms in this (sub-)cell */

        /* The grid-local cluster/(sub-)cell index */
        c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : GPU_NSUBCELL);
        for (int at = a0; at < a1; at++)
            if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]]))
                grid->fep[c] |= (1 << (at - a0));

    /* Now we have sorted the atoms, set the cell indices */
    for (a = a0; a < a1; a++)
        nbs->cell[nbs->a[a]] = a;

    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
                           nbat->XFormat, nbat->x, a0,

    if (nbat->XFormat == nbatX4)
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

#if defined GMX_NBNXN_SIMD && GMX_SIMD_REAL_WIDTH == 2
        if (2*grid->na_cj == grid->na_c)
            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
    else if (nbat->XFormat == nbatX8)
        /* Store the bounding boxes as xyz.xyz. */
        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
        bb_ptr = grid->bb + offset;

        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
    else if (!grid->bSimple)
        /* Store the bounding boxes in a format convenient
         * for SIMD4 calculations: xxxxyyyyzzzz...
         */
            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));

#ifdef NBNXN_SEARCH_SIMD4_FLOAT_X_BB
        if (nbat->XFormat == nbatXYZQ)
            calc_bounding_box_xxxx_simd4(na, nbat->x+a0*nbat->xstride,
                                         bb_work_aligned, pbb_ptr);
            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,

            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);

        /* Store the bounding boxes as xyz.xyz. */
        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);

        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,

            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
                    grid->bb[bbo].lower[BB_X],
                    grid->bb[bbo].lower[BB_Y],
                    grid->bb[bbo].lower[BB_Z],
                    grid->bb[bbo].upper[BB_X],
                    grid->bb[bbo].upper[BB_Y],
                    grid->bb[bbo].upper[BB_Z]);
/* Spatially sort the atoms within one grid column */
static void sort_columns_simple(const nbnxn_search_t nbs,
                                nbnxn_atomdata_t *nbat,
                                int cxy_start, int cxy_end,
        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
        int cx = cxy/grid->ncy;
        int cy = cxy - cx*grid->ncy;

        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   1.0/grid->size[ZZ], ncz*grid->na_sc,

        /* Fill the ncz cells in this column */
        cfilled = grid->cxy_ind[cxy];
        for (int cz = 0; cz < ncz; cz++)
            c = grid->cxy_ind[cxy] + cz;

            int ash_c = ash + cz*grid->na_sc;
            int na_c  = std::min(grid->na_sc, na-(ash_c-ash));

            fill_cell(nbs, grid, nbat,
                      ash_c, ash_c+na_c, atinfo, x,
                      grid->na_sc*cx + (dd_zone >> 2),
                      grid->na_sc*cy + (dd_zone & 3),

            /* This copy to bbcz is not really necessary.
             * But it allows us to use the same grid search code
             * for the simple and supersub cell setups.
             */
            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
/* Spatially sort the atoms within one grid column */
static void sort_columns_supersub(const nbnxn_search_t nbs,
                                  nbnxn_atomdata_t *nbat,
                                  int cxy_start, int cxy_end,
    nbnxn_bb_t bb_work_array[2], *bb_work_aligned;

    bb_work_aligned = reinterpret_cast<nbnxn_bb_t *>((reinterpret_cast<std::size_t>(bb_work_array+1)) & (~(static_cast<std::size_t>(15))));

        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
                grid->cell0, cxy_start, cxy_end, a0, a1);

    int subdiv_x = grid->na_c;
    int subdiv_y = GPU_NSUBCELL_X*subdiv_x;
    int subdiv_z = GPU_NSUBCELL_Y*subdiv_y;

    /* Sort the atoms within each x,y column in 3 dimensions */
    for (int cxy = cxy_start; cxy < cxy_end; cxy++)
        int cx = cxy/grid->ncy;
        int cy = cxy - cx*grid->ncy;

        int na  = grid->cxy_na[cxy];
        int ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
        int ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;

        /* Sort the atoms within each x,y column on z coordinate */
        sort_atoms(ZZ, FALSE, dd_zone,
                   1.0/grid->size[ZZ], ncz*grid->na_sc,

        /* This loop goes over the supercells and subcells along z at once */
        for (int sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
            int ash_z = ash + sub_z*subdiv_z;
            int na_z  = std::min(subdiv_z, na-(ash_z-ash));

            /* We have already sorted on z */

            if (sub_z % GPU_NSUBCELL_Z == 0)
                cz = sub_z/GPU_NSUBCELL_Z;
                int c  = grid->cxy_ind[cxy] + cz;

                /* The number of atoms in this supercell */
                int na_c = std::min(grid->na_sc, na-(ash_z-ash));

                grid->nsubc[c] = std::min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);

                /* Store the z-boundaries of the super cell */
                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];

#if GPU_NSUBCELL_Y > 1
            /* Sort the atoms along y */
            sort_atoms(YY, (sub_z & 1), dd_zone,
                       nbs->a+ash_z, na_z, x,
                       grid->c0[YY]+cy*grid->sy,
                       grid->inv_sy, subdiv_z,

            for (int sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
                int ash_y = ash_z + sub_y*subdiv_y;
                int na_y  = std::min(subdiv_y, na-(ash_y-ash));

#if GPU_NSUBCELL_X > 1
                /* Sort the atoms along x */
                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1), dd_zone,
                           nbs->a+ash_y, na_y, x,
                           grid->c0[XX]+cx*grid->sx,
                           grid->inv_sx, subdiv_y,

                for (int sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
                    int ash_x = ash_y + sub_x*subdiv_x;
                    int na_x  = std::min(subdiv_x, na-(ash_x-ash));

                    fill_cell(nbs, grid, nbat,
                              ash_x, ash_x+na_x, atinfo, x,
                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),

        /* Set the unused atom indices to -1 */
        for (int ind = na; ind < ncz*grid->na_sc; ind++)
            nbs->a[ash+ind] = -1;
/* Determine in which grid column atoms should go */
static void calc_column_indices(nbnxn_grid_t *grid,
                                int dd_zone, const int *move,
                                int thread, int nthread,
    /* We add one extra cell for particles which moved during DD */
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)

    int n0 = a0 + static_cast<int>((thread+0)*(a1 - a0))/nthread;
    int n1 = a0 + static_cast<int>((thread+1)*(a1 - a0))/nthread;

        for (int i = n0; i < n1; i++)
            if (move == NULL || move[i] >= 0)
                /* We need to be careful with rounding,
                 * particles might be a few bits outside the local zone.
                 * The int cast takes care of the lower bound,
                 * we will explicitly take care of the upper bound.
                 */
                int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
                int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

                if (cx < 0 || cx > grid->ncx ||
                    cy < 0 || cy > grid->ncy)
                              "grid cell cx %d cy %d out of range (max %d %d)\n"
                              "atom %f %f %f, grid->c0 %f %f",
                              cx, cy, grid->ncx, grid->ncy,
                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);

                /* Take care of potential rounding issues */
                cx = std::min(cx, grid->ncx - 1);
                cy = std::min(cy, grid->ncy - 1);

                /* For the moment cell will contain only the, grid local,
                 * x and y indices, not z.
                 */
                cell[i] = cx*grid->ncy + cy;

                /* Put this moved particle after the end of the grid,
                 * so we can process it later without using conditionals.
                 */
                cell[i] = grid->ncx*grid->ncy;

        for (int i = n0; i < n1; i++)
            int cx = static_cast<int>((x[i][XX] - grid->c0[XX])*grid->inv_sx);
            int cy = static_cast<int>((x[i][YY] - grid->c0[YY])*grid->inv_sy);

            /* For non-home zones there could be particles outside
             * the non-bonded cut-off range, which have been communicated
             * for bonded interactions only. For the result it doesn't
             * matter where these end up on the grid. For performance
             * we put them in an extra row at the border.
             */
            cx = std::max(cx, 0);
            cx = std::min(cx, grid->ncx - 1);
            cy = std::max(cy, 0);
            cy = std::min(cy, grid->ncy - 1);

            /* For the moment cell will contain only the, grid local,
             * x and y indices, not z.
             */
            cell[i] = cx*grid->ncy + cy;
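/* Illustrative sketch (editorial addition, not part of the original file):
 * the x,y -> grid-column mapping used above, in isolation. The helper name
 * and parameters are hypothetical; the real code additionally routes moved
 * particles to an extra cell and treats non-home zones separately.
 */
#if 0
static int column_index_sketch(real x, real y,
                               real c0x, real c0y,       /* grid corner  */
                               real inv_sx, real inv_sy, /* 1/cell size  */
                               int ncx, int ncy)
{
    int cx = static_cast<int>((x - c0x)*inv_sx);
    int cy = static_cast<int>((y - c0y)*inv_sy);

    /* Clamp against rounding just outside the grid, as done above */
    cx = std::max(0, std::min(cx, ncx - 1));
    cy = std::max(0, std::min(cy, ncy - 1));

    /* Columns are laid out with y running fastest */
    return cx*ncy + cy;
}
#endif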
/* Determine in which grid cells the atoms should go */
static void calc_cell_indices(const nbnxn_search_t nbs,
                              nbnxn_atomdata_t *nbat)
    int cx, cy, cxy, ncz_max, ncz;

    nthread = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
            calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
                                nbs->cell, nbs->work[thread].cxy_na);
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    /* Make the cell index as a function of x and y */
    grid->cxy_ind[0] = 0;
    for (int i = 0; i < grid->ncx*grid->ncy+1; i++)
        /* We set ncz_max at the beginning of the loop instead of at the end
         * to skip i=grid->ncx*grid->ncy which are moved particles
         * that do not need to be ordered on the grid.
         */
        cxy_na_i = nbs->work[0].cxy_na[i];
        for (int thread = 1; thread < nthread; thread++)
            cxy_na_i += nbs->work[thread].cxy_na[i];

        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
        if (nbat->XFormat == nbatX8)
            /* Make the number of cells a multiple of 2 */
            ncz = (ncz + 1) & ~1;
        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
        /* Clear cxy_na, so we can reuse the array below */
        grid->cxy_na[i] = 0;

    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];

    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;

        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
                grid->na_sc, grid->na_c, grid->nc,
                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),

        for (cy = 0; cy < grid->ncy; cy++)
            for (cx = 0; cx < grid->ncx; cx++)
                fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
            fprintf(debug, "\n");
    /* Make sure the work array for sorting is large enough */
    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
        for (int thread = 0; thread < nbs->nthread_max; thread++)
            nbs->work[thread].sort_work_nalloc =
                over_alloc_large(ncz_max*grid->na_sc*SGSF);
            srenew(nbs->work[thread].sort_work,
                   nbs->work[thread].sort_work_nalloc);
            /* When not in use, all elements should be -1 */
            for (int i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
                nbs->work[thread].sort_work[i] = -1;

    /* Now we know the dimensions we can fill the grid.
     * This is the first, unsorted fill. We sort the columns after this.
     */
    for (int i = a0; i < a1; i++)
        /* At this point nbs->cell contains the local grid x,y indices */
        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;

        /* Set the cell indices for the moved particles */
        n0 = grid->nc*grid->na_sc;
        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
            for (int i = n0; i < n1; i++)
                nbs->cell[nbs->a[i]] = i;

    /* Sort the super-cell columns along z into the sub-cells. */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (int thread = 0; thread < nthread; thread++)
                sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                    ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                    ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                    nbs->work[thread].sort_work);
                sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
                                      ((thread+0)*grid->ncx*grid->ncy)/nthread,
                                      ((thread+1)*grid->ncx*grid->ncy)/nthread,
                                      nbs->work[thread].sort_work);
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid, grid->bb);

        grid->nsubc_tot = 0;
        for (int i = 0; i < grid->nc; i++)
            grid->nsubc_tot += grid->nsubc[i];

            print_bbsizes_simple(debug, grid);

            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);

            print_bbsizes_supersub(debug, grid);
static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
    if (flags->nflag > flags->flag_nalloc)
        flags->flag_nalloc = over_alloc_large(flags->nflag);
        srenew(flags->flag, flags->flag_nalloc);
    for (int b = 0; b < flags->nflag; b++)
        bitmask_clear(&(flags->flag[b]));
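/* Editorial note (illustrative, not part of the original file): the flag
 * count above is a ceiling division, one flag per block of
 * NBNXN_BUFFERFLAG_SIZE atoms. For example, assuming a block size of 64:
 *
 *   int natoms = 1000;
 *   int nflag  = (natoms + 64 - 1)/64;   // = 16, the last block only partly used
 */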
/* Sets up a grid and puts the atoms on the grid.
 * This function only operates on one domain of the domain decomposition.
 * Note that without domain decomposition there is only one domain.
 */
void nbnxn_put_on_grid(nbnxn_search_t nbs,
                       int ePBC, matrix box,
                       rvec corner0, rvec corner1,
                       int nmoved, int *move,
                       nbnxn_atomdata_t *nbat)
    int nc_max_grid, nc_max;

    grid = &nbs->grid[dd_zone];

    nbs_cycle_start(&nbs->cc[enbsCCgrid]);

    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);

    grid->na_c      = nbnxn_kernel_to_cluster_i_size(nb_kernel_type);
    grid->na_cj     = nbnxn_kernel_to_cluster_j_size(nb_kernel_type);
    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
    grid->na_c_2log = get_2log(grid->na_c);

    nbat->na_c = grid->na_c;

            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
            nbs->grid[dd_zone-1].na_sc/grid->na_sc;

        copy_mat(box, nbs->box);

        /* Avoid zero density */
        if (atom_density > 0)
            grid->atom_density = atom_density;
            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);

        nbs->natoms_local    = a1 - nmoved;
        /* We assume that nbnxn_put_on_grid is called first
         * for the local atoms (dd_zone=0).
         */
        nbs->natoms_nonlocal = a1 - nmoved;

            fprintf(debug, "natoms_local = %5d atom_density = %5.1f\n",
                    nbs->natoms_local, grid->atom_density);

        nbs->natoms_nonlocal = std::max(nbs->natoms_nonlocal, a1);

    /* We always use the home zone (grid[0]) for setting the cell size,
     * since determining densities for non-local zones is difficult.
     */
    nc_max_grid = set_grid_size_xy(nbs, grid,
                                   dd_zone, n-nmoved, corner0, corner1,
                                   nbs->grid[0].atom_density);

    nc_max = grid->cell0 + nc_max_grid;

    if (a1 > nbs->cell_nalloc)
        nbs->cell_nalloc = over_alloc_large(a1);
        srenew(nbs->cell, nbs->cell_nalloc);

    /* To avoid conditionals we store the moved particles at the end of a,
     * make sure we have enough space.
     */
    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
        srenew(nbs->a, nbs->a_nalloc);

    /* We need padding up to a multiple of the buffer flag size: simply add */
    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);

    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);

        nbat->natoms_local = nbat->natoms;

    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
/* Calls nbnxn_put_on_grid for all non-local domains */
void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
                                const struct gmx_domdec_zones_t *zones,
                                nbnxn_atomdata_t *nbat)
    for (int zone = 1; zone < zones->n; zone++)
        for (int d = 0; d < DIM; d++)
            c0[d] = zones->size[zone].bb_x0[d];
            c1[d] = zones->size[zone].bb_x1[d];

        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
                          zones->cg_range[zone],
                          zones->cg_range[zone+1],
/* Add simple grid type information to the local super/sub grid */
void nbnxn_grid_add_simple(nbnxn_search_t nbs,
                           nbnxn_atomdata_t *nbat)
    grid = &nbs->grid[0];

        gmx_incons("nbnxn_grid_simple called with a simple grid");

    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;

    if (grid->nc*ncd > grid->nc_nalloc_simple)
        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
        srenew(grid->bb_simple, grid->nc_nalloc_simple);
        srenew(grid->flags_simple, grid->nc_nalloc_simple);

            sfree_aligned(grid->bbj);
            snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);

    bbcz = grid->bbcz_simple;
    bb   = grid->bb_simple;

#if (defined GMX_OPENMP) && !(defined __clang_analyzer__)
    // cppcheck-suppress unreadVariable
    int nthreads = gmx_omp_nthreads_get(emntPairsearch);

#pragma omp parallel for num_threads(nthreads) schedule(static)
    for (int sc = 0; sc < grid->nc; sc++)
            for (int c = 0; c < ncd; c++)
                int tx = sc*ncd + c;
                int na = NBNXN_CPU_CLUSTER_I_SIZE;
                       nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)

                    switch (nbat->XFormat)
                            /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
                            calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
                            /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
                            calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
                            calc_bounding_box(na, nbat->xstride,
                                              nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,

                    bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
                    bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];

                    /* No interaction optimization yet here */
                    grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
                    grid->flags_simple[tx] = 0;
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;

    if (grid->bSimple && nbat->XFormat == nbatX8)
        combine_bounding_box_pairs(grid, grid->bb_simple);
void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
    *ncx = nbs->grid[0].ncx;
    *ncy = nbs->grid[0].ncy;

void nbnxn_get_atomorder(const nbnxn_search_t nbs, const int **a, int *n)
    const nbnxn_grid_t *grid;

    grid = &nbs->grid[0];

    /* Return the atom order for the home cell (index 0) */
    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;

void nbnxn_set_atomorder(nbnxn_search_t nbs)
    /* Set the atom order for the home cell (index 0) */
    nbnxn_grid_t *grid = &nbs->grid[0];

    for (int cx = 0; cx < grid->ncx; cx++)
        for (int cy = 0; cy < grid->ncy; cy++)
            int cxy = cx*grid->ncy + cy;
            int j   = grid->cxy_ind[cxy]*grid->na_sc;
            for (int cz = 0; cz < grid->cxy_na[cxy]; cz++)