src/gromacs/nbnxm/grid.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  *
  38  * \brief
  39  * Implements the Grid class.
  40  *
  41  * \author Berk Hess <hess@kth.se>
  42  * \ingroup module_nbnxm
  43  */
  44
  45 #include "gmxpre.h"
  46
  47 #include "grid.h"
  48
  49 #include <cmath>
  50 #include <cstring>
  51
  52 #include <algorithm>
  53
  54 #include "gromacs/math/utilities.h"
  55 #include "gromacs/math/vec.h"
  56 #include "gromacs/mdlib/gmx_omp_nthreads.h"
  57 #include "gromacs/mdlib/updategroupscog.h"
  58 #include "gromacs/mdtypes/forcerec.h" // only for GET_CGINFO_*
  59 #include "gromacs/nbnxm/atomdata.h"
  60 #include "gromacs/nbnxm/nbnxm_geometry.h"
  61 #include "gromacs/simd/simd.h"
  62 #include "gromacs/simd/vector_operations.h"
  63
  64 #include "gridsetdata.h"
  65 #include "pairlistparams.h"
  66
  67 namespace Nbnxm
  68 {
  69
  70 Grid::Geometry::Geometry(const PairlistType pairlistType) :
  71     isSimple(pairlistType != PairlistType::HierarchicalNxN),
  72     numAtomsICluster(IClusterSizePerListType[pairlistType]),
  73     numAtomsJCluster(JClusterSizePerListType[pairlistType]),
  74     numAtomsPerCell((isSimple ? 1 : c_gpuNumClusterPerCell)*numAtomsICluster),
  75     numAtomsICluster2Log(get_2log(numAtomsICluster))
  76 {
  77 }
  78
  79 Grid::Grid(const PairlistType  pairlistType,
  80            const bool         &haveFep) :
  81     geometry_(pairlistType),
  82     haveFep_(haveFep)
  83 {
  84 }
  85
  86 /*! \brief Returns the atom density (> 0) of a rectangular grid */
  87 static real gridAtomDensity(int        numAtoms,
  88                             const rvec lowerCorner,
  89                             const rvec upperCorner)
  90 {
  91     rvec size;
  92
  93     if (numAtoms == 0)
  94     {
  95         /* To avoid zero density we use a minimum of 1 atom */
  96         numAtoms = 1;
  97     }
  98
  99     rvec_sub(upperCorner, lowerCorner, size);
 100
 101     return numAtoms/(size[XX]*size[YY]*size[ZZ]);
 102 }
 103
 104 void Grid::setDimensions(const int            ddZone,
 105                          const int            numAtoms,
 106                          const rvec           lowerCorner,
 107                          const rvec           upperCorner,
 108                          real                 atomDensity,
 109                          const real           maxAtomGroupRadius,
 110                          const bool           haveFep,
 111                          gmx::PinningPolicy   pinningPolicy)
 112 {
 113     /* For the home zone we compute the density when not set (=-1) or when =0 */
 114     if (ddZone == 0 && atomDensity <= 0)
 115     {
 116         atomDensity = gridAtomDensity(numAtoms, lowerCorner, upperCorner);
 117     }
 118
 119     dimensions_.atomDensity        = atomDensity;
 120     dimensions_.maxAtomGroupRadius = maxAtomGroupRadius;
 121
 122     rvec size;
 123     rvec_sub(upperCorner, lowerCorner, size);
 124
 125     if (numAtoms > geometry_.numAtomsPerCell)
 126     {
 127         GMX_ASSERT(atomDensity > 0, "With one or more atoms, the density should be positive");
 128
 129         /* target cell length */
 130         real tlen_x;
 131         real tlen_y;
 132         if (geometry_.isSimple)
 133         {
 134             /* To minimize the zero interactions, we should make
 135              * the largest of the i/j cell cubic.
 136              */
 137             int numAtomsInCell = std::max(geometry_.numAtomsICluster,
 138                                           geometry_.numAtomsJCluster);
 139
 140             /* Approximately cubic cells */
 141             real tlen = std::cbrt(numAtomsInCell/atomDensity);
 142             tlen_x    = tlen;
 143             tlen_y    = tlen;
 144         }
 145         else
 146         {
 147             /* Approximately cubic sub cells */
 148             real tlen = std::cbrt(geometry_.numAtomsICluster/atomDensity);
 149             tlen_x    = tlen*c_gpuNumClusterPerCellX;
 150             tlen_y    = tlen*c_gpuNumClusterPerCellY;
 151         }
 152         /* We round ncx and ncy down, because we get less cell pairs
 153          * in the nbsist when the fixed cell dimensions (x,y) are
 154          * larger than the variable one (z) than the other way around.
 155          */
 156         dimensions_.numCells[XX] = std::max(1, static_cast<int>(size[XX]/tlen_x));
 157         dimensions_.numCells[YY] = std::max(1, static_cast<int>(size[YY]/tlen_y));
 158     }
 159     else
 160     {
 161         dimensions_.numCells[XX] = 1;
 162         dimensions_.numCells[YY] = 1;
 163     }
 164
 165     for (int d = 0; d < DIM - 1; d++)
 166     {
 167         dimensions_.cellSize[d]    = size[d]/dimensions_.numCells[d];
 168         dimensions_.invCellSize[d] = 1/dimensions_.cellSize[d];
 169     }
 170
 171     if (ddZone > 0)
 172     {
 173         /* This is a non-home zone, add an extra row of cells
 174          * for particles communicated for bonded interactions.
 175          * These can be beyond the cut-off. It doesn't matter where
 176          * they end up on the grid, but for performance it's better
 177          * if they don't end up in cells that can be within cut-off range.
 178          */
 179         dimensions_.numCells[XX]++;
 180         dimensions_.numCells[YY]++;
 181     }
 182
 183     /* We need one additional cell entry for particles moved by DD */
 184     cxy_na_.resize(numColumns() + 1);
 185     cxy_ind_.resize(numColumns() + 2);
 186     changePinningPolicy(&cxy_na_, pinningPolicy);
 187     changePinningPolicy(&cxy_ind_, pinningPolicy);
 188
 189     /* Worst case scenario of 1 atom in each last cell */
 190     int maxNumCells;
 191     if (geometry_.numAtomsJCluster <= geometry_.numAtomsICluster)
 192     {
 193         maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns();
 194     }
 195     else
 196     {
 197         maxNumCells = numAtoms/geometry_.numAtomsPerCell + numColumns()*geometry_.numAtomsJCluster/geometry_.numAtomsICluster;
 198     }
 199
 200     if (!geometry_.isSimple)
 201     {
 202         numClusters_.resize(maxNumCells);
 203     }
 204     bbcz_.resize(maxNumCells);
 205
 206     /* This resize also zeros the contents, this avoid possible
 207      * floating exceptions in SIMD with the unused bb elements.
 208      */
 209     if (geometry_.isSimple)
 210     {
 211         bb_.resize(maxNumCells);
 212     }
 213     else
 214     {
 215 #if NBNXN_BBXXXX
 216         pbb_.resize(packedBoundingBoxesIndex(maxNumCells*c_gpuNumClusterPerCell));
 217 #else
 218         bb_.resize(maxNumCells*c_gpuNumClusterPerCell);
 219 #endif
 220     }
 221
 222     if (geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 223     {
 224         bbj_ = bb_;
 225     }
 226     else
 227     {
 228         GMX_ASSERT(geometry_.isSimple, "Only CPU lists should have different i/j cluster sizes");
 229
 230         bbjStorage_.resize(maxNumCells*geometry_.numAtomsICluster/geometry_.numAtomsJCluster);
 231         bbj_ = bbjStorage_;
 232     }
 233
 234     flags_.resize(maxNumCells);
 235     if (haveFep)
 236     {
 237         fep_.resize(maxNumCells*geometry_.numAtomsPerCell/geometry_.numAtomsICluster);
 238     }
 239
 240     copy_rvec(lowerCorner, dimensions_.lowerCorner);
 241     copy_rvec(upperCorner, dimensions_.upperCorner);
 242     copy_rvec(size,        dimensions_.gridSize);
 243 }
 244
 245 /* We need to sort paricles in grid columns on z-coordinate.
 246  * As particle are very often distributed homogeneously, we use a sorting
 247  * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 248  * by a factor, cast to an int and try to store in that hole. If the hole
 249  * is full, we move this or another particle. A second pass is needed to make
 250  * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 251  * 4 is the optimal value for homogeneous particle distribution and allows
 252  * for an O(#particles) sort up till distributions were all particles are
 253  * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 254  * as it can be expensive to detect imhomogeneous particle distributions.
 255  */
 256 /*! \brief Ratio of grid cells to atoms */
 257 static constexpr int c_sortGridRatio         = 4;
 258 /*! \brief Maximum ratio of holes used, in the worst case all particles
 259  * end up in the last hole and we need num. atoms extra holes at the end.
 260  */
 261 static constexpr int c_sortGridMaxSizeFactor = c_sortGridRatio + 1;
 262
 263 /*! \brief Sorts particle index a on coordinates x along dim.
 264  *
 265  * Backwards tells if we want decreasing iso increasing coordinates.
 266  * h0 is the minimum of the coordinate range.
 267  * invh is the 1/length of the sorting range.
 268  * n_per_h (>=n) is the expected average number of particles per 1/invh
 269  * sort is the sorting work array.
 270  * sort should have a size of at least n_per_h*c_sortGridRatio + n,
 271  * or easier, allocate at least n*c_sortGridMaxSizeFactor elements.
 272  */
 273 static void sort_atoms(int dim, gmx_bool Backwards,
 274                        int gmx_unused dd_zone,
 275                        bool gmx_unused relevantAtomsAreWithinGridBounds,
 276                        int *a, int n,
 277                        gmx::ArrayRef<const gmx::RVec> x,
 278                        real h0, real invh, int n_per_h,
 279                        gmx::ArrayRef<int> sort)
 280 {
 281     if (n <= 1)
 282     {
 283         /* Nothing to do */
 284         return;
 285     }
 286
 287     GMX_ASSERT(n <= n_per_h, "We require n <= n_per_h");
 288
 289     /* Transform the inverse range height into the inverse hole height */
 290     invh *= n_per_h*c_sortGridRatio;
 291
 292     /* Set nsort to the maximum possible number of holes used.
 293      * In worst case all n elements end up in the last bin.
 294      */
 295     int nsort = n_per_h*c_sortGridRatio + n;
 296
 297     /* Determine the index range used, so we can limit it for the second pass */
 298     int zi_min = INT_MAX;
 299     int zi_max = -1;
 300
 301     /* Sort the particles using a simple index sort */
 302     for (int i = 0; i < n; i++)
 303     {
 304         /* The cast takes care of float-point rounding effects below zero.
 305          * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 306          * times the box height out of the box.
 307          */
 308         int zi = static_cast<int>((x[a[i]][dim] - h0)*invh);
 309
 310 #ifndef NDEBUG
 311         /* As we can have rounding effect, we use > iso >= here */
 312         if (relevantAtomsAreWithinGridBounds &&
 313             (zi < 0 || (dd_zone == 0 && zi > n_per_h*c_sortGridRatio)))
 314         {
 315             gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 316                       a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
 317                       n_per_h, c_sortGridRatio);
 318         }
 319 #endif
 320         if (zi < 0)
 321         {
 322             zi = 0;
 323         }
 324
 325         /* In a non-local domain, particles communcated for bonded interactions
 326          * can be far beyond the grid size, which is set by the non-bonded
 327          * cut-off distance. We sort such particles into the last cell.
 328          */
 329         if (zi > n_per_h*c_sortGridRatio)
 330         {
 331             zi = n_per_h*c_sortGridRatio;
 332         }
 333
 334         /* Ideally this particle should go in sort cell zi,
 335          * but that might already be in use,
 336          * in that case find the first empty cell higher up
 337          */
 338         if (sort[zi] < 0)
 339         {
 340             sort[zi] = a[i];
 341             zi_min   = std::min(zi_min, zi);
 342             zi_max   = std::max(zi_max, zi);
 343         }
 344         else
 345         {
 346             /* We have multiple atoms in the same sorting slot.
 347              * Sort on real z for minimal bounding box size.
 348              * There is an extra check for identical z to ensure
 349              * well-defined output order, independent of input order
 350              * to ensure binary reproducibility after restarts.
 351              */
 352             while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 353                                       (x[a[i]][dim] == x[sort[zi]][dim] &&
 354                                        a[i] > sort[zi])))
 355             {
 356                 zi++;
 357             }
 358
 359             if (sort[zi] >= 0)
 360             {
 361                 /* Shift all elements by one slot until we find an empty slot */
 362                 int cp  = sort[zi];
 363                 int zim = zi + 1;
 364                 while (sort[zim] >= 0)
 365                 {
 366                     int tmp   = sort[zim];
 367                     sort[zim] = cp;
 368                     cp        = tmp;
 369                     zim++;
 370                 }
 371                 sort[zim] = cp;
 372                 zi_max    = std::max(zi_max, zim);
 373             }
 374             sort[zi] = a[i];
 375             zi_max   = std::max(zi_max, zi);
 376         }
 377     }
 378
 379     int c = 0;
 380     if (!Backwards)
 381     {
 382         for (int zi = 0; zi < nsort; zi++)
 383         {
 384             if (sort[zi] >= 0)
 385             {
 386                 a[c++]   = sort[zi];
 387                 sort[zi] = -1;
 388             }
 389         }
 390     }
 391     else
 392     {
 393         for (int zi = zi_max; zi >= zi_min; zi--)
 394         {
 395             if (sort[zi] >= 0)
 396             {
 397                 a[c++]   = sort[zi];
 398                 sort[zi] = -1;
 399             }
 400         }
 401     }
 402     if (c < n)
 403     {
 404         gmx_incons("Lost particles while sorting");
 405     }
 406 }
 407
 408 #if GMX_DOUBLE
 409 //! Returns double up to one least significant float bit smaller than x
 410 static double R2F_D(const float x)
 411 {
 412     return static_cast<float>(x >= 0 ? (1 - GMX_FLOAT_EPS)*x : (1 + GMX_FLOAT_EPS)*x);
 413 }
 414 //! Returns double up to one least significant float bit larger than x
 415 static double R2F_U(const float x)
 416 {
 417     return static_cast<float>(x >= 0 ? (1 + GMX_FLOAT_EPS)*x : (1 - GMX_FLOAT_EPS)*x);
 418 }
 419 #else
 420 //! Returns x
 421 static float R2F_D(const float x)
 422 {
 423     return x;
 424 }
 425 //! Returns x
 426 static float R2F_U(const float x)
 427 {
 428     return x;
 429 }
 430 #endif
 431
 432 //! Computes the bounding box for na coordinates in order x,y,z, bb order xyz0
 433 static void calc_bounding_box(int na, int stride, const real *x,
 434                               BoundingBox *bb)
 435 {
 436     int  i;
 437     real xl, xh, yl, yh, zl, zh;
 438
 439     i  = 0;
 440     xl = x[i+XX];
 441     xh = x[i+XX];
 442     yl = x[i+YY];
 443     yh = x[i+YY];
 444     zl = x[i+ZZ];
 445     zh = x[i+ZZ];
 446     i += stride;
 447     for (int j = 1; j < na; j++)
 448     {
 449         xl = std::min(xl, x[i+XX]);
 450         xh = std::max(xh, x[i+XX]);
 451         yl = std::min(yl, x[i+YY]);
 452         yh = std::max(yh, x[i+YY]);
 453         zl = std::min(zl, x[i+ZZ]);
 454         zh = std::max(zh, x[i+ZZ]);
 455         i += stride;
 456     }
 457     /* Note: possible double to float conversion here */
 458     bb->lower.x = R2F_D(xl);
 459     bb->lower.y = R2F_D(yl);
 460     bb->lower.z = R2F_D(zl);
 461     bb->upper.x = R2F_U(xh);
 462     bb->upper.y = R2F_U(yh);
 463     bb->upper.z = R2F_U(zh);
 464 }
 465
 466 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 467 static void calc_bounding_box_x_x4(int na, const real *x,
 468                                    BoundingBox *bb)
 469 {
 470     real xl, xh, yl, yh, zl, zh;
 471
 472     xl = x[XX*c_packX4];
 473     xh = x[XX*c_packX4];
 474     yl = x[YY*c_packX4];
 475     yh = x[YY*c_packX4];
 476     zl = x[ZZ*c_packX4];
 477     zh = x[ZZ*c_packX4];
 478     for (int j = 1; j < na; j++)
 479     {
 480         xl = std::min(xl, x[j+XX*c_packX4]);
 481         xh = std::max(xh, x[j+XX*c_packX4]);
 482         yl = std::min(yl, x[j+YY*c_packX4]);
 483         yh = std::max(yh, x[j+YY*c_packX4]);
 484         zl = std::min(zl, x[j+ZZ*c_packX4]);
 485         zh = std::max(zh, x[j+ZZ*c_packX4]);
 486     }
 487     /* Note: possible double to float conversion here */
 488     bb->lower.x = R2F_D(xl);
 489     bb->lower.y = R2F_D(yl);
 490     bb->lower.z = R2F_D(zl);
 491     bb->upper.x = R2F_U(xh);
 492     bb->upper.y = R2F_U(yh);
 493     bb->upper.z = R2F_U(zh);
 494 }
 495
 496 /*! \brief Computes the bounding box for na coordinates, bb order xyz0 */
 497 static void calc_bounding_box_x_x8(int na, const real *x,
 498                                    BoundingBox *bb)
 499 {
 500     real xl, xh, yl, yh, zl, zh;
 501
 502     xl = x[XX*c_packX8];
 503     xh = x[XX*c_packX8];
 504     yl = x[YY*c_packX8];
 505     yh = x[YY*c_packX8];
 506     zl = x[ZZ*c_packX8];
 507     zh = x[ZZ*c_packX8];
 508     for (int j = 1; j < na; j++)
 509     {
 510         xl = std::min(xl, x[j+XX*c_packX8]);
 511         xh = std::max(xh, x[j+XX*c_packX8]);
 512         yl = std::min(yl, x[j+YY*c_packX8]);
 513         yh = std::max(yh, x[j+YY*c_packX8]);
 514         zl = std::min(zl, x[j+ZZ*c_packX8]);
 515         zh = std::max(zh, x[j+ZZ*c_packX8]);
 516     }
 517     /* Note: possible double to float conversion here */
 518     bb->lower.x = R2F_D(xl);
 519     bb->lower.y = R2F_D(yl);
 520     bb->lower.z = R2F_D(zl);
 521     bb->upper.x = R2F_U(xh);
 522     bb->upper.y = R2F_U(yh);
 523     bb->upper.z = R2F_U(zh);
 524 }
 525
 526 /*! \brief Computes the bounding box for na packed coordinates, bb order xyz0 */
 527 gmx_unused static void calc_bounding_box_x_x4_halves(int na, const real *x,
 528                                                      BoundingBox *bb,
 529                                                      BoundingBox *bbj)
 530 {
 531     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 532     using namespace gmx;
 533
 534     calc_bounding_box_x_x4(std::min(na, 2), x, bbj);
 535
 536     if (na > 2)
 537     {
 538         calc_bounding_box_x_x4(std::min(na-2, 2), x+(c_packX4>>1), bbj+1);
 539     }
 540     else
 541     {
 542         /* Set the "empty" bounding box to the same as the first one,
 543          * so we don't need to treat special cases in the rest of the code.
 544          */
 545 #if NBNXN_SEARCH_BB_SIMD4
 546         store4(bbj[1].lower.ptr(), load4(bbj[0].lower.ptr()));
 547         store4(bbj[1].upper.ptr(), load4(bbj[0].upper.ptr()));
 548 #else
 549         bbj[1] = bbj[0];
 550 #endif
 551     }
 552
 553 #if NBNXN_SEARCH_BB_SIMD4
 554     store4(bb->lower.ptr(),
 555            min(load4(bbj[0].lower.ptr()), load4(bbj[1].lower.ptr())));
 556     store4(bb->upper.ptr(),
 557            max(load4(bbj[0].upper.ptr()), load4(bbj[1].upper.ptr())));
 558 #else
 559     {
 560         bb->lower = BoundingBox::Corner::min(bbj[0].lower,
 561                                              bbj[1].lower);
 562         bb->upper = BoundingBox::Corner::max(bbj[0].upper,
 563                                              bbj[1].upper);
 564     }
 565 #endif
 566 }
 567
 568 #if NBNXN_BBXXXX
 569
 570 /*! \brief Computes the bounding box for na coordinates in order xyz, bb order xxxxyyyyzzzz */
 571 static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 572 {
 573     int  i;
 574     real xl, xh, yl, yh, zl, zh;
 575
 576     i  = 0;
 577     xl = x[i+XX];
 578     xh = x[i+XX];
 579     yl = x[i+YY];
 580     yh = x[i+YY];
 581     zl = x[i+ZZ];
 582     zh = x[i+ZZ];
 583     i += stride;
 584     for (int j = 1; j < na; j++)
 585     {
 586         xl = std::min(xl, x[i+XX]);
 587         xh = std::max(xh, x[i+XX]);
 588         yl = std::min(yl, x[i+YY]);
 589         yh = std::max(yh, x[i+YY]);
 590         zl = std::min(zl, x[i+ZZ]);
 591         zh = std::max(zh, x[i+ZZ]);
 592         i += stride;
 593     }
 594     /* Note: possible double to float conversion here */
 595     bb[0*c_packedBoundingBoxesDimSize] = R2F_D(xl);
 596     bb[1*c_packedBoundingBoxesDimSize] = R2F_D(yl);
 597     bb[2*c_packedBoundingBoxesDimSize] = R2F_D(zl);
 598     bb[3*c_packedBoundingBoxesDimSize] = R2F_U(xh);
 599     bb[4*c_packedBoundingBoxesDimSize] = R2F_U(yh);
 600     bb[5*c_packedBoundingBoxesDimSize] = R2F_U(zh);
 601 }
 602
 603 #endif /* NBNXN_BBXXXX */
 604
 605 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 606
 607 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xyz0 */
 608 static void calc_bounding_box_simd4(int na, const float *x,
 609                                     BoundingBox *bb)
 610 {
 611     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 612     using namespace gmx;
 613
 614     static_assert(sizeof(BoundingBox::Corner) == GMX_SIMD4_WIDTH*sizeof(float),
 615                   "The Corner struct should hold exactly 4 floats");
 616
 617     Simd4Float bb_0_S = load4(x);
 618     Simd4Float bb_1_S = bb_0_S;
 619
 620     for (int i = 1; i < na; i++)
 621     {
 622         Simd4Float x_S = load4(x + i*GMX_SIMD4_WIDTH);
 623         bb_0_S         = min(bb_0_S, x_S);
 624         bb_1_S         = max(bb_1_S, x_S);
 625     }
 626
 627     store4(bb->lower.ptr(), bb_0_S);
 628     store4(bb->upper.ptr(), bb_1_S);
 629 }
 630
 631 #if NBNXN_BBXXXX
 632
 633 /*! \brief Computes the bounding box for na coordinates in order xyz?, bb order xxxxyyyyzzzz */
 634 static void calc_bounding_box_xxxx_simd4(int na, const float *x,
 635                                          BoundingBox *bb_work_aligned,
 636                                          real *bb)
 637 {
 638     calc_bounding_box_simd4(na, x, bb_work_aligned);
 639
 640     bb[0*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.x;
 641     bb[1*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.y;
 642     bb[2*c_packedBoundingBoxesDimSize] = bb_work_aligned->lower.z;
 643     bb[3*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.x;
 644     bb[4*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.y;
 645     bb[5*c_packedBoundingBoxesDimSize] = bb_work_aligned->upper.z;
 646 }
 647
 648 #endif /* NBNXN_BBXXXX */
 649
 650 #endif /* NBNXN_SEARCH_SIMD4_FLOAT_X_BB */
 651
 652
 653 /*! \brief Combines pairs of consecutive bounding boxes */
 654 static void combine_bounding_box_pairs(const Grid                       &grid,
 655                                        gmx::ArrayRef<const BoundingBox>  bb,
 656                                        gmx::ArrayRef<BoundingBox>        bbj)
 657 {
 658     // TODO: During SIMDv2 transition only some archs use namespace (remove when done)
 659     using namespace gmx;
 660
 661     for (int i = 0; i < grid.numColumns(); i++)
 662     {
 663         /* Starting bb in a column is expected to be 2-aligned */
 664         const int sc2 = grid.firstCellInColumn(i) >> 1;
 665         /* For odd numbers skip the last bb here */
 666         const int nc2 = (grid.numAtomsInColumn(i) + 3) >> (2 + 1);
 667         int       c2;
 668         for (c2 = sc2; c2 < sc2 + nc2; c2++)
 669         {
 670 #if NBNXN_SEARCH_BB_SIMD4
 671             Simd4Float min_S, max_S;
 672
 673             min_S = min(load4(bb[c2*2+0].lower.ptr()),
 674                         load4(bb[c2*2+1].lower.ptr()));
 675             max_S = max(load4(bb[c2*2+0].upper.ptr()),
 676                         load4(bb[c2*2+1].upper.ptr()));
 677             store4(bbj[c2].lower.ptr(), min_S);
 678             store4(bbj[c2].upper.ptr(), max_S);
 679 #else
 680             bbj[c2].lower = BoundingBox::Corner::min(bb[c2*2 + 0].lower,
 681                                                      bb[c2*2 + 1].lower);
 682             bbj[c2].upper = BoundingBox::Corner::max(bb[c2*2 + 0].upper,
 683                                                      bb[c2*2 + 1].upper);
 684 #endif
 685         }
 686         if (((grid.numAtomsInColumn(i) + 3) >> 2) & 1)
 687         {
 688             /* The bb count in this column is odd: duplicate the last bb */
 689             bbj[c2].lower = bb[c2*2].lower;
 690             bbj[c2].upper = bb[c2*2].upper;
 691         }
 692     }
 693 }
 694
 695
 696 /*! \brief Prints the average bb size, used for debug output */
 697 static void print_bbsizes_simple(FILE       *fp,
 698                                  const Grid &grid)
 699 {
 700     dvec ba = { 0 };
 701     for (int c = 0; c < grid.numCells(); c++)
 702     {
 703         const BoundingBox &bb = grid.iBoundingBoxes()[c];
 704         ba[XX] += bb.upper.x - bb.lower.x;
 705         ba[YY] += bb.upper.y - bb.lower.y;
 706         ba[ZZ] += bb.upper.z - bb.lower.z;
 707     }
 708     dsvmul(1.0/grid.numCells(), ba, ba);
 709
 710     const Grid::Dimensions &dims         = grid.dimensions();
 711     real                    avgCellSizeZ = (dims.atomDensity > 0 ? grid.geometry().numAtomsICluster/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]) : 0.0);
 712
 713     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 714             dims.cellSize[XX],
 715             dims.cellSize[YY],
 716             avgCellSizeZ,
 717             ba[XX], ba[YY], ba[ZZ],
 718             ba[XX]*dims.invCellSize[XX],
 719             ba[YY]*dims.invCellSize[YY],
 720             dims.atomDensity > 0 ? ba[ZZ]/avgCellSizeZ : 0.0);
 721 }
 722
 723 /*! \brief Prints the average bb size, used for debug output */
 724 static void print_bbsizes_supersub(FILE       *fp,
 725                                    const Grid &grid)
 726 {
 727     int  ns;
 728     dvec ba;
 729
 730     clear_dvec(ba);
 731     ns = 0;
 732     for (int c = 0; c < grid.numCells(); c++)
 733     {
 734 #if NBNXN_BBXXXX
 735         for (int s = 0; s < grid.numClustersPerCell()[c]; s += c_packedBoundingBoxesDimSize)
 736         {
 737             int  cs_w          = (c*c_gpuNumClusterPerCell + s)/c_packedBoundingBoxesDimSize;
 738             auto boundingBoxes = grid.packedBoundingBoxes().subArray(cs_w*c_packedBoundingBoxesSize, c_packedBoundingBoxesSize);
 739             for (int i = 0; i < c_packedBoundingBoxesDimSize; i++)
 740             {
 741                 for (int d = 0; d < DIM; d++)
 742                 {
 743                     ba[d] +=
 744                         boundingBoxes[(DIM + d)*c_packedBoundingBoxesDimSize + i] -
 745                         boundingBoxes[(0   + d)*c_packedBoundingBoxesDimSize + i];
 746                 }
 747             }
 748         }
 749 #else
 750         for (int s = 0; s < grid.numClustersPerCell()[c]; s++)
 751         {
 752             const BoundingBox &bb = grid.iBoundingBoxes()[c*c_gpuNumClusterPerCell + s];
 753             ba[XX] += bb.upper.x - bb.lower.x;
 754             ba[YY] += bb.upper.y - bb.lower.y;
 755             ba[ZZ] += bb.upper.z - bb.lower.z;
 756         }
 757 #endif
 758         ns += grid.numClustersPerCell()[c];
 759     }
 760     dsvmul(1.0/ns, ba, ba);
 761
 762     const Grid::Dimensions &dims            = grid.dimensions();
 763     const real              avgClusterSizeZ =
 764         (dims.atomDensity > 0 ? grid.geometry().numAtomsPerCell/(dims.atomDensity*dims.cellSize[XX]*dims.cellSize[YY]*c_gpuNumClusterPerCellZ) : 0.0);
 765
 766     fprintf(fp, "ns bb: grid %4.2f %4.2f %4.2f abs %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 767             dims.cellSize[XX]/c_gpuNumClusterPerCellX,
 768             dims.cellSize[YY]/c_gpuNumClusterPerCellY,
 769             avgClusterSizeZ,
 770             ba[XX], ba[YY], ba[ZZ],
 771             ba[XX]*c_gpuNumClusterPerCellX*dims.invCellSize[XX],
 772             ba[YY]*c_gpuNumClusterPerCellY*dims.invCellSize[YY],
 773             dims.atomDensity > 0 ? ba[ZZ]/avgClusterSizeZ : 0.0);
 774 }
 775
 776 /*!\brief Set non-bonded interaction flags for the current cluster.
 777  *
 778  * Sorts atoms on LJ coefficients: !=0 first, ==0 at the end.
 779  */
 780 static void sort_cluster_on_flag(int                 numAtomsInCluster,
 781                                  int                 atomStart,
 782                                  int                 atomEnd,
 783                                  const int          *atinfo,
 784                                  gmx::ArrayRef<int>  order,
 785                                  int                *flags)
 786 {
 787     constexpr int c_maxNumAtomsInCluster = 8;
 788     int           sort1[c_maxNumAtomsInCluster];
 789     int           sort2[c_maxNumAtomsInCluster];
 790
 791     GMX_ASSERT(numAtomsInCluster <= c_maxNumAtomsInCluster, "Need to increase c_maxNumAtomsInCluster to support larger clusters");
 792
 793     *flags = 0;
 794
 795     int subc = 0;
 796     for (int s = atomStart; s < atomEnd; s += numAtomsInCluster)
 797     {
 798         /* Make lists for this (sub-)cell on atoms with and without LJ */
 799         int      n1         = 0;
 800         int      n2         = 0;
 801         gmx_bool haveQ      = FALSE;
 802         int      a_lj_max   = -1;
 803         for (int a = s; a < std::min(s + numAtomsInCluster, atomEnd); a++)
 804         {
 805             haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 806
 807             if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 808             {
 809                 sort1[n1++] = order[a];
 810                 a_lj_max    = a;
 811             }
 812             else
 813             {
 814                 sort2[n2++] = order[a];
 815             }
 816         }
 817
 818         /* If we don't have atoms with LJ, there's nothing to sort */
 819         if (n1 > 0)
 820         {
 821             *flags |= NBNXN_CI_DO_LJ(subc);
 822
 823             if (2*n1 <= numAtomsInCluster)
 824             {
 825                 /* Only sort when strictly necessary. Ordering particles
 826                  * Ordering particles can lead to less accurate summation
 827                  * due to rounding, both for LJ and Coulomb interactions.
 828                  */
 829                 if (2*(a_lj_max - s) >= numAtomsInCluster)
 830                 {
 831                     for (int i = 0; i < n1; i++)
 832                     {
 833                         order[atomStart + i]      = sort1[i];
 834                     }
 835                     for (int j = 0; j < n2; j++)
 836                     {
 837                         order[atomStart + n1 + j] = sort2[j];
 838                     }
 839                 }
 840
 841                 *flags |= NBNXN_CI_HALF_LJ(subc);
 842             }
 843         }
 844         if (haveQ)
 845         {
 846             *flags |= NBNXN_CI_DO_COUL(subc);
 847         }
 848         subc++;
 849     }
 850 }
 851
 852 /*! \brief Fill a pair search cell with atoms.
 853  *
 854  * Potentially sorts atoms and sets the interaction flags.
 855  */
 856 void Grid::fillCell(GridSetData                    *gridSetData,
 857                     nbnxn_atomdata_t               *nbat,
 858                     int                             atomStart,
 859                     int                             atomEnd,
 860                     const int                      *atinfo,
 861                     gmx::ArrayRef<const gmx::RVec>  x,
 862                     BoundingBox gmx_unused         *bb_work_aligned)
 863 {
 864     const int                 numAtoms = atomEnd - atomStart;
 865
 866     const gmx::ArrayRef<int> &cells       = gridSetData->cells;
 867     const gmx::ArrayRef<int> &atomIndices = gridSetData->atomIndices;
 868
 869     if (geometry_.isSimple)
 870     {
 871         /* Note that non-local grids are already sorted.
 872          * Then sort_cluster_on_flag will only set the flags and the sorting
 873          * will not affect the atom order.
 874          */
 875         sort_cluster_on_flag(geometry_.numAtomsICluster, atomStart, atomEnd, atinfo, atomIndices,
 876                              flags_.data() + atomToCluster(atomStart) - cellOffset_);
 877     }
 878
 879     if (haveFep_)
 880     {
 881         /* Set the fep flag for perturbed atoms in this (sub-)cell */
 882
 883         /* The grid-local cluster/(sub-)cell index */
 884         int cell = atomToCluster(atomStart) - cellOffset_*(geometry_.isSimple ? 1 : c_gpuNumClusterPerCell);
 885         fep_[cell] = 0;
 886         for (int at = atomStart; at < atomEnd; at++)
 887         {
 888             if (atomIndices[at] >= 0 && GET_CGINFO_FEP(atinfo[atomIndices[at]]))
 889             {
 890                 fep_[cell] |= (1 << (at - atomStart));
 891             }
 892         }
 893     }
 894
 895     /* Now we have sorted the atoms, set the cell indices */
 896     for (int at = atomStart; at < atomEnd; at++)
 897     {
 898         cells[atomIndices[at]] = at;
 899     }
 900
 901     copy_rvec_to_nbat_real(atomIndices.data() + atomStart, numAtoms, geometry_.numAtomsICluster,
 902                            as_rvec_array(x.data()),
 903                            nbat->XFormat, nbat->x().data(), atomStart);
 904
 905     if (nbat->XFormat == nbatX4)
 906     {
 907         /* Store the bounding boxes as xyz.xyz. */
 908         size_t       offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
 909         BoundingBox *bb_ptr = bb_.data() + offset;
 910
 911 #if GMX_SIMD && GMX_SIMD_REAL_WIDTH == 2
 912         if (2*geometry_.numAtomsJCluster == geometry_.numAtomsICluster)
 913         {
 914             calc_bounding_box_x_x4_halves(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr,
 915                                           bbj_.data() + offset*2);
 916         }
 917         else
 918 #endif
 919         {
 920             calc_bounding_box_x_x4(numAtoms, nbat->x().data() + atom_to_x_index<c_packX4>(atomStart), bb_ptr);
 921         }
 922     }
 923     else if (nbat->XFormat == nbatX8)
 924     {
 925         /* Store the bounding boxes as xyz.xyz. */
 926         size_t       offset = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsICluster);
 927         BoundingBox *bb_ptr = bb_.data() + offset;
 928
 929         calc_bounding_box_x_x8(numAtoms, nbat->x().data() + atom_to_x_index<c_packX8>(atomStart), bb_ptr);
 930     }
 931 #if NBNXN_BBXXXX
 932     else if (!geometry_.isSimple)
 933     {
 934         /* Store the bounding boxes in a format convenient
 935          * for SIMD4 calculations: xxxxyyyyzzzz...
 936          */
 937         const int clusterIndex    = ((atomStart - cellOffset_*geometry_.numAtomsPerCell) >> geometry_.numAtomsICluster2Log);
 938         float    *pbb_ptr         =
 939             pbb_.data() +
 940             packedBoundingBoxesIndex(clusterIndex) +
 941             (clusterIndex & (c_packedBoundingBoxesDimSize - 1));
 942
 943 #if NBNXN_SEARCH_SIMD4_FLOAT_X_BB
 944         if (nbat->XFormat == nbatXYZQ)
 945         {
 946             calc_bounding_box_xxxx_simd4(numAtoms, nbat->x().data() + atomStart*nbat->xstride,
 947                                          bb_work_aligned, pbb_ptr);
 948         }
 949         else
 950 #endif
 951         {
 952             calc_bounding_box_xxxx(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 953                                    pbb_ptr);
 954         }
 955         if (gmx_debug_at)
 956         {
 957             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 958                     atomToCluster(atomStart),
 959                     pbb_ptr[0*c_packedBoundingBoxesDimSize], pbb_ptr[3*c_packedBoundingBoxesDimSize],
 960                     pbb_ptr[1*c_packedBoundingBoxesDimSize], pbb_ptr[4*c_packedBoundingBoxesDimSize],
 961                     pbb_ptr[2*c_packedBoundingBoxesDimSize], pbb_ptr[5*c_packedBoundingBoxesDimSize]);
 962         }
 963     }
 964 #endif
 965     else
 966     {
 967         /* Store the bounding boxes as xyz.xyz. */
 968         BoundingBox *bb_ptr = bb_.data() + atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
 969
 970         calc_bounding_box(numAtoms, nbat->xstride, nbat->x().data() + atomStart*nbat->xstride,
 971                           bb_ptr);
 972
 973         if (gmx_debug_at)
 974         {
 975             int bbo = atomToCluster(atomStart - cellOffset_*geometry_.numAtomsPerCell);
 976             fprintf(debug, "cell %4d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 977                     atomToCluster(atomStart),
 978                     bb_[bbo].lower.x,
 979                     bb_[bbo].lower.y,
 980                     bb_[bbo].lower.z,
 981                     bb_[bbo].upper.x,
 982                     bb_[bbo].upper.y,
 983                     bb_[bbo].upper.z);
 984         }
 985     }
 986 }
 987
 988 void Grid::sortColumnsCpuGeometry(GridSetData *gridSetData,
 989                                   int dd_zone,
 990                                   int atomStart, int atomEnd,
 991                                   const int *atinfo,
 992                                   gmx::ArrayRef<const gmx::RVec> x,
 993                                   nbnxn_atomdata_t *nbat,
 994                                   int cxy_start, int cxy_end,
 995                                   gmx::ArrayRef<int> sort_work)
 996 {
 997     if (debug)
 998     {
 999         fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1000                 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1001     }
1002
1003     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1004
1005     const int  numAtomsPerCell = geometry_.numAtomsPerCell;
1006
1007     /* Sort the atoms within each x,y column in 3 dimensions */
1008     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1009     {
1010         const int numAtoms   = numAtomsInColumn(cxy);
1011         const int numCellsZ  = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1012         const int atomOffset = firstAtomInColumn(cxy);
1013
1014         /* Sort the atoms within each x,y column on z coordinate */
1015         sort_atoms(ZZ, FALSE, dd_zone,
1016                    relevantAtomsAreWithinGridBounds,
1017                    gridSetData->atomIndices.data() + atomOffset, numAtoms, x,
1018                    dimensions_.lowerCorner[ZZ],
1019                    1.0/dimensions_.gridSize[ZZ], numCellsZ*numAtomsPerCell,
1020                    sort_work);
1021
1022         /* Fill the ncz cells in this column */
1023         const int firstCell  = firstCellInColumn(cxy);
1024         int       cellFilled = firstCell;
1025         for (int cellZ = 0; cellZ < numCellsZ; cellZ++)
1026         {
1027             const int cell           = firstCell + cellZ;
1028
1029             const int atomOffsetCell = atomOffset + cellZ*numAtomsPerCell;
1030             const int numAtomsCell   = std::min(numAtomsPerCell, numAtoms - (atomOffsetCell - atomOffset));
1031
1032             fillCell(gridSetData, nbat,
1033                      atomOffsetCell, atomOffsetCell + numAtomsCell, atinfo, x,
1034                      nullptr);
1035
1036             /* This copy to bbcz is not really necessary.
1037              * But it allows to use the same grid search code
1038              * for the simple and supersub cell setups.
1039              */
1040             if (numAtomsCell > 0)
1041             {
1042                 cellFilled = cell;
1043             }
1044             bbcz_[cell].lower = bb_[cellFilled].lower.z;
1045             bbcz_[cell].upper = bb_[cellFilled].upper.z;
1046         }
1047
1048         /* Set the unused atom indices to -1 */
1049         for (int ind = numAtoms; ind < numCellsZ*numAtomsPerCell; ind++)
1050         {
1051             gridSetData->atomIndices[atomOffset + ind] = -1;
1052         }
1053     }
1054 }
1055
1056 /* Spatially sort the atoms within one grid column */
1057 void Grid::sortColumnsGpuGeometry(GridSetData *gridSetData,
1058                                   int dd_zone,
1059                                   int atomStart, int atomEnd,
1060                                   const int *atinfo,
1061                                   gmx::ArrayRef<const gmx::RVec> x,
1062                                   nbnxn_atomdata_t *nbat,
1063                                   int cxy_start, int cxy_end,
1064                                   gmx::ArrayRef<int> sort_work)
1065 {
1066     BoundingBox  bb_work_array[2];
1067     BoundingBox *bb_work_aligned = reinterpret_cast<BoundingBox *>((reinterpret_cast<std::size_t>(bb_work_array + 1)) & (~(static_cast<std::size_t>(15))));
1068
1069     if (debug)
1070     {
1071         fprintf(debug, "cell_offset %d sorting columns %d - %d, atoms %d - %d\n",
1072                 cellOffset_, cxy_start, cxy_end, atomStart, atomEnd);
1073     }
1074
1075     const bool relevantAtomsAreWithinGridBounds = (dimensions_.maxAtomGroupRadius == 0);
1076
1077     const int  numAtomsPerCell = geometry_.numAtomsPerCell;
1078
1079     const int  subdiv_x = geometry_.numAtomsICluster;
1080     const int  subdiv_y = c_gpuNumClusterPerCellX*subdiv_x;
1081     const int  subdiv_z = c_gpuNumClusterPerCellY*subdiv_y;
1082
1083     /* Extract the atom index array that will be filled here */
1084     const gmx::ArrayRef<int> &atomIndices = gridSetData->atomIndices;
1085
1086     /* Sort the atoms within each x,y column in 3 dimensions.
1087      * Loop over all columns on the x/y grid.
1088      */
1089     for (int cxy = cxy_start; cxy < cxy_end; cxy++)
1090     {
1091         const int gridX            = cxy/dimensions_.numCells[YY];
1092         const int gridY            = cxy - gridX*dimensions_.numCells[YY];
1093
1094         const int numAtomsInColumn = cxy_na_[cxy];
1095         const int numCellsInColumn = cxy_ind_[cxy + 1] - cxy_ind_[cxy];
1096         const int atomOffset       = firstAtomInColumn(cxy);
1097
1098         /* Sort the atoms within each x,y column on z coordinate */
1099         sort_atoms(ZZ, FALSE, dd_zone,
1100                    relevantAtomsAreWithinGridBounds,
1101                    atomIndices.data() + atomOffset, numAtomsInColumn, x,
1102                    dimensions_.lowerCorner[ZZ],
1103                    1.0/dimensions_.gridSize[ZZ], numCellsInColumn*numAtomsPerCell,
1104                    sort_work);
1105
1106         /* This loop goes over the cells and clusters along z at once */
1107         for (int sub_z = 0; sub_z < numCellsInColumn*c_gpuNumClusterPerCellZ; sub_z++)
1108         {
1109             const int atomOffsetZ = atomOffset + sub_z*subdiv_z;
1110             const int numAtomsZ   = std::min(subdiv_z, numAtomsInColumn - (atomOffsetZ - atomOffset));
1111             int       cz          = -1;
1112             /* We have already sorted on z */
1113
1114             if (sub_z % c_gpuNumClusterPerCellZ == 0)
1115             {
1116                 cz = sub_z/c_gpuNumClusterPerCellZ;
1117                 const int cell = cxy_ind_[cxy] + cz;
1118
1119                 /* The number of atoms in this cell/super-cluster */
1120                 const int numAtoms = std::min(numAtomsPerCell, numAtomsInColumn - (atomOffsetZ - atomOffset));
1121
1122                 numClusters_[cell] = std::min(c_gpuNumClusterPerCell,
1123                                               (numAtoms + geometry_.numAtomsICluster - 1)/ geometry_.numAtomsICluster);
1124
1125                 /* Store the z-boundaries of the bounding box of the cell */
1126                 bbcz_[cell].lower = x[atomIndices[atomOffsetZ]][ZZ];
1127                 bbcz_[cell].upper = x[atomIndices[atomOffsetZ + numAtoms - 1]][ZZ];
1128             }
1129
1130             if (c_gpuNumClusterPerCellY > 1)
1131             {
1132                 /* Sort the atoms along y */
1133                 sort_atoms(YY, (sub_z & 1) != 0, dd_zone,
1134                            relevantAtomsAreWithinGridBounds,
1135                            atomIndices.data() + atomOffsetZ, numAtomsZ, x,
1136                            dimensions_.lowerCorner[YY] + gridY*dimensions_.cellSize[YY],
1137                            dimensions_.invCellSize[YY], subdiv_z,
1138                            sort_work);
1139             }
1140
1141             for (int sub_y = 0; sub_y < c_gpuNumClusterPerCellY; sub_y++)
1142             {
1143                 const int atomOffsetY = atomOffsetZ + sub_y*subdiv_y;
1144                 const int numAtomsY   = std::min(subdiv_y, numAtomsInColumn - (atomOffsetY - atomOffset));
1145
1146                 if (c_gpuNumClusterPerCellX > 1)
1147                 {
1148                     /* Sort the atoms along x */
1149                     sort_atoms(XX, ((cz*c_gpuNumClusterPerCellY + sub_y) & 1) != 0, dd_zone,
1150                                relevantAtomsAreWithinGridBounds,
1151                                atomIndices.data() + atomOffsetY, numAtomsY, x,
1152                                dimensions_.lowerCorner[XX] + gridX*dimensions_.cellSize[XX],
1153                                dimensions_.invCellSize[XX], subdiv_y,
1154                                sort_work);
1155                 }
1156
1157                 for (int sub_x = 0; sub_x < c_gpuNumClusterPerCellX; sub_x++)
1158                 {
1159                     const int atomOffsetX = atomOffsetY + sub_x*subdiv_x;
1160                     const int numAtomsX   = std::min(subdiv_x, numAtomsInColumn - (atomOffsetX - atomOffset));
1161
1162                     fillCell(gridSetData, nbat,
1163                              atomOffsetX, atomOffsetX + numAtomsX, atinfo, x,
1164                              bb_work_aligned);
1165                 }
1166             }
1167         }
1168
1169         /* Set the unused atom indices to -1 */
1170         for (int ind = numAtomsInColumn; ind < numCellsInColumn*numAtomsPerCell; ind++)
1171         {
1172             atomIndices[atomOffset + ind] = -1;
1173         }
1174     }
1175 }
1176
1177 /*! \brief Sets the cell index in the cell array for atom \p atomIndex and increments the atom count for the grid column */
1178 static void setCellAndAtomCount(gmx::ArrayRef<int>  cell,
1179                                 int                 cellIndex,
1180                                 gmx::ArrayRef<int>  cxy_na,
1181                                 int                 atomIndex)
1182 {
1183     cell[atomIndex]    = cellIndex;
1184     cxy_na[cellIndex] += 1;
1185 }
1186
1187 void Grid::calcColumnIndices(const Grid::Dimensions         &gridDims,
1188                              const gmx::UpdateGroupsCog     *updateGroupsCog,
1189                              const int                       atomStart,
1190                              const int                       atomEnd,
1191                              gmx::ArrayRef<const gmx::RVec>  x,
1192                              const int                       dd_zone,
1193                              const int                      *move,
1194                              const int                       thread,
1195                              const int                       nthread,
1196                              gmx::ArrayRef<int>              cell,
1197                              gmx::ArrayRef<int>              cxy_na)
1198 {
1199     const int numColumns = gridDims.numCells[XX]*gridDims.numCells[YY];
1200
1201     /* We add one extra cell for particles which moved during DD */
1202     for (int i = 0; i < numColumns; i++)
1203     {
1204         cxy_na[i] = 0;
1205     }
1206
1207     int taskAtomStart = atomStart + static_cast<int>((thread + 0)*(atomEnd - atomStart))/nthread;
1208     int taskAtomEnd   = atomStart + static_cast<int>((thread + 1)*(atomEnd - atomStart))/nthread;
1209
1210     if (dd_zone == 0)
1211     {
1212         /* Home zone */
1213         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1214         {
1215             if (move == nullptr || move[i] >= 0)
1216             {
1217
1218                 const gmx::RVec &coord = (updateGroupsCog ? updateGroupsCog->cogForAtom(i) : x[i]);
1219
1220                 /* We need to be careful with rounding,
1221                  * particles might be a few bits outside the local zone.
1222                  * The int cast takes care of the lower bound,
1223                  * we will explicitly take care of the upper bound.
1224                  */
1225                 int cx = static_cast<int>((coord[XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1226                 int cy = static_cast<int>((coord[YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1227
1228 #ifndef NDEBUG
1229                 if (cx < 0 || cx > gridDims.numCells[XX] ||
1230                     cy < 0 || cy > gridDims.numCells[YY])
1231                 {
1232                     gmx_fatal(FARGS,
1233                               "grid cell cx %d cy %d out of range (max %d %d)\n"
1234                               "atom %f %f %f, grid->c0 %f %f",
1235                               cx, cy,
1236                               gridDims.numCells[XX],
1237                               gridDims.numCells[YY],
1238                               x[i][XX], x[i][YY], x[i][ZZ],
1239                               gridDims.lowerCorner[XX],
1240                               gridDims.lowerCorner[YY]);
1241                 }
1242 #endif
1243                 /* Take care of potential rouding issues */
1244                 cx = std::min(cx, gridDims.numCells[XX] - 1);
1245                 cy = std::min(cy, gridDims.numCells[YY] - 1);
1246
1247                 /* For the moment cell will contain only the, grid local,
1248                  * x and y indices, not z.
1249                  */
1250                 setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1251             }
1252             else
1253             {
1254                 /* Put this moved particle after the end of the grid,
1255                  * so we can process it later without using conditionals.
1256                  */
1257                 setCellAndAtomCount(cell, numColumns, cxy_na, i);
1258             }
1259         }
1260     }
1261     else
1262     {
1263         /* Non-home zone */
1264         for (int i = taskAtomStart; i < taskAtomEnd; i++)
1265         {
1266             int cx = static_cast<int>((x[i][XX] - gridDims.lowerCorner[XX])*gridDims.invCellSize[XX]);
1267             int cy = static_cast<int>((x[i][YY] - gridDims.lowerCorner[YY])*gridDims.invCellSize[YY]);
1268
1269             /* For non-home zones there could be particles outside
1270              * the non-bonded cut-off range, which have been communicated
1271              * for bonded interactions only. For the result it doesn't
1272              * matter where these end up on the grid. For performance
1273              * we put them in an extra row at the border.
1274              */
1275             cx = std::max(cx, 0);
1276             cx = std::min(cx, gridDims.numCells[XX] - 1);
1277             cy = std::max(cy, 0);
1278             cy = std::min(cy, gridDims.numCells[YY] - 1);
1279
1280             /* For the moment cell will contain only the, grid local,
1281              * x and y indices, not z.
1282              */
1283             setCellAndAtomCount(cell, cx*gridDims.numCells[YY] + cy, cxy_na, i);
1284         }
1285     }
1286 }
1287
1288 /*! \brief Resizes grid and atom data which depend on the number of cells */
1289 static void resizeForNumberOfCells(const int           numNbnxnAtoms,
1290                                    const int           numAtomsMoved,
1291                                    GridSetData        *gridSetData,
1292                                    nbnxn_atomdata_t   *nbat)
1293 {
1294     /* Note: gridSetData->cellIndices was already resized before */
1295
1296     /* To avoid conditionals we store the moved particles at the end of a,
1297      * make sure we have enough space.
1298      */
1299     gridSetData->atomIndices.resize(numNbnxnAtoms + numAtomsMoved);
1300
1301     /* Make space in nbat for storing the atom coordinates */
1302     nbat->resizeCoordinateBuffer(numNbnxnAtoms);
1303 }
1304
1305 void Grid::setCellIndices(int                             ddZone,
1306                           int                             cellOffset,
1307                           GridSetData                    *gridSetData,
1308                           gmx::ArrayRef<GridWork>         gridWork,
1309                           int                             atomStart,
1310                           int                             atomEnd,
1311                           const int                      *atinfo,
1312                           gmx::ArrayRef<const gmx::RVec>  x,
1313                           const int                       numAtomsMoved,
1314                           nbnxn_atomdata_t               *nbat)
1315 {
1316     cellOffset_   = cellOffset;
1317
1318     srcAtomBegin_ = atomStart;
1319     srcAtomEnd_   = atomEnd;
1320
1321     const int nthread = gmx_omp_nthreads_get(emntPairsearch);
1322
1323     const int numAtomsPerCell = geometry_.numAtomsPerCell;
1324
1325     /* Make the cell index as a function of x and y */
1326     int ncz_max      = 0;
1327     int ncz          = 0;
1328     cxy_ind_[0]      = 0;
1329     for (int i = 0; i < numColumns() + 1; i++)
1330     {
1331         /* We set ncz_max at the beginning of the loop iso at the end
1332          * to skip i=grid->ncx*grid->numCells[YY] which are moved particles
1333          * that do not need to be ordered on the grid.
1334          */
1335         if (ncz > ncz_max)
1336         {
1337             ncz_max = ncz;
1338         }
1339         int cxy_na_i = gridWork[0].numAtomsPerColumn[i];
1340         for (int thread = 1; thread < nthread; thread++)
1341         {
1342             cxy_na_i += gridWork[thread].numAtomsPerColumn[i];
1343         }
1344         ncz = (cxy_na_i + numAtomsPerCell - 1)/numAtomsPerCell;
1345         if (nbat->XFormat == nbatX8)
1346         {
1347             /* Make the number of cell a multiple of 2 */
1348             ncz = (ncz + 1) & ~1;
1349         }
1350         cxy_ind_[i+1] = cxy_ind_[i] + ncz;
1351         /* Clear cxy_na_, so we can reuse the array below */
1352         cxy_na_[i] = 0;
1353     }
1354     numCellsTotal_     = cxy_ind_[numColumns()] - cxy_ind_[0];
1355     numCellsColumnMax_ = ncz_max;
1356
1357     /* Resize grid and atom data which depend on the number of cells */
1358     resizeForNumberOfCells(atomIndexEnd(), numAtomsMoved, gridSetData, nbat);
1359
1360     if (debug)
1361     {
1362         fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
1363                 numAtomsPerCell, geometry_.numAtomsICluster, numCellsTotal_,
1364                 dimensions_.numCells[XX], dimensions_.numCells[YY],
1365                 numCellsTotal_/(static_cast<double>(numColumns())),
1366                 ncz_max);
1367         if (gmx_debug_at)
1368         {
1369             int i = 0;
1370             for (int cy = 0; cy < dimensions_.numCells[YY]; cy++)
1371             {
1372                 for (int cx = 0; cx < dimensions_.numCells[XX]; cx++)
1373                 {
1374                     fprintf(debug, " %2d", cxy_ind_[i + 1] - cxy_ind_[i]);
1375                     i++;
1376                 }
1377                 fprintf(debug, "\n");
1378             }
1379         }
1380     }
1381
1382     /* Make sure the work array for sorting is large enough */
1383     const int worstCaseSortBufferSize = ncz_max*numAtomsPerCell*c_sortGridMaxSizeFactor;
1384     if (worstCaseSortBufferSize > gmx::index(gridWork[0].sortBuffer.size()))
1385     {
1386         for (GridWork &work : gridWork)
1387         {
1388             /* Elements not in use should be -1 */
1389             work.sortBuffer.resize(worstCaseSortBufferSize, -1);
1390         }
1391     }
1392
1393     /* Now we know the dimensions we can fill the grid.
1394      * This is the first, unsorted fill. We sort the columns after this.
1395      */
1396     gmx::ArrayRef<int> cells       = gridSetData->cells;
1397     gmx::ArrayRef<int> atomIndices = gridSetData->atomIndices;
1398     for (int i = atomStart; i < atomEnd; i++)
1399     {
1400         /* At this point nbs->cell contains the local grid x,y indices */
1401         const int cxy = cells[i];
1402         atomIndices[firstAtomInColumn(cxy) + cxy_na_[cxy]++] = i;
1403     }
1404
1405     if (ddZone == 0)
1406     {
1407         /* Set the cell indices for the moved particles */
1408         int n0 = numCellsTotal_*numAtomsPerCell;
1409         int n1 = numCellsTotal_*numAtomsPerCell + cxy_na_[numColumns()];
1410         if (ddZone == 0)
1411         {
1412             for (int i = n0; i < n1; i++)
1413             {
1414                 cells[atomIndices[i]] = i;
1415             }
1416         }
1417     }
1418
1419     /* Sort the super-cell columns along z into the sub-cells. */
1420 #pragma omp parallel for num_threads(nthread) schedule(static)
1421     for (int thread = 0; thread < nthread; thread++)
1422     {
1423         try
1424         {
1425             int columnStart = ((thread + 0)*numColumns())/nthread;
1426             int columnEnd   = ((thread + 1)*numColumns())/nthread;
1427             if (geometry_.isSimple)
1428             {
1429                 sortColumnsCpuGeometry(gridSetData, ddZone,
1430                                        atomStart, atomEnd, atinfo, x, nbat,
1431                                        columnStart, columnEnd,
1432                                        gridWork[thread].sortBuffer);
1433             }
1434             else
1435             {
1436                 sortColumnsGpuGeometry(gridSetData, ddZone,
1437                                        atomStart, atomEnd, atinfo, x, nbat,
1438                                        columnStart, columnEnd,
1439                                        gridWork[thread].sortBuffer);
1440             }
1441         }
1442         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
1443     }
1444
1445     if (geometry_.isSimple && nbat->XFormat == nbatX8)
1446     {
1447         combine_bounding_box_pairs(*this, bb_, bbj_);
1448     }
1449
1450     if (!geometry_.isSimple)
1451     {
1452         numClustersTotal_ = 0;
1453         for (int i = 0; i < numCellsTotal_; i++)
1454         {
1455             numClustersTotal_ += numClusters_[i];
1456         }
1457     }
1458
1459     if (debug)
1460     {
1461         if (geometry_.isSimple)
1462         {
1463             print_bbsizes_simple(debug, *this);
1464         }
1465         else
1466         {
1467             fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
1468                     numClustersTotal_,
1469                     (atomEnd - atomStart)/static_cast<double>(numClustersTotal_));
1470
1471             print_bbsizes_supersub(debug, *this);
1472         }
1473     }
1474 }
1475
1476 } // namespace Nbnxm