src/gromacs/ewald/pme_internal.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   7  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   8  * and including many others, as listed in the AUTHORS file in the
   9  * top-level source directory and at http://www.gromacs.org.
  10  *
  11  * GROMACS is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public License
  13  * as published by the Free Software Foundation; either version 2.1
  14  * of the License, or (at your option) any later version.
  15  *
  16  * GROMACS is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with GROMACS; if not, see
  23  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  24  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  25  *
  26  * If you want to redistribute modifications to GROMACS, please
  27  * consider that scientific software is very special. Version
  28  * control is crucial - bugs must be traceable. We will be happy to
  29  * consider code for inclusion in the official distribution, but
  30  * derived work must not be called official GROMACS. Details are found
  31  * in the README & COPYING files - if they are missing, get the
  32  * official version at http://www.gromacs.org.
  33  *
  34  * To help us fund GROMACS development, we humbly ask that you cite
  35  * the research papers on the package. Check out http://www.gromacs.org.
  36  */
  37 /*! \internal \file
  38  *
  39  * \brief This file contains function declarations necessary for
  40  * computing energies and forces for the PME long-ranged part (Coulomb
  41  * and LJ).
  42  *
  43  * \author Berk Hess <hess@kth.se>
  44  * \author Mark Abraham <mark.j.abraham@gmail.com>
  45  * \ingroup module_ewald
  46  */
  47
  48 /* TODO This file is a temporary holding area for stuff local to the
  49  * PME code, before it acquires some more normal ewald/file.c and
  50  * ewald/file.h structure.  In future clean up, get rid of this file,
  51  * to build more normal. */
  52
  53 #ifndef GMX_EWALD_PME_INTERNAL_H
  54 #define GMX_EWALD_PME_INTERNAL_H
  55
  56 #include "config.h"
  57
  58 #include "gromacs/math/gmxcomplex.h"
  59 #include "gromacs/utility/basedefinitions.h"
  60 #include "gromacs/utility/defaultinitializationallocator.h"
  61 #include "gromacs/utility/gmxmpi.h"
  62 #include "gromacs/utility/smalloc.h"
  63
  64 #include "pme_gpu_types_host.h"
  65
  66 //! A repeat of typedef from parallel_3dfft.h
  67 typedef struct gmx_parallel_3dfft *gmx_parallel_3dfft_t;
  68
  69 struct t_commrec;
  70 struct t_inputrec;
  71 struct PmeGpu;
  72
  73 //@{
  74 //! Grid indices for A state for charge and Lennard-Jones C6
  75 #define PME_GRID_QA    0
  76 #define PME_GRID_C6A   2
  77 //@}
  78
  79 //@{
  80 /*! \brief Flags that indicate the number of PME grids in use */
  81 #define DO_Q           2 /* Electrostatic grids have index q<2 */
  82 #define DO_Q_AND_LJ    4 /* non-LB LJ grids have index 2 <= q < 4 */
  83 #define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
  84 //@}
  85
  86 /*! \brief Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
  87 static const real lb_scale_factor[] = {
  88     1.0/64, 6.0/64, 15.0/64, 20.0/64,
  89     15.0/64, 6.0/64, 1.0/64
  90 };
  91
  92 /*! \brief Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
  93 static const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
  94
  95 /*! \brief We only define a maximum to be able to use local arrays without allocation.
  96  * An order larger than 12 should never be needed, even for test cases.
  97  * If needed it can be changed here.
  98  */
  99 #define PME_ORDER_MAX 12
 100
 101 /*! \brief As gmx_pme_init, but takes most settings, except the grid/Ewald coefficients, from pme_src.
 102  * This is only called when the PME cut-off/grid size changes.
 103  */
 104 void gmx_pme_reinit(struct gmx_pme_t **pmedata,
 105                     const t_commrec   *cr,
 106                     struct gmx_pme_t * pme_src,
 107                     const t_inputrec * ir,
 108                     const ivec         grid_size,
 109                     real               ewaldcoeff_q,
 110                     real               ewaldcoeff_lj);
 111
 112
 113 /* Temporary suppression until these structs become opaque and don't live in
 114  * a header that is included by other headers. Also, until then I have no
 115  * idea what some of the names mean. */
 116
 117 //! @cond Doxygen_Suppress
 118
 119 /*! \brief Data structure for grid communication */
 120 struct pme_grid_comm_t
 121 {
 122     int send_id;         //!< Source rank id
 123     int send_index0;
 124     int send_nindex;
 125     int recv_id;         //!< Destination rank id
 126     int recv_index0;
 127     int recv_nindex;
 128     int recv_size = 0;   //!< Receive buffer width, used with OpenMP
 129 };
 130
 131 /*! \brief Data structure for grid overlap communication in a single dimension */
 132 struct pme_overlap_t
 133 {
 134     MPI_Comm                     mpi_comm;       //!< MPI communcator
 135     int                          nnodes;         //!< Number of ranks
 136     int                          nodeid;         //!< Unique rank identifcator
 137     std::vector<int>             s2g0;           //!< The local interpolation grid start
 138     std::vector<int>             s2g1;           //!< The local interpolation grid end
 139     int                          send_size;      //!< Send buffer width, used with OpenMP
 140     std::vector<pme_grid_comm_t> comm_data;      //!< All the individual communication data for each rank
 141     std::vector<real>            sendbuf;        //!< Shared buffer for sending
 142     std::vector<real>            recvbuf;        //!< Shared buffer for receiving
 143 };
 144
 145 template<typename T>
 146 using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>;
 147
 148 template<typename T>
 149 using FastVector = std::vector < T, gmx::DefaultInitializationAllocator < T>>;
 150
 151 /*! \brief Data structure for organizing particle allocation to threads */
 152 struct AtomToThreadMap
 153 {
 154     //! Cumulative counts of the number of particles per thread
 155     int             *n = nullptr;
 156     //! Storage buffer for n
 157     std::vector<int> nBuffer;
 158     //! Particle indices ordered on thread index (n)
 159     FastVector<int>  i;
 160 };
 161
 162 /*! \brief Helper typedef for spline vectors */
 163 typedef real *splinevec[DIM];
 164
 165 /*! \internal
 166  * \brief Coefficients for theta or dtheta
 167  */
 168 class SplineCoefficients
 169 {
 170     public:
 171         //! Reallocate for use with up to nalloc coefficients
 172         void realloc(int nalloc);
 173
 174         //! Pointers to the coefficient buffer for x, y, z
 175         splinevec           coefficients = { nullptr };
 176     private:
 177         //! Storage for x coefficients
 178         std::vector<real>   bufferX_;
 179         //! Storage for y coefficients
 180         std::vector<real>   bufferY_;
 181         //! Storage for z coefficients, aligned for SIMD load
 182         AlignedVector<real> bufferZ_;
 183 };
 184
 185 /*! \brief Data structure for beta-spline interpolation */
 186 struct splinedata_t
 187 {
 188     int                 n      = 0;
 189     FastVector<int>     ind;
 190     SplineCoefficients  theta;
 191     SplineCoefficients  dtheta;
 192     int                 nalloc = 0;
 193 };
 194
 195 /*! \brief PME slab MPI communication setup */
 196 struct SlabCommSetup
 197 {
 198     //! The nodes to send x and q to with DD
 199     int node_dest;
 200     //! The nodes to receive x and q from with DD
 201     int node_src;
 202     //! Index for commnode into the buffers
 203     int buf_index;
 204     //! The number of atoms to receive
 205     int rcount;
 206 };
 207
 208 /*! \internal
 209  * \brief Data structure for coordinating transfers between PME ranks along one dimension
 210  *
 211  * Also used for passing coordinates, coefficients and forces to and from PME routines.
 212  */
 213 class PmeAtomComm
 214 {
 215     public:
 216         //! Constructor, \p PmeMpiCommunicator is the communicator for this dimension
 217         PmeAtomComm(MPI_Comm PmeMpiCommunicator,
 218                     int      numThreads,
 219                     int      pmeOrder,
 220                     int      dimIndex,
 221                     bool     doSpread);
 222
 223         //! Set the atom count and when necessary resizes atom buffers
 224         void setNumAtoms(int numAtoms);
 225
 226         //! Returns the atom count
 227         int numAtoms() const
 228         {
 229             return numAtoms_;
 230         }
 231
 232         //! Returns the number of atoms to send to each rank
 233         gmx::ArrayRef<int> sendCount()
 234         {
 235             GMX_ASSERT(!count_thread.empty(), "Need at least one thread_count");
 236             return count_thread[0];
 237         }
 238
 239         //! The index of the dimension, 0=x, 1=y
 240         int      dimind = 0;
 241         //! The number of slabs and ranks this dimension is decomposed over
 242         int      nslab  = 1;
 243         //! Our MPI rank index
 244         int      nodeid = 0;
 245         //! Communicator for this dimension
 246         MPI_Comm mpi_comm;
 247
 248         //! Communication setup for each slab, only present with nslab > 1
 249         std::vector<SlabCommSetup> slabCommSetup;
 250         //! The maximum communication distance counted in MPI ranks
 251         int                        maxshift = 0;
 252
 253         //! The target slab index for each particle
 254         FastVector<int>                    pd;
 255         //! Target particle counts for each slab, for each thread
 256         std::vector < std::vector < int>>  count_thread;
 257
 258     private:
 259         //! The number of atoms
 260         int                            numAtoms_ = 0;
 261     public:
 262         //! The coordinates
 263         gmx::ArrayRef<const gmx::RVec> x;
 264         //! The coefficient, charges or LJ C6
 265         gmx::ArrayRef<const real>      coefficient;
 266         //! The forces
 267         gmx::ArrayRef<gmx::RVec>       f;
 268         //! Coordinate buffer, used only with nslab > 1
 269         FastVector<gmx::RVec>          xBuffer;
 270         //! Coefficient buffer, used only with nslab > 1
 271         FastVector<real>               coefficientBuffer;
 272         //! Force buffer, used only with nslab > 1
 273         FastVector<gmx::RVec>          fBuffer;
 274         //! Tells whether these coordinates are used for spreading
 275         bool                           bSpread;
 276         //! The PME order
 277         int                            pme_order;
 278         //! The grid index per atom
 279         FastVector<gmx::IVec>          idx;
 280         //! Fractional atom coordinates relative to the lower cell boundary
 281         FastVector<gmx::RVec>          fractx;
 282
 283         //! The number of threads to use in PME
 284         int                            nthread;
 285         //! Thread index for each atom
 286         FastVector<int>                thread_idx;
 287         std::vector<AtomToThreadMap>   threadMap;
 288         std::vector<splinedata_t>      spline;
 289 };
 290
 291 /*! \brief Data structure for a single PME grid */
 292 struct pmegrid_t{
 293     ivec  ci;     /* The spatial location of this grid         */
 294     ivec  n;      /* The used size of *grid, including order-1 */
 295     ivec  offset; /* The grid offset from the full node grid   */
 296     int   order;  /* PME spreading order                       */
 297     ivec  s;      /* The allocated size of *grid, s >= n       */
 298     real *grid;   /* The grid local thread, size n             */
 299 };
 300
 301 /*! \brief Data structures for PME grids */
 302 struct pmegrids_t{
 303     pmegrid_t  grid;         /* The full node grid (non thread-local)            */
 304     int        nthread;      /* The number of threads operating on this grid     */
 305     ivec       nc;           /* The local spatial decomposition over the threads */
 306     pmegrid_t *grid_th;      /* Array of grids for each thread                   */
 307     real      *grid_all;     /* Allocated array for the grids in *grid_th        */
 308     int       *g2t[DIM];     /* The grid to thread index                         */
 309     ivec       nthread_comm; /* The number of threads to communicate with        */
 310 };
 311
 312 /*! \brief Data structure for spline-interpolation working buffers */
 313 struct pme_spline_work;
 314
 315 /*! \brief Data structure for working buffers */
 316 struct pme_solve_work_t;
 317
 318 /*! \brief Master PME data structure */
 319 struct gmx_pme_t {            //NOLINT(clang-analyzer-optin.performance.Padding)
 320     int           ndecompdim; /* The number of decomposition dimensions */
 321     int           nodeid;     /* Our nodeid in mpi->mpi_comm */
 322     int           nodeid_major;
 323     int           nodeid_minor;
 324     int           nnodes;    /* The number of nodes doing PME */
 325     int           nnodes_major;
 326     int           nnodes_minor;
 327
 328     MPI_Comm      mpi_comm;
 329     MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
 330 #if GMX_MPI
 331     MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 332 #endif
 333
 334     gmx_bool   bUseThreads;   /* Does any of the PME ranks have nthread>1 ?  */
 335     int        nthread;       /* The number of threads doing PME on our rank */
 336
 337     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
 338     bool       doCoulomb;     /* Apply PME to electrostatics */
 339     bool       doLJ;          /* Apply PME to Lennard-Jones r^-6 interactions */
 340     gmx_bool   bFEP;          /* Compute Free energy contribution */
 341     gmx_bool   bFEP_q;
 342     gmx_bool   bFEP_lj;
 343     int        nkx, nky, nkz; /* Grid dimensions */
 344     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
 345     int        pme_order;
 346     real       ewaldcoeff_q;  /* Ewald splitting coefficient for Coulomb */
 347     real       ewaldcoeff_lj; /* Ewald splitting coefficient for r^-6 */
 348     real       epsilon_r;
 349
 350
 351     enum PmeRunMode runMode; /* Which codepath is the PME runner taking - CPU, GPU, mixed;
 352                               * TODO: this is the information that should be owned by the task scheduler,
 353                               * and ideally not be duplicated here.
 354                               */
 355
 356     PmeGpu      *gpu;        /* A pointer to the GPU data.
 357                               * TODO: this should be unique or a shared pointer.
 358                               * Currently in practice there is a single gmx_pme_t instance while a code
 359                               * is partially set up for many of them. The PME tuning calls gmx_pme_reinit()
 360                               * which fully reinitializes the one and only PME structure anew while maybe
 361                               * keeping the old grid buffers if they were already large enough.
 362                               * This small choice should be made clear in the later refactoring -
 363                               * do we store many PME objects for different grid sizes,
 364                               * or a single PME object that handles different grid sizes gracefully.
 365                               */
 366
 367
 368     class EwaldBoxZScaler *boxScaler;   /**< The scaling data Ewald uses with walls (set at pme_init constant for the entire run) */
 369
 370
 371     int        ljpme_combination_rule;  /* Type of combination rule in LJ-PME */
 372
 373     int        ngrids;                  /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
 374
 375     pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
 376                                          * includes overlap Grid indices are ordered as
 377                                          * follows:
 378                                          * 0: Coloumb PME, state A
 379                                          * 1: Coloumb PME, state B
 380                                          * 2-8: LJ-PME
 381                                          * This can probably be done in a better way
 382                                          * but this simple hack works for now
 383                                          */
 384
 385     /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
 386     int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
 387     /* pmegrid_nz might be larger than strictly necessary to ensure
 388      * memory alignment, pmegrid_nz_base gives the real base size.
 389      */
 390     int     pmegrid_nz_base;
 391     /* The local PME grid starting indices */
 392     int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 393
 394     /* Work data for spreading and gathering */
 395     pme_spline_work          *spline_work;
 396
 397     real                    **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 398     /* inside the interpolation grid, but separate for 2D PME decomp. */
 399     int                       fftgrid_nx, fftgrid_ny, fftgrid_nz;
 400
 401     t_complex               **cfftgrid; /* Grids for complex FFT data */
 402
 403     int                       cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 404
 405     gmx_parallel_3dfft_t     *pfft_setup;
 406
 407     int                      *nnx, *nny, *nnz;
 408     real                     *fshx, *fshy, *fshz;
 409
 410     std::vector<PmeAtomComm>  atc; /* Indexed on decomposition index */
 411     matrix                    recipbox;
 412     real                      boxVolume;
 413     splinevec                 bsp_mod;
 414     /* Buffers to store data for local atoms for L-B combination rule
 415      * calculations in LJ-PME. lb_buf1 stores either the coefficients
 416      * for spreading/gathering (in serial), or the C6 coefficient for
 417      * local atoms (in parallel).  lb_buf2 is only used in parallel,
 418      * and stores the sigma values for local atoms. */
 419     FastVector<real>          lb_buf1;
 420     FastVector<real>          lb_buf2;
 421
 422     pme_overlap_t             overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 423
 424     /* Atom step for energy only calculation in gmx_pme_calc_energy() */
 425     std::unique_ptr<PmeAtomComm> atc_energy;
 426
 427     /* Communication buffers */
 428     rvec                 *bufv;          /* Communication buffer */
 429     real                 *bufr;          /* Communication buffer */
 430     int                   buf_nalloc;    /* The communication buffer size */
 431
 432     /* thread local work data for solve_pme */
 433     struct pme_solve_work_t *solve_work;
 434
 435     /* Work data for sum_qgrid */
 436     real *   sum_qgrid_tmp;
 437     real *   sum_qgrid_dd_tmp;
 438 };
 439
 440 //! @endcond
 441
 442 /*! \brief
 443  * Finds out if PME is currently running on GPU.
 444  * TODO: should this be removed eventually?
 445  *
 446  * \param[in] pme  The PME structure.
 447  * \returns        True if PME runs on GPU currently, false otherwise.
 448  */
 449 inline bool pme_gpu_active(const gmx_pme_t *pme)
 450 {
 451     return (pme != nullptr) && (pme->runMode != PmeRunMode::CPU);
 452 }
 453
 454 /*! \brief Tell our PME-only node to switch to a new grid size */
 455 void gmx_pme_send_switchgrid(const t_commrec *cr,
 456                              ivec             grid_size,
 457                              real             ewaldcoeff_q,
 458                              real             ewaldcoeff_lj);
 459
 460 #endif