1 /*
2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
32 permission.
34 Disclaimer
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 // Maintainer: jglaser
52 /*! \file CommunicatorGPU.cc
53 \brief Implements the CommunicatorGPU class
56 #ifdef ENABLE_MPI
57 #ifdef ENABLE_CUDA
59 #include "CommunicatorGPU.h"
60 #include "Profiler.h"
61 #include "System.h"
63 #include <boost/python.hpp>
64 #include <algorithm>
65 #include <functional>
67 //! Constructor
68 CommunicatorGPU::CommunicatorGPU(boost::shared_ptr<SystemDefinition> sysdef,
69 boost::shared_ptr<DomainDecomposition> decomposition)
70 : Communicator(sysdef, decomposition),
71 m_max_stages(1),
72 m_num_stages(0),
73 m_comm_mask(0),
74 m_bond_comm(*this, m_sysdef->getBondData()),
75 m_angle_comm(*this, m_sysdef->getAngleData()),
76 m_dihedral_comm(*this, m_sysdef->getDihedralData()),
77 m_improper_comm(*this, m_sysdef->getImproperData())
79 // default value
80 #ifndef ENABLE_MPI_CUDA
81 m_mapped_ghost_recv = true;
82 m_mapped_ghost_send = true;
83 #else
84 // do not use host buffers with CUDA-aware MPI
85 m_mapped_ghost_recv = false;
86 m_mapped_ghost_send = false;
87 #endif
89 // allocate memory
90 allocateBuffers();
 92 // initialize communication stages
93 initializeCommunicationStages();
95 // Initialize cache configuration
96 gpu_communicator_initialize_cache_config();
 98 // create a ModernGPU context
99 m_mgpu_context = mgpu::CreateCudaDeviceAttachStream(0);
101 GPUArray<unsigned int> begin(NEIGH_MAX,m_exec_conf,true);
102 m_begin.swap(begin);
104 GPUArray<unsigned int> end(NEIGH_MAX,m_exec_conf,true);
105 m_end.swap(end);
107 // create cuda event
108 // cudaEventCreate(&m_event, cudaEventBlockingSync | cudaEventDisableTiming);
109 cudaEventCreate(&m_event, cudaEventDisableTiming);
111 #ifndef ENABLE_MPI_CUDA
112 // set up autotuners to determine whether to use mapped memory (boolean values)
113 std::vector<unsigned int> valid_params(2);
114 valid_params[0] = 0; valid_params[1] = 1;
116 m_tuner_ghost_recv.reset(new Autotuner(valid_params, 5, 100000, "comm_ghost_recv_mapped", this->m_exec_conf));
117 m_tuner_ghost_send.reset(new Autotuner(valid_params, 5, 100000, "comm_ghost_send_mapped", this->m_exec_conf));
119 // synchronize autotuner results across ranks
120 m_tuner_ghost_recv->setSync(true);
121 m_tuner_ghost_send->setSync(true);
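// The tuners benchmark both parameter values above (presumably 0 = device
// staging buffers, 1 = mapped host memory, following the boolean convention
// noted in the comment) and settle on the faster one; setSync(true) makes all
// ranks adopt the same result so sender and receiver use consistent buffer types.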
122 #endif
125 //! Destructor
126 CommunicatorGPU::~CommunicatorGPU()
128 m_exec_conf->msg->notice(5) << "Destroying CommunicatorGPU";
129 cudaEventDestroy(m_event);
132 void CommunicatorGPU::allocateBuffers()
134 #ifndef ENABLE_MPI_CUDA
135 // mapped buffers for particle migration
136 bool mapped = true;
137 #else
138 bool mapped = false;
139 #endif
142 * Particle migration
144 GPUVector<pdata_element> gpu_sendbuf(m_exec_conf,mapped);
145 m_gpu_sendbuf.swap(gpu_sendbuf);
147 GPUVector<pdata_element> gpu_recvbuf(m_exec_conf,mapped);
148 m_gpu_recvbuf.swap(gpu_recvbuf);
 150 // Communication flags for every particle sent
151 GPUVector<unsigned int> comm_flags(m_exec_conf);
152 m_comm_flags.swap(comm_flags);
154 // Key for every particle sent
155 GPUVector<unsigned int> send_keys(m_exec_conf);
156 m_send_keys.swap(send_keys);
159 * Ghost communication
162 GPUVector<unsigned int> tag_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
163 m_tag_ghost_sendbuf.swap(tag_ghost_sendbuf);
165 GPUVector<unsigned int> tag_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
166 m_tag_ghost_recvbuf.swap(tag_ghost_recvbuf);
168 GPUVector<Scalar4> pos_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
169 m_pos_ghost_sendbuf.swap(pos_ghost_sendbuf);
171 GPUVector<Scalar4> pos_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
172 m_pos_ghost_recvbuf.swap(pos_ghost_recvbuf);
174 GPUVector<Scalar4> vel_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
175 m_vel_ghost_sendbuf.swap(vel_ghost_sendbuf);
177 GPUVector<Scalar4> vel_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
178 m_vel_ghost_recvbuf.swap(vel_ghost_recvbuf);
180 GPUVector<Scalar> charge_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
181 m_charge_ghost_sendbuf.swap(charge_ghost_sendbuf);
183 GPUVector<Scalar> charge_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
184 m_charge_ghost_recvbuf.swap(charge_ghost_recvbuf);
186 GPUVector<Scalar> diameter_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
187 m_diameter_ghost_sendbuf.swap(diameter_ghost_sendbuf);
189 GPUVector<Scalar> diameter_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
190 m_diameter_ghost_recvbuf.swap(diameter_ghost_recvbuf);
192 GPUVector<Scalar4> orientation_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
193 m_orientation_ghost_sendbuf.swap(orientation_ghost_sendbuf);
195 GPUVector<Scalar4> orientation_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
196 m_orientation_ghost_recvbuf.swap(orientation_ghost_recvbuf);
198 #ifndef ENABLE_MPI_CUDA
 199 // initialize standby buffers (opposite mapped setting)
200 GPUVector<unsigned int> tag_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
201 m_tag_ghost_sendbuf_alt.swap(tag_ghost_sendbuf_alt);
203 GPUVector<unsigned int> tag_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
204 m_tag_ghost_recvbuf_alt.swap(tag_ghost_recvbuf_alt);
206 GPUVector<Scalar4> pos_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
207 m_pos_ghost_sendbuf_alt.swap(pos_ghost_sendbuf_alt);
209 GPUVector<Scalar4> pos_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
210 m_pos_ghost_recvbuf_alt.swap(pos_ghost_recvbuf_alt);
212 GPUVector<Scalar4> vel_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
213 m_vel_ghost_sendbuf_alt.swap(vel_ghost_sendbuf_alt);
215 GPUVector<Scalar4> vel_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
216 m_vel_ghost_recvbuf_alt.swap(vel_ghost_recvbuf_alt);
218 GPUVector<Scalar> charge_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
219 m_charge_ghost_sendbuf_alt.swap(charge_ghost_sendbuf_alt);
221 GPUVector<Scalar> charge_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
222 m_charge_ghost_recvbuf_alt.swap(charge_ghost_recvbuf_alt);
224 GPUVector<Scalar> diameter_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
225 m_diameter_ghost_sendbuf_alt.swap(diameter_ghost_sendbuf_alt);
227 GPUVector<Scalar> diameter_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
228 m_diameter_ghost_recvbuf_alt.swap(diameter_ghost_recvbuf_alt);
230 GPUVector<Scalar4> orientation_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
231 m_orientation_ghost_sendbuf_alt.swap(orientation_ghost_sendbuf_alt);
233 GPUVector<Scalar4> orientation_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
234 m_orientation_ghost_recvbuf_alt.swap(orientation_ghost_recvbuf_alt);
235 #endif
237 GPUVector<unsigned int> ghost_begin(m_exec_conf,true);
238 m_ghost_begin.swap(ghost_begin);
240 GPUVector<unsigned int> ghost_end(m_exec_conf,true);
241 m_ghost_end.swap(ghost_end);
243 GPUVector<unsigned int> ghost_plan(m_exec_conf);
244 m_ghost_plan.swap(ghost_plan);
246 GPUVector<uint2> ghost_idx_adj(m_exec_conf);
247 m_ghost_idx_adj.swap(ghost_idx_adj);
249 GPUVector<unsigned int> ghost_neigh(m_exec_conf);
250 m_ghost_neigh.swap(ghost_neigh);
252 GPUVector<unsigned int> neigh_counts(m_exec_conf);
253 m_neigh_counts.swap(neigh_counts);
256 void CommunicatorGPU::initializeCommunicationStages()
258 // sanity check for user input
259 if (m_max_stages == 0)
261 m_exec_conf->msg->warning()
262 << "Maximum number of communication stages needs to be greater than zero. Assuming one."
263 << std::endl;
264 m_max_stages = 1;
267 if (m_max_stages > 3)
269 m_exec_conf->msg->warning()
270 << "Maximum number of communication stages too large. Assuming three."
271 << std::endl;
272 m_max_stages = 3;
 275 // access neighbors and adjacency array
276 ArrayHandle<unsigned int> h_adj_mask(m_adj_mask, access_location::host, access_mode::read);
278 Index3D di= m_decomposition->getDomainIndexer();
280 // number of stages in every communication step
281 m_num_stages = 0;
283 m_comm_mask.clear();
284 m_comm_mask.resize(m_max_stages,0);
286 const unsigned int mask_east = 1 << 2 | 1 << 5 | 1 << 8 | 1 << 11
287 | 1 << 14 | 1 << 17 | 1 << 20 | 1 << 23 | 1 << 26;
288 const unsigned int mask_west = mask_east >> 2;
289 const unsigned int mask_north = 1 << 6 | 1 << 7 | 1 << 8 | 1 << 15
290 | 1 << 16 | 1 << 17 | 1 << 24 | 1 << 25 | 1 << 26;
291 const unsigned int mask_south = mask_north >> 6;
292 const unsigned int mask_up = 1 << 18 | 1 << 19 | 1 << 20 | 1 << 21
293 | 1 << 22 | 1 << 23 | 1 << 24 | 1 << 25 | 1 << 26;
294 const unsigned int mask_down = mask_up >> 18;
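// The nine set bits of each mask above are consistent with an adjacency bit
// layout of bit = (dx+1) + 3*(dy+1) + 9*(dz+1) for neighbor offsets dx,dy,dz
// in {-1,0,+1}; e.g. mask_east collects every neighbor with dx = +1
// (bits 2,5,8,...,26). Sketch of how such a mask could be generated
// (illustration only, not used by the code below):
//
//     unsigned int mask_east_gen = 0;
//     for (int dz = -1; dz <= 1; ++dz)
//         for (int dy = -1; dy <= 1; ++dy)
//             mask_east_gen |= 1u << ((1+1) + 3*(dy+1) + 9*(dz+1)); // dx = +1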
296 // loop through neighbors to determine the communication stages
297 std::vector<unsigned int> neigh_flags(m_n_unique_neigh);
298 unsigned int max_stage = 0;
299 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ++ineigh)
301 int stage = 0;
302 int n = -1;
304 // determine stage
305 if (di.getW() > 1 && (n+1 < (int) m_max_stages)) n++;
306 if (h_adj_mask.data[ineigh] & (mask_east | mask_west)) stage = n;
307 if (di.getH() > 1 && (n+1 < (int) m_max_stages)) n++;
308 if (h_adj_mask.data[ineigh] & (mask_north | mask_south)) stage = n;
309 if (di.getD() > 1 && (n+1 < (int) m_max_stages)) n++;
310 if (h_adj_mask.data[ineigh] & (mask_up | mask_down)) stage = n;
312 assert(stage >= 0);
313 assert(n >= 0);
315 unsigned int mask = 0;
316 if (h_adj_mask.data[ineigh] & mask_east)
317 mask |= send_east;
318 if (h_adj_mask.data[ineigh] & mask_west)
319 mask |= send_west;
320 if (h_adj_mask.data[ineigh] & mask_north)
321 mask |= send_north;
322 if (h_adj_mask.data[ineigh] & mask_south)
323 mask |= send_south;
324 if (h_adj_mask.data[ineigh] & mask_up)
325 mask |= send_up;
326 if (h_adj_mask.data[ineigh] & mask_down)
327 mask |= send_down;
329 neigh_flags[ineigh] = mask;
331 // set communication flags for stage
332 m_comm_mask[stage] |= mask;
334 if (stage > (int)max_stage) max_stage = stage;
337 // number of communication stages
338 m_num_stages = max_stage + 1;
 340 // every direction occurs in one and only one stage
341 // number of communications per stage is constant or decreases with stage number
342 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
343 for (unsigned int jstage = istage+1; jstage < m_num_stages; ++jstage)
344 m_comm_mask[jstage] &= ~m_comm_mask[istage];
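// Worked example: with two stages, if stage 0 owns send_east|send_west and
// stage 1 accumulated send_east|send_west|send_north|send_south, the loop above
// clears the east/west bits from stage 1, leaving north/south only, so every
// direction is handled by exactly one stage.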
346 // access unique neighbors
347 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
349 // initialize stages array
350 m_stages.resize(m_n_unique_neigh,-1);
352 // assign stages to unique neighbors
353 for (unsigned int i= 0; i < m_n_unique_neigh; i++)
354 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
355 // compare adjacency masks of neighbors to the mask for this stage
356 if ((neigh_flags[i] & m_comm_mask[istage]) == neigh_flags[i])
358 m_stages[i] = istage;
359 break; // associate neighbor with stage of lowest index
 362 m_exec_conf->msg->notice(4) << "CommunicatorGPU: Using " << m_num_stages
363 << " communication stage(s)." << std::endl;
 366 //! Functor computing the destination rank for a particle selected for migration
367 struct get_migrate_key : public std::unary_function<const unsigned int, unsigned int >
369 const uint3 my_pos; //!< My domain decomposition position
370 const Index3D di; //!< Domain indexer
371 const unsigned int mask; //!< Mask of allowed directions
372 const unsigned int *h_cart_ranks; //!< Rank lookup table
374 //! Constructor
377 get_migrate_key(const uint3 _my_pos, const Index3D _di, const unsigned int _mask,
378 const unsigned int *_h_cart_ranks)
379 : my_pos(_my_pos), di(_di), mask(_mask), h_cart_ranks(_h_cart_ranks)
382 //! Generate key for a sent particle
383 unsigned int operator()(const unsigned int flags)
385 int ix, iy, iz;
386 ix = iy = iz = 0;
388 if ((flags & Communicator::send_east) && (mask & Communicator::send_east))
389 ix = 1;
390 else if ((flags & Communicator::send_west) && (mask & Communicator::send_west))
391 ix = -1;
393 if ((flags & Communicator::send_north) && (mask & Communicator::send_north))
394 iy = 1;
395 else if ((flags & Communicator::send_south) && (mask & Communicator::send_south))
396 iy = -1;
398 if ((flags & Communicator::send_up) && (mask & Communicator::send_up))
399 iz = 1;
400 else if ((flags & Communicator::send_down) && (mask & Communicator::send_down))
401 iz = -1;
403 // sanity check: particle has to be sent somewhere
404 assert(ix || iy || iz);
406 int i = my_pos.x;
407 int j = my_pos.y;
408 int k = my_pos.z;
410 i += ix;
411 if (i == (int)di.getW())
412 i = 0;
413 else if (i < 0)
414 i += di.getW();
416 j += iy;
417 if (j == (int)di.getH())
418 j = 0;
419 else if (j < 0)
420 j += di.getH();
422 k += iz;
423 if (k == (int)di.getD())
424 k = 0;
425 else if (k < 0)
426 k += di.getD();
428 return h_cart_ranks[di(i,j,k)];
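// Usage sketch (the actual call site is in migrateParticles() below): the
// functor maps a particle's communication flags to the MPI rank of the
// neighboring domain it should be sent to, wrapping periodically at the grid
// boundaries.
//
//     get_migrate_key key(my_pos, di, mask, h_cart_ranks.data);
//     unsigned int dest = key(Communicator::send_east); // rank of the +x neighbor
//
// If my_pos.x == di.getW()-1, the +x neighbor wraps around to grid column 0.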
433 //! Constructor
434 template<class group_data>
435 CommunicatorGPU::GroupCommunicatorGPU<group_data>::GroupCommunicatorGPU(CommunicatorGPU& gpu_comm, boost::shared_ptr<group_data> gdata)
436 : m_gpu_comm(gpu_comm), m_exec_conf(m_gpu_comm.m_exec_conf), m_gdata(gdata)
 438 // use mapped buffers to accelerate copying of data for host (non-CUDA-aware) MPI
439 #ifdef ENABLE_MPI_CUDA
440 bool mapped = false;
441 #else
442 bool mapped = true;
443 #endif
445 GPUVector<unsigned int> rank_mask(m_gpu_comm.m_exec_conf);
446 m_rank_mask.swap(rank_mask);
448 GPUVector<unsigned int> scratch(m_gpu_comm.m_exec_conf);
449 m_scan.swap(scratch);
451 GPUVector<rank_element_t> ranks_out(m_gpu_comm.m_exec_conf,mapped);
452 m_ranks_out.swap(ranks_out);
454 GPUVector<rank_element_t> ranks_sendbuf(m_gpu_comm.m_exec_conf,mapped);
455 m_ranks_sendbuf.swap(ranks_sendbuf);
457 GPUVector<rank_element_t> ranks_recvbuf(m_gpu_comm.m_exec_conf,mapped);
458 m_ranks_recvbuf.swap(ranks_recvbuf);
460 GPUVector<group_element_t> groups_out(m_gpu_comm.m_exec_conf,mapped);
461 m_groups_out.swap(groups_out);
463 GPUVector<unsigned int> rank_mask_out(m_gpu_comm.m_exec_conf,mapped);
464 m_rank_mask_out.swap(rank_mask_out);
466 GPUVector<group_element_t> groups_sendbuf(m_gpu_comm.m_exec_conf,mapped);
467 m_groups_sendbuf.swap(groups_sendbuf);
469 GPUVector<group_element_t> groups_recvbuf(m_gpu_comm.m_exec_conf,mapped);
470 m_groups_recvbuf.swap(groups_recvbuf);
472 GPUVector<group_element_t> groups_in(m_gpu_comm.m_exec_conf, mapped);
473 m_groups_in.swap(groups_in);
 475 // the size of the bit field must be greater than or equal to the group size
476 assert(sizeof(unsigned int)*8 >= group_data::size);
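// (Bit j of a group's rank mask refers to group member j, so a 32-bit mask
// supports groups of up to 32 members; bonds have 2, angles 3, dihedrals and
// impropers 4.)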
479 //! Migrate groups
480 template<class group_data>
481 void CommunicatorGPU::GroupCommunicatorGPU<group_data>::migrateGroups(bool incomplete)
483 if (m_gdata->getNGlobal())
485 m_exec_conf->msg->notice(7) << "GroupCommunicator<" << m_gdata->getName() << ">: migrate" << std::endl;
487 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, m_gdata->getName());
489 // resize bitmasks
490 m_rank_mask.resize(m_gdata->getN());
 492 // resize temporary array
493 m_scan.resize(m_gdata->getN());
495 unsigned int n_out_ranks;
497 ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::read);
498 ArrayHandle<typename group_data::members_t> d_members(m_gdata->getMembersArray(), access_location::device, access_mode::read);
499 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
500 ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::overwrite);
501 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
502 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::overwrite);
504 boost::shared_ptr<DomainDecomposition> decomposition = m_gpu_comm.m_pdata->getDomainDecomposition();
505 ArrayHandle<unsigned int> d_cart_ranks(decomposition->getCartRanks(), access_location::device, access_mode::read);
507 Index3D di = decomposition->getDomainIndexer();
508 uint3 my_pos = decomposition->getGridPos();
510 // mark groups that have members leaving this domain
511 gpu_mark_groups<group_data::size>(
512 m_gpu_comm.m_pdata->getN(),
513 d_comm_flags.data,
514 m_gdata->getN(),
515 d_members.data,
516 d_group_ranks.data,
517 d_rank_mask.data,
518 d_rtag.data,
519 d_scan.data,
520 n_out_ranks,
522 my_pos,
523 d_cart_ranks.data,
524 incomplete,
525 m_gpu_comm.m_mgpu_context);
527 if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
530 // resize output array
531 m_ranks_out.resize(n_out_ranks);
533 unsigned int n_out_groups;
535 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
536 ArrayHandle<typename group_data::members_t> d_members(m_gdata->getMembersArray(), access_location::device, access_mode::read);
537 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
538 ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::readwrite);
540 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
541 ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::readwrite);
543 ArrayHandle<rank_element_t> d_ranks_out(m_ranks_out, access_location::device, access_mode::overwrite);
545 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);
547 // scatter groups into output arrays according to scan result (d_scan), determine send groups and scan
548 gpu_scatter_ranks_and_mark_send_groups<group_data::size>(
549 m_gdata->getN(),
550 d_group_tag.data,
551 d_group_ranks.data,
552 d_rank_mask.data,
553 d_members.data,
554 d_rtag.data,
555 d_comm_flags.data,
556 d_scan.data,
557 n_out_groups,
558 d_ranks_out.data,
559 m_gpu_comm.m_mgpu_context);
561 if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
564 #if defined(ENABLE_MPI_CUDA) && 0
565 #else
566 // fill host send buffers on host
567 unsigned int my_rank = m_gpu_comm.m_exec_conf->getRank();
569 typedef std::multimap<unsigned int, rank_element_t> map_t;
570 map_t send_map;
573 // access output buffers
574 ArrayHandle<rank_element_t> h_ranks_out(m_ranks_out, access_location::host, access_mode::read);
575 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location:: host, access_mode::read);
577 for (unsigned int i = 0; i < n_out_ranks; ++i)
579 rank_element_t el = h_ranks_out.data[i];
580 typename group_data::ranks_t r = el.ranks;
581 unsigned int mask = el.mask;
583 if (incomplete)
584 // in initialization, send to all neighbors
585 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ++ineigh)
586 send_map.insert(std::make_pair(h_unique_neighbors.data[ineigh], el));
587 else
588 // send to other ranks owning the bonded group
589 for (unsigned int j = 0; j < group_data::size; ++j)
591 unsigned int rank = r.idx[j];
592 bool updated = mask & (1 << j);
593 // send out to ranks different from ours
594 if (rank != my_rank && !updated)
595 send_map.insert(std::make_pair(rank, el));
600 // resize send buffers
601 m_ranks_sendbuf.resize(send_map.size());
604 // access send buffers
605 ArrayHandle<rank_element_t> h_ranks_sendbuf(m_ranks_sendbuf, access_location::host, access_mode::overwrite);
607 // output send data sorted by rank
608 unsigned int n = 0;
609 for (typename map_t::iterator it = send_map.begin(); it != send_map.end(); ++it)
611 h_ranks_sendbuf.data[n] = it->second;
612 n++;
615 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
616 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::overwrite);
617 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::overwrite);
619 // Find start and end indices
620 for (unsigned int i = 0; i < m_gpu_comm.m_n_unique_neigh; ++i)
622 typename map_t::iterator lower = send_map.lower_bound(h_unique_neighbors.data[i]);
623 typename map_t::iterator upper = send_map.upper_bound(h_unique_neighbors.data[i]);
624 h_begin.data[i] = std::distance(send_map.begin(),lower);
625 h_end.data[i] = std::distance(send_map.begin(),upper);
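// Worked example: std::multimap iterates its entries in key (rank) order and the
// send buffer was filled in that same order above, so lower_bound/upper_bound
// yield contiguous [begin,end) index ranges per neighbor. With unique neighbors
// {3,7} and send_map = {3:a, 3:b, 7:c}, this gives h_begin = {0,2} and h_end = {2,3}.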
628 #endif
631 * communicate rank information (phase 1)
633 unsigned int n_send_groups[m_gpu_comm.m_n_unique_neigh];
634 unsigned int n_recv_groups[m_gpu_comm.m_n_unique_neigh];
635 unsigned int offs[m_gpu_comm.m_n_unique_neigh];
636 unsigned int n_recv_tot = 0;
639 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
640 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
641 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
643 unsigned int send_bytes = 0;
644 unsigned int recv_bytes = 0;
645 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");
647 // compute send counts
648 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
649 n_send_groups[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];
651 MPI_Request req[2*m_gpu_comm.m_n_unique_neigh];
652 MPI_Status stat[2*m_gpu_comm.m_n_unique_neigh];
654 unsigned int nreq = 0;
656 // loop over neighbors
657 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
659 // rank of neighbor processor
660 unsigned int neighbor = h_unique_neighbors.data[ineigh];
662 MPI_Isend(&n_send_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
663 MPI_Irecv(&n_recv_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
664 send_bytes += sizeof(unsigned int);
665 recv_bytes += sizeof(unsigned int);
666 } // end neighbor loop
668 MPI_Waitall(nreq, req, stat);
670 // sum up receive counts
671 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
673 if (ineigh == 0)
674 offs[ineigh] = 0;
675 else
676 offs[ineigh] = offs[ineigh-1] + n_recv_groups[ineigh-1];
678 n_recv_tot += n_recv_groups[ineigh];
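// (offs[] is the exclusive prefix sum of the receive counts and locates each
// neighbor's slot in the contiguous receive buffer: n_recv_groups = {4,0,2}
// yields offs = {0,4,4} and n_recv_tot = 6.)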
681 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
684 // Resize receive buffer
685 m_ranks_recvbuf.resize(n_recv_tot);
688 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
689 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
690 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
692 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf,"MPI send/recv");
694 #ifdef ENABLE_MPI_CUDA
695 ArrayHandle<rank_element_t> ranks_sendbuf_handle(m_ranks_sendbuf, access_location::device, access_mode::read);
696 ArrayHandle<rank_element_t> ranks_recvbuf_handle(m_ranks_recvbuf, access_location::device, access_mode::overwrite);
 698 // the MPI library may use a non-zero stream
699 cudaDeviceSynchronize();
700 #else
701 ArrayHandle<rank_element_t> ranks_sendbuf_handle(m_ranks_sendbuf, access_location::host, access_mode::read);
702 ArrayHandle<rank_element_t> ranks_recvbuf_handle(m_ranks_recvbuf, access_location::host, access_mode::overwrite);
703 #endif
705 std::vector<MPI_Request> reqs;
706 MPI_Request req;
708 unsigned int send_bytes = 0;
709 unsigned int recv_bytes = 0;
711 // loop over neighbors
712 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
714 // rank of neighbor processor
715 unsigned int neighbor = h_unique_neighbors.data[ineigh];
717 // exchange particle data
718 if (n_send_groups[ineigh])
720 MPI_Isend(ranks_sendbuf_handle.data+h_begin.data[ineigh],
721 n_send_groups[ineigh]*sizeof(rank_element_t),
722 MPI_BYTE,
723 neighbor,
725 m_gpu_comm.m_mpi_comm,
726 &req);
727 reqs.push_back(req);
729 send_bytes+= n_send_groups[ineigh]*sizeof(rank_element_t);
731 if (n_recv_groups[ineigh])
733 MPI_Irecv(ranks_recvbuf_handle.data+offs[ineigh],
734 n_recv_groups[ineigh]*sizeof(rank_element_t),
735 MPI_BYTE,
736 neighbor,
738 m_gpu_comm.m_mpi_comm,
739 &req);
740 reqs.push_back(req);
742 recv_bytes += n_recv_groups[ineigh]*sizeof(rank_element_t);
745 std::vector<MPI_Status> stats(reqs.size());
746 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
748 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
752 // access receive buffers
753 ArrayHandle<rank_element_t> d_ranks_recvbuf(m_ranks_recvbuf, access_location::device, access_mode::read);
754 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
755 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::read);
757 // update local rank information
758 gpu_update_ranks_table<group_data::size>(
759 m_gdata->getN(),
760 d_group_ranks.data,
761 d_group_rtag.data,
762 n_recv_tot,
763 d_ranks_recvbuf.data);
764 if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
767 // resize output buffer
768 m_groups_out.resize(n_out_groups);
769 m_rank_mask_out.resize(n_out_groups);
772 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
773 ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::read);
774 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
775 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);
776 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
777 ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::readwrite);
778 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);
779 ArrayHandle<group_element_t> d_groups_out(m_groups_out, access_location::device, access_mode::overwrite);
780 ArrayHandle<unsigned int> d_rank_mask_out(m_rank_mask_out, access_location::device, access_mode::overwrite);
781 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
782 ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::read);
784 // scatter groups to be sent into output buffer, mark groups that have no local members for removal
785 gpu_scatter_and_mark_groups_for_removal<group_data::size>(
786 m_gdata->getN(),
787 d_groups.data,
788 d_group_type.data,
789 d_group_tag.data,
790 d_group_rtag.data,
791 d_group_ranks.data,
792 d_rank_mask.data,
793 d_rtag.data,
794 d_comm_flags.data,
795 m_exec_conf->getRank(),
796 d_scan.data,
797 d_groups_out.data,
798 d_rank_mask_out.data);
799 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
802 unsigned int new_ngroups;
804 // access primary arrays to read from
805 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
806 ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::read);
807 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
808 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
810 // access alternate arrays to write to
811 ArrayHandle<typename group_data::members_t> d_groups_alt(m_gdata->getAltMembersArray(), access_location::device, access_mode::overwrite);
812 ArrayHandle<unsigned int> d_group_type_alt(m_gdata->getAltTypesArray(), access_location::device, access_mode::overwrite);
813 ArrayHandle<unsigned int> d_group_tag_alt(m_gdata->getAltTags(), access_location::device, access_mode::overwrite);
814 ArrayHandle<typename group_data::ranks_t> d_group_ranks_alt(m_gdata->getAltRanksArray(), access_location::device, access_mode::overwrite);
816 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);
818 // access rtags
819 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);
821 unsigned int ngroups = m_gdata->getN();
823 // remove groups from local table
824 gpu_remove_groups(ngroups,
825 d_groups.data,
826 d_groups_alt.data,
827 d_group_type.data,
828 d_group_type_alt.data,
829 d_group_tag.data,
830 d_group_tag_alt.data,
831 d_group_ranks.data,
832 d_group_ranks_alt.data,
833 d_group_rtag.data,
834 new_ngroups,
835 d_scan.data,
836 m_gpu_comm.m_mgpu_context);
837 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
840 // resize alternate arrays to number of groups
841 GPUVector<typename group_data::members_t>& alt_groups_array = m_gdata->getAltMembersArray();
842 GPUVector<unsigned int>& alt_group_type_array = m_gdata->getAltTypesArray();
843 GPUVector<unsigned int>& alt_group_tag_array = m_gdata->getAltTags();
844 GPUVector<typename group_data::ranks_t>& alt_group_ranks_array = m_gdata->getAltRanksArray();
846 assert(new_ngroups <= m_gdata->getN());
847 alt_groups_array.resize(new_ngroups);
848 alt_group_type_array.resize(new_ngroups);
849 alt_group_tag_array.resize(new_ngroups);
850 alt_group_ranks_array.resize(new_ngroups);
852 // make alternate arrays current
853 m_gdata->swapMemberArrays();
854 m_gdata->swapTypeArrays();
855 m_gdata->swapTagArrays();
856 m_gdata->swapRankArrays();
858 assert(m_gdata->getN() == new_ngroups);
860 #if defined(ENABLE_MPI_CUDA) && 0
861 #else
862 // fill host send buffers on host
863 typedef std::multimap<unsigned int, group_element_t> group_map_t;
864 group_map_t group_send_map;
867 // access output buffers
868 ArrayHandle<group_element_t> h_groups_out(m_groups_out, access_location::host, access_mode::read);
869 ArrayHandle<unsigned int> h_rank_mask_out(m_rank_mask_out, access_location::host, access_mode::read);
871 for (unsigned int i = 0; i < n_out_groups; ++i)
873 group_element_t el = h_groups_out.data[i];
874 typename group_data::ranks_t ranks = el.ranks;
876 for (unsigned int j = 0; j < group_data::size; ++j)
878 unsigned int rank = ranks.idx[j];
879 // are we sending to this rank?
880 if (h_rank_mask_out.data[i] & (1 << j))
881 group_send_map.insert(std::make_pair(rank, el));
886 // resize send buffers
887 m_groups_sendbuf.resize(group_send_map.size());
890 // access send buffers
891 ArrayHandle<group_element_t> h_groups_sendbuf(m_groups_sendbuf, access_location::host, access_mode::overwrite);
893 // output send data sorted by rank
894 unsigned int n = 0;
895 for (typename group_map_t::iterator it = group_send_map.begin(); it != group_send_map.end(); ++it)
897 h_groups_sendbuf.data[n] = it->second;
898 n++;
901 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
902 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::overwrite);
903 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::overwrite);
905 // Find start and end indices
906 for (unsigned int i = 0; i < m_gpu_comm.m_n_unique_neigh; ++i)
908 typename group_map_t::iterator lower = group_send_map.lower_bound(h_unique_neighbors.data[i]);
909 typename group_map_t::iterator upper = group_send_map.upper_bound(h_unique_neighbors.data[i]);
910 h_begin.data[i] = std::distance(group_send_map.begin(),lower);
911 h_end.data[i] = std::distance(group_send_map.begin(),upper);
914 #endif
917 * communicate groups (phase 2)
920 n_recv_tot = 0;
922 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
923 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
924 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
926 unsigned int send_bytes = 0;
927 unsigned int recv_bytes = 0;
928 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");
930 // compute send counts
931 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
932 n_send_groups[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];
934 MPI_Request req[2*m_gpu_comm.m_n_unique_neigh];
935 MPI_Status stat[2*m_gpu_comm.m_n_unique_neigh];
937 unsigned int nreq = 0;
939 // loop over neighbors
940 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
942 // rank of neighbor processor
943 unsigned int neighbor = h_unique_neighbors.data[ineigh];
945 MPI_Isend(&n_send_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
946 MPI_Irecv(&n_recv_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
947 send_bytes += sizeof(unsigned int);
948 recv_bytes += sizeof(unsigned int);
949 } // end neighbor loop
951 MPI_Waitall(nreq, req, stat);
953 // sum up receive counts
954 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
956 if (ineigh == 0)
957 offs[ineigh] = 0;
958 else
959 offs[ineigh] = offs[ineigh-1] + n_recv_groups[ineigh-1];
961 n_recv_tot += n_recv_groups[ineigh];
964 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
967 // Resize receive buffer
968 m_groups_recvbuf.resize(n_recv_tot);
971 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
972 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
973 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
975 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf,"MPI send/recv");
977 #if defined(ENABLE_MPI_CUDA) && 0
978 #else
979 ArrayHandle<group_element_t> groups_sendbuf_handle(m_groups_sendbuf, access_location::host, access_mode::read);
980 ArrayHandle<group_element_t> groups_recvbuf_handle(m_groups_recvbuf, access_location::host, access_mode::overwrite);
981 #endif
983 std::vector<MPI_Request> reqs;
984 MPI_Request req;
986 unsigned int send_bytes = 0;
987 unsigned int recv_bytes = 0;
989 // loop over neighbors
990 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
992 // rank of neighbor processor
993 unsigned int neighbor = h_unique_neighbors.data[ineigh];
995 // exchange particle data
996 if (n_send_groups[ineigh])
998 MPI_Isend(groups_sendbuf_handle.data+h_begin.data[ineigh],
999 n_send_groups[ineigh]*sizeof(group_element_t),
1000 MPI_BYTE,
1001 neighbor,
1003 m_gpu_comm.m_mpi_comm,
1004 &req);
1005 reqs.push_back(req);
1007 send_bytes+= n_send_groups[ineigh]*sizeof(group_element_t);
1009 if (n_recv_groups[ineigh])
1011 MPI_Irecv(groups_recvbuf_handle.data+offs[ineigh],
1012 n_recv_groups[ineigh]*sizeof(group_element_t),
1013 MPI_BYTE,
1014 neighbor,
1016 m_gpu_comm.m_mpi_comm,
1017 &req);
1018 reqs.push_back(req);
1020 recv_bytes += n_recv_groups[ineigh]*sizeof(group_element_t);
1023 std::vector<MPI_Status> stats(reqs.size());
1024 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
1026 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1029 unsigned int n_recv_unique = 0;
1030 #if defined(ENABLE_MPI_CUDA) && 0
1031 #else
1033 ArrayHandle<group_element_t> h_groups_recvbuf(m_groups_recvbuf, access_location::host, access_mode::read);
 1035 // use a std::map (unique keys) to filter out duplicate groups in the input buffer
1036 typedef std::map<unsigned int, group_element_t> recv_map_t;
1037 recv_map_t recv_map;
1039 for (unsigned int recv_idx = 0; recv_idx < n_recv_tot; recv_idx++)
1041 group_element_t el = h_groups_recvbuf.data[recv_idx];
1042 unsigned int tag= el.group_tag;
1043 recv_map.insert(std::make_pair(tag, el));
1046 // resize input array of unique groups
1047 m_groups_in.resize(recv_map.size());
1049 // write out unique groups
1050 ArrayHandle<group_element_t> h_groups_in(m_groups_in, access_location::host, access_mode::overwrite);
1051 for (typename recv_map_t::iterator it = recv_map.begin(); it != recv_map.end(); ++it)
1052 h_groups_in.data[n_recv_unique++] = it->second;
1053 assert(n_recv_unique == recv_map.size());
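// Worked example of the deduplication above: if two neighboring ranks both sent
// the group with tag 42 (a group spanning several domains can be forwarded by
// more than one of them), the second copy is dropped because std::map::insert
// ignores an insert with an already existing key.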
1055 #endif
1057 unsigned int old_ngroups = m_gdata->getN();
1058 new_ngroups = old_ngroups + n_recv_unique;
 1060 // resize group arrays to accommodate additional groups (there can still be duplicates with local groups)
1061 GPUVector<typename group_data::members_t>& groups_array = m_gdata->getMembersArray();
1062 GPUVector<unsigned int>& group_type_array = m_gdata->getTypesArray();
1063 GPUVector<unsigned int>& group_tag_array = m_gdata->getTags();
1064 GPUVector<typename group_data::ranks_t>& group_ranks_array = m_gdata->getRanksArray();
1066 groups_array.resize(new_ngroups);
1067 group_type_array.resize(new_ngroups);
1068 group_tag_array.resize(new_ngroups);
1069 group_ranks_array.resize(new_ngroups);
1072 ArrayHandle<group_element_t> d_groups_in(m_groups_in, access_location::device, access_mode::read);
1073 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::readwrite);
1074 ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::readwrite);
1075 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::readwrite);
1076 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
1077 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);
1079 // get temp buffer
1080 ScopedAllocation<unsigned int> d_tmp(m_exec_conf->getCachedAllocator(), n_recv_unique);
1082 // add new groups, updating groups that are already present locally
1083 gpu_add_groups(old_ngroups,
1084 n_recv_unique,
1085 d_groups_in.data,
1086 d_groups.data,
1087 d_group_type.data,
1088 d_group_tag.data,
1089 d_group_ranks.data,
1090 d_group_rtag.data,
1091 new_ngroups,
1092 d_tmp.data,
1093 m_gpu_comm.m_mgpu_context);
1094 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1097 // resize arrays to final size
1098 groups_array.resize(new_ngroups);
1099 group_type_array.resize(new_ngroups);
1100 group_tag_array.resize(new_ngroups);
1101 group_ranks_array.resize(new_ngroups);
1103 // indicate that group table has changed
1104 m_gdata->setDirty();
1106 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf);
1111 //! Mark ghost particles
1112 template<class group_data>
1113 void CommunicatorGPU::GroupCommunicatorGPU<group_data>::markGhostParticles(
1114 const GPUArray<unsigned int>& plans,
1115 unsigned int mask)
1117 if (m_gdata->getNGlobal())
1119 m_exec_conf->msg->notice(7) << "GroupCommunicator<" << m_gdata->getName() << ">: find incomplete groups" << std::endl;
1121 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
1122 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
1123 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
1124 ArrayHandle<Scalar4> d_pos(m_gpu_comm.m_pdata->getPositions(), access_location::device, access_mode::read);
1125 ArrayHandle<unsigned int> d_plan(plans, access_location::device, access_mode::readwrite);
1127 boost::shared_ptr<DomainDecomposition> decomposition = m_gpu_comm.m_pdata->getDomainDecomposition();
1128 ArrayHandle<unsigned int> d_cart_ranks_inv(decomposition->getInverseCartRanks(), access_location::device, access_mode::read);
1129 Index3D di = decomposition->getDomainIndexer();
1130 uint3 my_pos = decomposition->getGridPos();
1132 gpu_mark_bonded_ghosts<group_data::size>(
1133 m_gdata->getN(),
1134 d_groups.data,
1135 d_group_ranks.data,
1136 d_pos.data,
1137 m_gpu_comm.m_pdata->getBox(),
1138 d_rtag.data,
1139 d_plan.data,
1141 my_pos,
1142 d_cart_ranks_inv.data,
1143 m_exec_conf->getRank(),
1144 mask);
1145 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1149 //! Transfer particles between neighboring domains
1150 void CommunicatorGPU::migrateParticles()
1152 m_exec_conf->msg->notice(7) << "CommunicatorGPU: migrate particles" << std::endl;
1154 // check if box is sufficiently large for communication
1155 Scalar3 L= m_pdata->getBox().getNearestPlaneDistance();
1156 const Index3D& di = m_decomposition->getDomainIndexer();
1157 if ((m_r_ghost >= L.x/Scalar(2.0) && di.getW() > 1) ||
1158 (m_r_ghost >= L.y/Scalar(2.0) && di.getH() > 1) ||
1159 (m_r_ghost >= L.z/Scalar(2.0) && di.getD() > 1))
1161 m_exec_conf->msg->error() << "Simulation box too small for domain decomposition." << std::endl;
1162 throw std::runtime_error("Error during communication");
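// (The check above requires the box to span more than two ghost widths along
// every decomposed direction; e.g. with m_r_ghost = 2.5 the decomposed extent
// must exceed 5.0, otherwise the ghost layers of opposite faces would overlap
// and the nearest-neighbor communication pattern would break down.)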
1165 if (m_prof)
1166 m_prof->push(m_exec_conf,"comm_migrate");
1168 if (m_last_flags[comm_flag::tag])
1170 // Reset reverse lookup tags of old ghost atoms
1171 ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::readwrite);
1172 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
1174 gpu_reset_rtags(m_pdata->getNGhosts(),
1175 d_tag.data + m_pdata->getN(),
1176 d_rtag.data);
1178 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1179 CHECK_CUDA_ERROR();
1182 // reset ghost particle number
1183 m_pdata->removeAllGhostParticles();
1185 // main communication loop
1186 for (unsigned int stage = 0; stage < m_num_stages; stage++)
1189 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
1190 ArrayHandle<unsigned int> d_comm_flag(m_pdata->getCommFlags(), access_location::device, access_mode::readwrite);
1192 assert(stage < m_comm_mask.size());
1194 // mark all particles which have left the box for sending (rtag=NOT_LOCAL)
1195 gpu_stage_particles(m_pdata->getN(),
1196 d_pos.data,
1197 d_comm_flag.data,
1198 m_pdata->getBox(),
1199 m_comm_mask[stage]);
1201 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1202 CHECK_CUDA_ERROR();
1207 * Bonded group communication, determine groups to be sent
1209 // Bonds
1210 m_bond_comm.migrateGroups(m_bonds_changed);
1211 m_bonds_changed = false;
1213 // Angles
1214 m_angle_comm.migrateGroups(m_angles_changed);
1215 m_angles_changed = false;
1217 // Dihedrals
1218 m_dihedral_comm.migrateGroups(m_dihedrals_changed);
1219 m_dihedrals_changed = false;
 1221 // Impropers
1222 m_improper_comm.migrateGroups(m_impropers_changed);
1223 m_impropers_changed = false;
1225 // fill send buffer
1226 m_pdata->removeParticlesGPU(m_gpu_sendbuf, m_comm_flags);
1228 const Index3D& di = m_decomposition->getDomainIndexer();
1229 // determine local particles that are to be sent to neighboring processors and fill send buffer
1230 uint3 mypos = m_decomposition->getGridPos();
1232 /* We need some better heuristics to decide whether to take the GPU or CPU code path */
1233 #if defined(ENABLE_MPI_CUDA) && 0
1235 // resize keys
1236 m_send_keys.resize(m_gpu_sendbuf.size());
1238 ArrayHandle<pdata_element> d_gpu_sendbuf(m_gpu_sendbuf, access_location::device, access_mode::readwrite);
1239 ArrayHandle<unsigned int> d_send_keys(m_send_keys, access_location::device, access_mode::overwrite);
1240 ArrayHandle<unsigned int> d_begin(m_begin, access_location::device, access_mode::overwrite);
1241 ArrayHandle<unsigned int> d_end(m_end, access_location::device, access_mode::overwrite);
1242 ArrayHandle<unsigned int> d_unique_neighbors(m_unique_neighbors, access_location::device, access_mode::read);
1243 ArrayHandle<unsigned int> d_comm_flags(m_comm_flags, access_location::device, access_mode::read);
1245 ArrayHandle<unsigned int> d_cart_ranks(m_decomposition->getCartRanks(), access_location::device, access_mode::read);
1247 // get temporary buffers
1248 unsigned int nsend = m_gpu_sendbuf.size();
1249 const CachedAllocator& alloc = m_exec_conf->getCachedAllocator();
1250 ScopedAllocation<pdata_element> d_in_copy(alloc, nsend);
1251 ScopedAllocation<unsigned int> d_tmp(alloc, nsend);
1253 gpu_sort_migrating_particles(m_gpu_sendbuf.size(),
1254 d_gpu_sendbuf.data,
1255 d_comm_flags.data,
1257 mypos,
1258 d_cart_ranks.data,
1259 d_send_keys.data,
1260 d_begin.data,
1261 d_end.data,
1262 d_unique_neighbors.data,
1263 m_n_unique_neigh,
1264 m_comm_mask[stage],
1265 m_mgpu_context,
1266 d_tmp.data,
1267 d_in_copy.data);
1269 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1270 CHECK_CUDA_ERROR();
1272 #else
1274 ArrayHandle<pdata_element> h_gpu_sendbuf(m_gpu_sendbuf, access_location::host, access_mode::readwrite);
1275 ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::overwrite);
1276 ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::overwrite);
1277 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1278 ArrayHandle<unsigned int> h_comm_flags(m_comm_flags, access_location::host, access_mode::read);
1280 ArrayHandle<unsigned int> h_cart_ranks(m_pdata->getDomainDecomposition()->getCartRanks(), access_location::host,
1281 access_mode::read);
1282 typedef std::multimap<unsigned int,pdata_element> key_t;
1283 key_t keys;
1285 // generate keys
1286 get_migrate_key t(mypos, di, m_comm_mask[stage],h_cart_ranks.data);
1287 for (unsigned int i = 0; i < m_comm_flags.size(); ++i)
1288 keys.insert(std::pair<unsigned int, pdata_element>(t(h_comm_flags.data[i]),h_gpu_sendbuf.data[i]));
1290 // Find start and end indices
1291 for (unsigned int i = 0; i < m_n_unique_neigh; ++i)
1293 key_t::iterator lower = keys.lower_bound(h_unique_neighbors.data[i]);
1294 key_t::iterator upper = keys.upper_bound(h_unique_neighbors.data[i]);
1295 h_begin.data[i] = std::distance(keys.begin(),lower);
1296 h_end.data[i] = std::distance(keys.begin(),upper);
1299 // sort send buffer
1300 unsigned int i = 0;
1301 for (key_t::iterator it = keys.begin(); it != keys.end(); ++it)
1302 h_gpu_sendbuf.data[i++] = it->second;
1304 #endif
1306 unsigned int n_send_ptls[m_n_unique_neigh];
1307 unsigned int n_recv_ptls[m_n_unique_neigh];
1308 unsigned int offs[m_n_unique_neigh];
1309 unsigned int n_recv_tot = 0;
1312 ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::read);
1313 ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::read);
1314 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1316 unsigned int send_bytes = 0;
1317 unsigned int recv_bytes = 0;
1318 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
1320 // compute send counts
1321 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1322 n_send_ptls[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];
1324 MPI_Request req[2*m_n_unique_neigh];
1325 MPI_Status stat[2*m_n_unique_neigh];
1327 unsigned int nreq = 0;
1329 // loop over neighbors
1330 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1332 if (m_stages[ineigh] != (int) stage)
1334 // skip neighbor if not participating in this communication stage
1335 n_send_ptls[ineigh] = 0;
1336 n_recv_ptls[ineigh] = 0;
1337 continue;
1340 // rank of neighbor processor
1341 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1343 MPI_Isend(&n_send_ptls[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1344 MPI_Irecv(&n_recv_ptls[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1345 send_bytes += sizeof(unsigned int);
1346 recv_bytes += sizeof(unsigned int);
1347 } // end neighbor loop
1349 MPI_Waitall(nreq, req, stat);
1351 // sum up receive counts
1352 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1354 if (ineigh == 0)
1355 offs[ineigh] = 0;
1356 else
1357 offs[ineigh] = offs[ineigh-1] + n_recv_ptls[ineigh-1];
1359 n_recv_tot += n_recv_ptls[ineigh];
1362 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1365 // Resize receive buffer
1366 m_gpu_recvbuf.resize(n_recv_tot);
1369 ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::read);
1370 ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::read);
1371 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1373 if (m_prof) m_prof->push(m_exec_conf,"MPI send/recv");
1375 #if defined(ENABLE_MPI_CUDA) && 0
1376 ArrayHandle<pdata_element> gpu_sendbuf_handle(m_gpu_sendbuf, access_location::device, access_mode::read);
1377 ArrayHandle<pdata_element> gpu_recvbuf_handle(m_gpu_recvbuf, access_location::device, access_mode::overwrite);
1378 #else
1379 ArrayHandle<pdata_element> gpu_sendbuf_handle(m_gpu_sendbuf, access_location::host, access_mode::read);
1380 ArrayHandle<pdata_element> gpu_recvbuf_handle(m_gpu_recvbuf, access_location::host, access_mode::overwrite);
1381 #endif
1383 std::vector<MPI_Request> reqs;
1384 MPI_Request req;
1386 unsigned int send_bytes = 0;
1387 unsigned int recv_bytes = 0;
1389 // loop over neighbors
1390 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1392 // rank of neighbor processor
1393 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1395 // exchange particle data
1396 if (n_send_ptls[ineigh])
1398 MPI_Isend(gpu_sendbuf_handle.data+h_begin.data[ineigh],
1399 n_send_ptls[ineigh]*sizeof(pdata_element),
1400 MPI_BYTE,
1401 neighbor,
1403 m_mpi_comm,
1404 &req);
1405 reqs.push_back(req);
1407 send_bytes+= n_send_ptls[ineigh]*sizeof(pdata_element);
1409 if (n_recv_ptls[ineigh])
1411 MPI_Irecv(gpu_recvbuf_handle.data+offs[ineigh],
1412 n_recv_ptls[ineigh]*sizeof(pdata_element),
1413 MPI_BYTE,
1414 neighbor,
1416 m_mpi_comm,
1417 &req);
1418 reqs.push_back(req);
1420 recv_bytes += n_recv_ptls[ineigh]*sizeof(pdata_element);
1423 std::vector<MPI_Status> stats(reqs.size());
1424 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
1426 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1430 ArrayHandle<pdata_element> d_gpu_recvbuf(m_gpu_recvbuf, access_location::device, access_mode::readwrite);
1431 const BoxDim shifted_box = getShiftedBox();
1433 // Apply boundary conditions
1434 gpu_wrap_particles(n_recv_tot,
1435 d_gpu_recvbuf.data,
1436 shifted_box);
1437 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1438 CHECK_CUDA_ERROR();
1441 // remove particles that were sent and fill particle data with received particles
1442 m_pdata->addParticlesGPU(m_gpu_recvbuf);
1444 } // end communication stage
1446 if (m_prof) m_prof->pop(m_exec_conf);
1449 //! Build a ghost particle list, exchange ghost particle data with neighboring processors
1450 void CommunicatorGPU::exchangeGhosts()
1452 if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_exch");
1454 m_exec_conf->msg->notice(7) << "CommunicatorGPU: ghost exchange" << std::endl;
 1456 // the ghost layer must be at least m_r_ghost wide along every lattice direction
1457 Scalar3 ghost_fraction = m_r_ghost/m_pdata->getBox().getNearestPlaneDistance();
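// (ghost_fraction expresses the ghost width as a fraction of the box in each
// direction: e.g. m_r_ghost = 2.5 with a nearest plane distance of 25 gives 0.1,
// i.e. particles in the outermost 10% of the box along that direction become
// candidates for the ghost exchange plan computed below.)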
1459 // resize arrays
1460 m_n_send_ghosts.resize(m_num_stages);
1461 m_n_recv_ghosts.resize(m_num_stages);
1463 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
1465 m_n_send_ghosts[istage].resize(m_n_unique_neigh);
1466 m_n_recv_ghosts[istage].resize(m_n_unique_neigh);
1469 m_n_send_ghosts_tot.resize(m_num_stages);
1470 m_n_recv_ghosts_tot.resize(m_num_stages);
1471 m_ghost_offs.resize(m_num_stages);
1472 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
1473 m_ghost_offs[istage].resize(m_n_unique_neigh);
1475 m_ghost_begin.resize(m_n_unique_neigh*m_num_stages);
1476 m_ghost_end.resize(m_n_unique_neigh*m_num_stages);
1478 m_idx_offs.resize(m_num_stages);
1480 // get requested ghost fields
1481 CommFlags flags = getFlags();
1483 // main communication loop
1484 for (unsigned int stage = 0; stage < m_num_stages; stage++)
1486 // make room for plans
1487 m_ghost_plan.resize(m_pdata->getN()+m_pdata->getNGhosts());
1490 // compute plans for all particles, including already received ghosts
1491 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
1492 ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::overwrite);
1494 gpu_make_ghost_exchange_plan(d_ghost_plan.data,
1495 m_pdata->getN()+m_pdata->getNGhosts(),
1496 d_pos.data,
1497 m_pdata->getBox(),
1498 ghost_fraction,
1499 m_comm_mask[stage]);
1501 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1502 CHECK_CUDA_ERROR();
 1505 // mark particles that are members of incomplete bonded groups as ghosts
1507 // bonds
1508 m_bond_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1510 // angles
1511 m_angle_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1513 // dihedrals
1514 m_dihedral_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1516 // impropers
1517 m_improper_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1519 // resize temporary number of neighbors array
1520 m_neigh_counts.resize(m_pdata->getN()+m_pdata->getNGhosts());
1523 ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::read);
1524 ArrayHandle<unsigned int> d_adj_mask(m_adj_mask, access_location::device, access_mode::read);
1525 ArrayHandle<unsigned int> d_neigh_counts(m_neigh_counts, access_location::device, access_mode::overwrite);
 1527 // count the number of neighbor ranks (total and per particle) that the ghost particles are sent to
1528 m_n_send_ghosts_tot[stage] =
1529 gpu_exchange_ghosts_count_neighbors(
1530 m_pdata->getN()+m_pdata->getNGhosts(),
1531 d_ghost_plan.data,
1532 d_adj_mask.data,
1533 d_neigh_counts.data,
1534 m_n_unique_neigh,
1535 m_mgpu_context);
1537 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1540 // compute offset into ghost idx list
1541 m_idx_offs[stage] = 0;
1542 for (unsigned int i = 0; i < stage; ++i)
1543 m_idx_offs[stage] += m_n_send_ghosts_tot[i];
1545 // compute maximum send buf size
1546 unsigned int n_max = 0;
1547 for (unsigned int istage = 0; istage <= stage; ++istage)
1548 if (m_n_send_ghosts_tot[istage] > n_max) n_max = m_n_send_ghosts_tot[istage];
1550 // make room for ghost indices and neighbor ranks
1551 m_ghost_idx_adj.resize(m_idx_offs[stage] + m_n_send_ghosts_tot[stage]);
1552 m_ghost_neigh.resize(m_idx_offs[stage] + m_n_send_ghosts_tot[stage]);
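1553 // one entry per ghost copy sent in this stage, appended after the entries of the previous stages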
1554 if (flags[comm_flag::tag]) m_tag_ghost_sendbuf.resize(n_max);
1555 if (flags[comm_flag::position]) m_pos_ghost_sendbuf.resize(n_max);
1556 if (flags[comm_flag::velocity]) m_vel_ghost_sendbuf.resize(n_max);
1557 if (flags[comm_flag::charge]) m_charge_ghost_sendbuf.resize(n_max);
1558 if (flags[comm_flag::diameter]) m_diameter_ghost_sendbuf.resize(n_max);
1559 if (flags[comm_flag::orientation]) m_orientation_ghost_sendbuf.resize(n_max);
1562 ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::read);
1563 ArrayHandle<unsigned int> d_adj_mask(m_adj_mask, access_location::device, access_mode::read);
1564 ArrayHandle<unsigned int> d_neigh_counts(m_neigh_counts, access_location::device, access_mode::read);
1565 ArrayHandle<unsigned int> d_unique_neighbors(m_unique_neighbors, access_location::device, access_mode::read);
1567 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
1569 ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::overwrite);
1570 ArrayHandle<unsigned int> d_ghost_neigh(m_ghost_neigh, access_location::device, access_mode::overwrite);
1571 ArrayHandle<unsigned int> d_ghost_begin(m_ghost_begin, access_location::device, access_mode::overwrite);
1572 ArrayHandle<unsigned int> d_ghost_end(m_ghost_end, access_location::device, access_mode::overwrite);
1574 // fill ghost send list and compute start and end indices per unique neighbor in list
1575 gpu_exchange_ghosts_make_indices(
1576 m_pdata->getN() + m_pdata->getNGhosts(),
1577 d_ghost_plan.data,
1578 d_tag.data,
1579 d_adj_mask.data,
1580 d_unique_neighbors.data,
1581 d_neigh_counts.data,
1582 d_ghost_idx_adj.data + m_idx_offs[stage],
1583 d_ghost_neigh.data + m_idx_offs[stage],
1584 d_ghost_begin.data + stage*m_n_unique_neigh,
1585 d_ghost_end.data + stage*m_n_unique_neigh,
1586 m_n_unique_neigh,
1587 m_n_send_ghosts_tot[stage],
1588 m_comm_mask[stage],
1589 m_mgpu_context);
1591 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1595 // access particle data
1596 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
1597 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
1598 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::read);
1599 ArrayHandle<Scalar> d_charge(m_pdata->getCharges(), access_location::device, access_mode::read);
1600 ArrayHandle<Scalar> d_diameter(m_pdata->getDiameters(), access_location::device, access_mode::read);
1601 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::read);
1602 ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::read);
1604 // access ghost send indices
1605 ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::read);
1607 // access output buffers
1608 ArrayHandle<unsigned int> d_tag_ghost_sendbuf(m_tag_ghost_sendbuf, access_location::device, access_mode::overwrite);
1609 ArrayHandle<Scalar4> d_pos_ghost_sendbuf(m_pos_ghost_sendbuf, access_location::device, access_mode::overwrite);
1610 ArrayHandle<Scalar4> d_vel_ghost_sendbuf(m_vel_ghost_sendbuf, access_location::device, access_mode::overwrite);
1611 ArrayHandle<Scalar> d_charge_ghost_sendbuf(m_charge_ghost_sendbuf, access_location::device, access_mode::overwrite);
1612 ArrayHandle<Scalar> d_diameter_ghost_sendbuf(m_diameter_ghost_sendbuf, access_location::device, access_mode::overwrite);
1613 ArrayHandle<Scalar4> d_orientation_ghost_sendbuf(m_orientation_ghost_sendbuf, access_location::device, access_mode::overwrite);
1615 const BoxDim& global_box = m_pdata->getGlobalBox();
1616 const Index3D& di = m_pdata->getDomainDecomposition()->getDomainIndexer();
1617 uint3 my_pos = m_pdata->getDomainDecomposition()->getGridPos();
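1618 // the grid position and global box are passed so that ghost coordinates can be wrapped across the global periodic boundary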
1619 // Pack ghosts into send buffers
1620 gpu_exchange_ghosts_pack(
1621 m_n_send_ghosts_tot[stage],
1622 d_ghost_idx_adj.data + m_idx_offs[stage],
1623 d_tag.data,
1624 d_pos.data,
1625 d_vel.data,
1626 d_charge.data,
1627 d_diameter.data,
1628 d_orientation.data,
1629 d_tag_ghost_sendbuf.data,
1630 d_pos_ghost_sendbuf.data,
1631 d_vel_ghost_sendbuf.data,
1632 d_charge_ghost_sendbuf.data,
1633 d_diameter_ghost_sendbuf.data,
1634 d_orientation_ghost_sendbuf.data,
1635 flags[comm_flag::tag],
1636 flags[comm_flag::position],
1637 flags[comm_flag::velocity],
1638 flags[comm_flag::charge],
1639 flags[comm_flag::diameter],
1640 flags[comm_flag::orientation],
1642 my_pos,
1643 global_box);
1645 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1649 * Ghost particle communication
1651 m_n_recv_ghosts_tot[stage] = 0;
1653 unsigned int send_bytes = 0;
1654 unsigned int recv_bytes = 0;
1657 ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
1658 ArrayHandle<unsigned int> h_ghost_end(m_ghost_end, access_location::host, access_mode::read);
1659 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1661 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
1663 // compute send counts
1664 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1665 m_n_send_ghosts[stage][ineigh] = h_ghost_end.data[ineigh+stage*m_n_unique_neigh]
1666 - h_ghost_begin.data[ineigh+stage*m_n_unique_neigh];
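1667 // handshake: exchange the per-neighbor ghost counts so every rank knows how many ghosts it will receive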
1668 MPI_Request req[2*m_n_unique_neigh];
1669 MPI_Status stat[2*m_n_unique_neigh];
1671 unsigned int nreq = 0;
1673 // loop over neighbors
1674 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1676 if (m_stages[ineigh] != (int) stage)
1678 // skip neighbor if not participating in this communication stage
1679 m_n_send_ghosts[stage][ineigh] = 0;
1680 m_n_recv_ghosts[stage][ineigh] = 0;
1681 continue;
1684 // rank of neighbor processor
1685 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1687 MPI_Isend(&m_n_send_ghosts[stage][ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1688 MPI_Irecv(&m_n_recv_ghosts[stage][ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1690 send_bytes += sizeof(unsigned int);
1691 recv_bytes += sizeof(unsigned int);
1694 MPI_Waitall(nreq, req, stat);
1696 // compute receive offsets and total up receive counts
1697 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1699 if (ineigh == 0)
1700 m_ghost_offs[stage][ineigh] = 0;
1701 else
1702 m_ghost_offs[stage][ineigh] = m_ghost_offs[stage][ineigh-1] + m_n_recv_ghosts[stage][ineigh-1];
1704 m_n_recv_ghosts_tot[stage] += m_n_recv_ghosts[stage][ineigh];
1707 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1710 n_max = 0;
1711 // compute maximum number of received ghosts
1712 for (unsigned int istage = 0; istage <= stage; ++istage)
1713 if (m_n_recv_ghosts_tot[istage] > n_max) n_max = m_n_recv_ghosts_tot[istage];
1715 if (flags[comm_flag::tag]) m_tag_ghost_recvbuf.resize(n_max);
1716 if (flags[comm_flag::position]) m_pos_ghost_recvbuf.resize(n_max);
1717 if (flags[comm_flag::velocity]) m_vel_ghost_recvbuf.resize(n_max);
1718 if (flags[comm_flag::charge]) m_charge_ghost_recvbuf.resize(n_max);
1719 if (flags[comm_flag::diameter]) m_diameter_ghost_recvbuf.resize(n_max);
1720 if (flags[comm_flag::orientation]) m_orientation_ghost_recvbuf.resize(n_max);
1722 // index at which the first ghost particle received in this stage will be placed
1723 unsigned int first_idx = m_pdata->getN()+m_pdata->getNGhosts();
1725 // update number of ghost particles
1726 m_pdata->addGhostParticles(m_n_recv_ghosts_tot[stage]);
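1727 // with CUDA-aware MPI (ENABLE_MPI_CUDA) receives are posted directly into the particle data arrays at offset first_idx;
1728 // otherwise host staging buffers are filled by MPI and copied into the particle data on the device afterwards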
1729 unsigned int offs = 0;
1730 #ifdef ENABLE_MPI_CUDA
1731 // recv buffers, write directly into particle data arrays
1732 offs = first_idx;
1733 ArrayHandle<unsigned int> tag_ghost_recvbuf_handle(m_pdata->getTags(), access_location::device, access_mode::readwrite);
1734 ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
1735 ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
1736 ArrayHandle<Scalar> charge_ghost_recvbuf_handle(m_pdata->getCharges(), access_location::device, access_mode::readwrite);
1737 ArrayHandle<Scalar> diameter_ghost_recvbuf_handle(m_pdata->getDiameters(), access_location::device, access_mode::readwrite);
1738 ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
1740 // send buffers
1741 ArrayHandle<unsigned int> tag_ghost_sendbuf_handle(m_tag_ghost_sendbuf, access_location::device, access_mode::read);
1742 ArrayHandle<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::device, access_mode::read);
1743 ArrayHandle<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::device, access_mode::read);
1744 ArrayHandle<Scalar> charge_ghost_sendbuf_handle(m_charge_ghost_sendbuf, access_location::device, access_mode::read);
1745 ArrayHandle<Scalar> diameter_ghost_sendbuf_handle(m_diameter_ghost_sendbuf, access_location::device, access_mode::read);
1746 ArrayHandle<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::device, access_mode::read);
1748 // MPI library may use non-zero stream
1749 cudaDeviceSynchronize();
1750 #else
1751 // recv buffers
1752 ArrayHandleAsync<unsigned int> tag_ghost_recvbuf_handle(m_tag_ghost_recvbuf, access_location::host, access_mode::overwrite);
1753 ArrayHandleAsync<Scalar4> pos_ghost_recvbuf_handle(m_pos_ghost_recvbuf, access_location::host, access_mode::overwrite);
1754 ArrayHandleAsync<Scalar4> vel_ghost_recvbuf_handle(m_vel_ghost_recvbuf, access_location::host, access_mode::overwrite);
1755 ArrayHandleAsync<Scalar> charge_ghost_recvbuf_handle(m_charge_ghost_recvbuf, access_location::host, access_mode::overwrite);
1756 ArrayHandleAsync<Scalar> diameter_ghost_recvbuf_handle(m_diameter_ghost_recvbuf, access_location::host, access_mode::overwrite);
1757 ArrayHandleAsync<Scalar4> orientation_ghost_recvbuf_handle(m_orientation_ghost_recvbuf, access_location::host, access_mode::overwrite);
1758 // send buffers
1759 ArrayHandleAsync<unsigned int> tag_ghost_sendbuf_handle(m_tag_ghost_sendbuf, access_location::host, access_mode::read);
1760 ArrayHandleAsync<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::host, access_mode::read);
1761 ArrayHandleAsync<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::host, access_mode::read);
1762 ArrayHandleAsync<Scalar> charge_ghost_sendbuf_handle(m_charge_ghost_sendbuf, access_location::host, access_mode::read);
1763 ArrayHandleAsync<Scalar> diameter_ghost_sendbuf_handle(m_diameter_ghost_sendbuf, access_location::host, access_mode::read);
1764 ArrayHandleAsync<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::host, access_mode::read);
1766 // lump together into one synchronization call
1767 cudaDeviceSynchronize();
1768 #endif
1770 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1771 ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
1772 ArrayHandle<unsigned int> h_ghost_end(m_ghost_end, access_location::host, access_mode::read);
1774 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
1776 std::vector<MPI_Request> reqs;
1777 MPI_Request req;
1779 // loop over neighbors
1780 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1782 // rank of neighbor processor
1783 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1785 if (flags[comm_flag::tag])
1787 // when sending/receiving 0 ptls, the send/recv buffer may be uninitialized
1788 if (m_n_send_ghosts[stage][ineigh])
1790 MPI_Isend(tag_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh+stage*m_n_unique_neigh],
1791 m_n_send_ghosts[stage][ineigh]*sizeof(unsigned int),
1792 MPI_BYTE,
1793 neighbor,
1795 m_mpi_comm,
1796 &req);
1797 reqs.push_back(req);
1799 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(unsigned int);
1800 if (m_n_recv_ghosts[stage][ineigh])
1802 MPI_Irecv(tag_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1803 m_n_recv_ghosts[stage][ineigh]*sizeof(unsigned int),
1804 MPI_BYTE,
1805 neighbor,
1807 m_mpi_comm,
1808 &req);
1809 reqs.push_back(req);
1811 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(unsigned int);
1814 if (flags[comm_flag::position])
1816 MPI_Request req;
1817 if (m_n_send_ghosts[stage][ineigh])
1819 MPI_Isend(pos_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1820 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
1821 MPI_BYTE,
1822 neighbor,
1824 m_mpi_comm,
1825 &req);
1826 reqs.push_back(req);
1828 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
1829 if (m_n_recv_ghosts[stage][ineigh])
1831 MPI_Irecv(pos_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1832 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
1833 MPI_BYTE,
1834 neighbor,
1836 m_mpi_comm,
1837 &req);
1838 reqs.push_back(req);
1840 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
1843 if (flags[comm_flag::velocity])
1845 if (m_n_send_ghosts[stage][ineigh])
1847 MPI_Isend(vel_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1848 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
1849 MPI_BYTE,
1850 neighbor,
1852 m_mpi_comm,
1853 &req);
1854 reqs.push_back(req);
1856 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
1857 if (m_n_recv_ghosts[stage][ineigh])
1859 MPI_Irecv(vel_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1860 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
1861 MPI_BYTE,
1862 neighbor,
1864 m_mpi_comm,
1865 &req);
1866 reqs.push_back(req);
1868 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
1871 if (flags[comm_flag::charge])
1873 if (m_n_send_ghosts[stage][ineigh])
1875 MPI_Isend(charge_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1876 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar),
1877 MPI_BYTE,
1878 neighbor,
1880 m_mpi_comm,
1881 &req);
1882 reqs.push_back(req);
1884 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar);
1885 if (m_n_recv_ghosts[stage][ineigh])
1887 MPI_Irecv(charge_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1888 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar),
1889 MPI_BYTE,
1890 neighbor,
1892 m_mpi_comm,
1893 &req);
1894 reqs.push_back(req);
1896 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar);
1899 if (flags[comm_flag::diameter])
1901 if (m_n_send_ghosts[stage][ineigh])
1903 MPI_Isend(diameter_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1904 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar),
1905 MPI_BYTE,
1906 neighbor,
1908 m_mpi_comm,
1909 &req);
1910 reqs.push_back(req);
1912 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar);
1913 if (m_n_recv_ghosts[stage][ineigh])
1915 MPI_Irecv(diameter_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1916 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar),
1917 MPI_BYTE,
1918 neighbor,
1920 m_mpi_comm,
1921 &req);
1922 reqs.push_back(req);
1924 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar);
1927 if (flags[comm_flag::orientation])
1929 if (m_n_send_ghosts[stage][ineigh])
1931 MPI_Isend(orientation_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1932 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
1933 MPI_BYTE,
1934 neighbor,
1936 m_mpi_comm,
1937 &req);
1938 reqs.push_back(req);
1940 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
1941 if (m_n_recv_ghosts[stage][ineigh])
1943 MPI_Irecv(orientation_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1944 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
1945 MPI_BYTE,
1946 neighbor,
1948 m_mpi_comm,
1949 &req);
1950 reqs.push_back(req);
1952 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
1954 } // end neighbor loop
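1955 // wait until all non-blocking sends and receives of this stage have completed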
1956 std::vector<MPI_Status> stats(reqs.size());
1957 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
1959 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1960 } // end ArrayHandle scope
1962 #ifdef ENABLE_MPI_CUDA
1963 // MPI library may use non-zero stream
1964 cudaDeviceSynchronize();
1965 #else
1966 // only unpack in non-CUDA MPI builds
1968 // access receive buffers
1969 ArrayHandle<unsigned int> d_tag_ghost_recvbuf(m_tag_ghost_recvbuf, access_location::device, access_mode::read);
1970 ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
1971 ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
1972 ArrayHandle<Scalar> d_charge_ghost_recvbuf(m_charge_ghost_recvbuf, access_location::device, access_mode::read);
1973 ArrayHandle<Scalar> d_diameter_ghost_recvbuf(m_diameter_ghost_recvbuf, access_location::device, access_mode::read);
1974 ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
1975 // access particle data
1976 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::readwrite);
1977 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
1978 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
1979 ArrayHandle<Scalar> d_charge(m_pdata->getCharges(), access_location::device, access_mode::readwrite);
1980 ArrayHandle<Scalar> d_diameter(m_pdata->getDiameters(), access_location::device, access_mode::readwrite);
1981 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
1983 // copy recv buf into particle data
1984 gpu_exchange_ghosts_copy_buf(
1985 m_n_recv_ghosts_tot[stage],
1986 d_tag_ghost_recvbuf.data,
1987 d_pos_ghost_recvbuf.data,
1988 d_vel_ghost_recvbuf.data,
1989 d_charge_ghost_recvbuf.data,
1990 d_diameter_ghost_recvbuf.data,
1991 d_orientation_ghost_recvbuf.data,
1992 d_tag.data + first_idx,
1993 d_pos.data + first_idx,
1994 d_vel.data + first_idx,
1995 d_charge.data + first_idx,
1996 d_diameter.data + first_idx,
1997 d_orientation.data + first_idx,
1998 flags[comm_flag::tag],
1999 flags[comm_flag::position],
2000 flags[comm_flag::velocity],
2001 flags[comm_flag::charge],
2002 flags[comm_flag::diameter],
2003 flags[comm_flag::orientation]);
2005 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2007 #endif
2009 if (flags[comm_flag::tag])
2011 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
2012 ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::readwrite);
2014 // update reverse-lookup table
2015 gpu_compute_ghost_rtags(first_idx,
2016 m_n_recv_ghosts_tot[stage],
2017 d_tag.data + first_idx,
2018 d_rtag.data);
2019 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2021 } // end main communication loop
2023 m_last_flags = flags;
2025 if (m_prof) m_prof->pop(m_exec_conf);
2027 // we have updated ghost particles, so notify subscribers about this
2028 m_pdata->notifyGhostParticleNumberChange();
2031 //! Begin ghost particle update
2032 void CommunicatorGPU::beginUpdateGhosts(unsigned int timestep)
2034 m_exec_conf->msg->notice(7) << "CommunicatorGPU: ghost update" << std::endl;
2036 if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_update");
2038 CommFlags flags = getFlags();
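2039 // only positions, velocities and orientations are updated here; tag, charge and diameter were already sent during the last exchange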
2040 // main communication loop
2041 for (unsigned int stage = 0; stage < m_num_stages; ++stage)
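2042 // the ghost send/recv tuners select between mapped and explicitly copied host buffers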
2043 #ifndef ENABLE_MPI_CUDA
2044 bool mapped_ghost_recv = m_tuner_ghost_recv->getParam();
2045 bool mapped_ghost_send = m_tuner_ghost_send->getParam();
2047 // switch between mapped and non-mapped host buffers if the tuner decision changed
2048 if (mapped_ghost_recv != m_mapped_ghost_recv)
2050 m_pos_ghost_recvbuf_alt.resize(m_pos_ghost_recvbuf.size());
2051 m_pos_ghost_recvbuf.swap(m_pos_ghost_recvbuf_alt);
2053 m_vel_ghost_recvbuf_alt.resize(m_vel_ghost_recvbuf.size());
2054 m_vel_ghost_recvbuf.swap(m_vel_ghost_recvbuf_alt);
2056 m_orientation_ghost_recvbuf_alt.resize(m_orientation_ghost_recvbuf.size());
2057 m_orientation_ghost_recvbuf.swap(m_orientation_ghost_recvbuf_alt);
2059 m_tag_ghost_recvbuf_alt.resize(m_tag_ghost_recvbuf.size());
2060 m_tag_ghost_recvbuf.swap(m_tag_ghost_recvbuf_alt);
2062 m_charge_ghost_recvbuf_alt.resize(m_charge_ghost_recvbuf.size());
2063 m_charge_ghost_recvbuf.swap(m_charge_ghost_recvbuf_alt);
2065 m_diameter_ghost_recvbuf_alt.resize(m_diameter_ghost_recvbuf.size());
2066 m_diameter_ghost_recvbuf.swap(m_diameter_ghost_recvbuf_alt);
2068 m_mapped_ghost_recv = mapped_ghost_recv;
2070 if (mapped_ghost_send != m_mapped_ghost_send)
2072 m_pos_ghost_sendbuf_alt.resize(m_pos_ghost_sendbuf.size());
2073 m_pos_ghost_sendbuf.swap(m_pos_ghost_sendbuf_alt);
2075 m_vel_ghost_sendbuf_alt.resize(m_vel_ghost_sendbuf.size());
2076 m_vel_ghost_sendbuf.swap(m_vel_ghost_sendbuf_alt);
2078 m_orientation_ghost_sendbuf_alt.resize(m_orientation_ghost_sendbuf.size());
2079 m_orientation_ghost_sendbuf.swap(m_orientation_ghost_sendbuf_alt);
2081 m_tag_ghost_sendbuf_alt.resize(m_tag_ghost_sendbuf.size());
2082 m_tag_ghost_sendbuf.swap(m_tag_ghost_sendbuf_alt);
2084 m_charge_ghost_sendbuf_alt.resize(m_charge_ghost_sendbuf.size());
2085 m_charge_ghost_sendbuf.swap(m_charge_ghost_sendbuf_alt);
2087 m_diameter_ghost_sendbuf_alt.resize(m_diameter_ghost_sendbuf.size());
2088 m_diameter_ghost_sendbuf.swap(m_diameter_ghost_sendbuf_alt);
2090 m_mapped_ghost_send = mapped_ghost_send;
2092 #endif
2094 if (m_prof) m_prof->push(m_exec_conf,"pack");
2096 // access particle data
2097 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
2098 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::read);
2099 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::read);
2101 // access ghost send indices
2102 ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::read);
2104 #ifndef ENABLE_MPI_CUDA
2105 // begin measurement of device-host transfer time
2106 m_tuner_ghost_send->begin();
2107 #endif
2109 // access output buffers
2110 ArrayHandle<unsigned int> d_tag_ghost_sendbuf(m_tag_ghost_sendbuf, access_location::device, access_mode::overwrite);
2111 ArrayHandle<Scalar4> d_pos_ghost_sendbuf(m_pos_ghost_sendbuf, access_location::device, access_mode::overwrite);
2112 ArrayHandle<Scalar4> d_vel_ghost_sendbuf(m_vel_ghost_sendbuf, access_location::device, access_mode::overwrite);
2113 ArrayHandle<Scalar4> d_orientation_ghost_sendbuf(m_orientation_ghost_sendbuf, access_location::device, access_mode::overwrite);
2115 const BoxDim& global_box = m_pdata->getGlobalBox();
2116 const Index3D& di = m_pdata->getDomainDecomposition()->getDomainIndexer();
2117 uint3 my_pos = m_pdata->getDomainDecomposition()->getGridPos();
2119 // Pack ghosts into send buffers
2120 gpu_exchange_ghosts_pack(
2121 m_n_send_ghosts_tot[stage],
2122 d_ghost_idx_adj.data + m_idx_offs[stage],
2123 NULL,
2124 d_pos.data,
2125 d_vel.data,
2126 NULL,
2127 NULL,
2128 d_orientation.data,
2129 NULL,
2130 d_pos_ghost_sendbuf.data,
2131 d_vel_ghost_sendbuf.data,
2132 NULL,
2133 NULL,
2134 d_orientation_ghost_sendbuf.data,
2135 false,
2136 flags[comm_flag::position],
2137 flags[comm_flag::velocity],
2138 false,
2139 false,
2140 flags[comm_flag::orientation],
2142 my_pos,
2143 global_box);
2145 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2147 if (m_prof) m_prof->pop(m_exec_conf);
2150 * Ghost particle communication
2153 // first ghost ptl index
2154 unsigned int first_idx = m_pdata->getN();
2156 // add ghosts received in previous stages to find this stage's first index
2157 for (unsigned int istage = 0; istage < stage; ++istage)
2158 first_idx += m_n_recv_ghosts_tot[istage];
2161 unsigned int offs = 0;
2162 // access particle data
2163 #ifdef ENABLE_MPI_CUDA
2164 offs = first_idx;
2165 // recv buffers, directly write into particle data arrays
2166 ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
2167 ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
2168 ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
2170 // send buffers
2171 ArrayHandle<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::device, access_mode::read);
2172 ArrayHandle<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::device, access_mode::read);
2173 ArrayHandle<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::device, access_mode::read);
2175 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
2176 ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
2178 // MPI library may use non-zero stream
2179 cudaDeviceSynchronize();
2180 #else
2181 // recv buffers
2182 ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pos_ghost_recvbuf, access_location::host, access_mode::overwrite);
2183 ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_vel_ghost_recvbuf, access_location::host, access_mode::overwrite);
2184 ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_orientation_ghost_recvbuf, access_location::host, access_mode::overwrite);
2186 // send buffers
2187 ArrayHandleAsync<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::host, access_mode::read);
2188 ArrayHandleAsync<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::host, access_mode::read);
2189 ArrayHandleAsync<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::host, access_mode::read);
2191 // end measurement of device-host transfer time
2192 m_tuner_ghost_send->end();
2194 ArrayHandleAsync<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
2195 ArrayHandleAsync<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
2197 // lump together into one synchronization call
2198 cudaEventRecord(m_event);
2199 cudaEventSynchronize(m_event);
2200 #endif
2202 // access send buffers
2203 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
2205 m_reqs.clear();
2206 MPI_Request req;
2208 unsigned int send_bytes = 0;
2209 unsigned int recv_bytes = 0;
2211 // loop over neighbors
2212 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
2214 // rank of neighbor processor
2215 unsigned int neighbor = h_unique_neighbors.data[ineigh];
2217 if (flags[comm_flag::position])
2219 if (m_n_send_ghosts[stage][ineigh])
2221 MPI_Isend(pos_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
2222 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
2223 MPI_BYTE,
2224 neighbor,
2226 m_mpi_comm,
2227 &req);
2228 m_reqs.push_back(req);
2230 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
2232 if (m_n_recv_ghosts[stage][ineigh])
2234 MPI_Irecv(pos_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
2235 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
2236 MPI_BYTE,
2237 neighbor,
2239 m_mpi_comm,
2240 &req);
2241 m_reqs.push_back(req);
2243 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
2246 if (flags[comm_flag::velocity])
2248 if (m_n_send_ghosts[stage][ineigh])
2250 MPI_Isend(vel_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
2251 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
2252 MPI_BYTE,
2253 neighbor,
2255 m_mpi_comm,
2256 &req);
2257 m_reqs.push_back(req);
2259 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
2261 if (m_n_recv_ghosts[stage][ineigh])
2263 MPI_Irecv(vel_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
2264 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
2265 MPI_BYTE,
2266 neighbor,
2268 m_mpi_comm,
2269 &req);
2270 m_reqs.push_back(req);
2272 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
2275 if (flags[comm_flag::orientation])
2277 if (m_n_send_ghosts[stage][ineigh])
2279 MPI_Isend(orientation_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
2280 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
2281 MPI_BYTE,
2282 neighbor,
2284 m_mpi_comm,
2285 &req);
2286 m_reqs.push_back(req);
2288 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
2290 if (m_n_recv_ghosts[stage][ineigh])
2292 MPI_Irecv(orientation_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
2293 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
2294 MPI_BYTE,
2295 neighbor,
2297 m_mpi_comm,
2298 &req);
2299 m_reqs.push_back(req);
2301 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
2303 } // end neighbor loop
2305 if (m_num_stages == 1)
2307 // use non-blocking MPI
2308 m_comm_pending = true;
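2309 // the requests are completed later in finishUpdateGhosts(), allowing communication to overlap with computation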
2310 else
2312 // complete communication
2313 std::vector<MPI_Status> stats(m_reqs.size());
2314 MPI_Waitall(m_reqs.size(), &m_reqs.front(), &stats.front());
2317 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
2318 } // end ArrayHandle scope
2321 if (!m_comm_pending)
2323 #ifndef ENABLE_MPI_CUDA
2324 // only unpack in non-CUDA MPI builds
2325 if (m_prof) m_prof->push(m_exec_conf,"unpack");
2327 // begin measurement of host-device transfer time for received ghosts
2328 m_tuner_ghost_recv->begin();
2330 // access receive buffers
2331 ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
2332 ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
2333 ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
2334 // access particle data
2335 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
2336 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
2337 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
2339 // copy recv buf into particle data
2340 gpu_exchange_ghosts_copy_buf(
2341 m_n_recv_ghosts_tot[stage],
2342 NULL,
2343 d_pos_ghost_recvbuf.data,
2344 d_vel_ghost_recvbuf.data,
2345 NULL,
2346 NULL,
2347 d_orientation_ghost_recvbuf.data,
2348 NULL,
2349 d_pos.data + first_idx,
2350 d_vel.data + first_idx,
2351 NULL,
2352 NULL,
2353 d_orientation.data + first_idx,
2354 false,
2355 flags[comm_flag::position],
2356 flags[comm_flag::velocity],
2357 false,
2358 false,
2359 flags[comm_flag::orientation]);
2361 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2363 // end measurement
2364 m_tuner_ghost_recv->end();
2366 if (m_prof) m_prof->pop(m_exec_conf);
2367 #endif
2369 } // end main communication loop
2371 if (m_prof) m_prof->pop(m_exec_conf);
2374 /*! Finish ghost update
2376 * \param timestep The time step
2378 void CommunicatorGPU::finishUpdateGhosts(unsigned int timestep)
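2379 // completes any non-blocking communication started in beginUpdateGhosts() and unpacks the received ghost data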
2380 if (m_comm_pending)
2382 m_comm_pending = false;
2384 if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_update");
2386 // complete communication
2387 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
2388 std::vector<MPI_Status> stats(m_reqs.size());
2389 MPI_Waitall(m_reqs.size(), &m_reqs.front(), &stats.front());
2390 if (m_prof) m_prof->pop(m_exec_conf);
2392 #ifdef ENABLE_MPI_CUDA
2393 // MPI library may use non-zero stream
2394 cudaDeviceSynchronize();
2395 #else
2396 // only unpack in non-CUDA-MPI builds
2397 assert(m_num_stages == 1);
2398 unsigned int stage = 0;
2399 unsigned int first_idx = m_pdata->getN();
2400 CommFlags flags = m_last_flags;
2401 if (m_prof) m_prof->push(m_exec_conf,"unpack");
2403 // begin measurement of host-device transfer time for received ghosts
2404 m_tuner_ghost_recv->begin();
2406 // access receive buffers
2407 ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
2408 ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
2409 ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
2410 // access particle data
2411 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
2412 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
2413 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
2415 // copy recv buf into particle data
2416 gpu_exchange_ghosts_copy_buf(
2417 m_n_recv_ghosts_tot[stage],
2418 NULL,
2419 d_pos_ghost_recvbuf.data,
2420 d_vel_ghost_recvbuf.data,
2421 NULL,
2422 NULL,
2423 d_orientation_ghost_recvbuf.data,
2424 NULL,
2425 d_pos.data + first_idx,
2426 d_vel.data + first_idx,
2427 NULL,
2428 NULL,
2429 d_orientation.data + first_idx,
2430 false,
2431 flags[comm_flag::position],
2432 flags[comm_flag::velocity],
2433 false,
2434 false,
2435 flags[comm_flag::orientation]);
2437 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2439 // end measurement
2440 m_tuner_ghost_recv->end();
2442 if (m_prof) m_prof->pop(m_exec_conf);
2443 #endif
2445 if (m_prof) m_prof->pop(m_exec_conf);
2449 //! Export CommunicatorGPU class to Python
2450 void export_CommunicatorGPU()
2452 boost::python::class_<CommunicatorGPU, boost::python::bases<Communicator>, boost::shared_ptr<CommunicatorGPU>, boost::noncopyable>("CommunicatorGPU",
2453 boost::python::init<boost::shared_ptr<SystemDefinition>,
2454 boost::shared_ptr<DomainDecomposition> >())
2455 .def("setMaxStages",&CommunicatorGPU::setMaxStages)
2459 #endif // ENABLE_CUDA
2460 #endif // ENABLE_MPI