1 /*
2 Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
3 (HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
4 the University of Michigan All rights reserved.
6 HOOMD-blue may contain modifications ("Contributions") provided, and to which
7 copyright is held, by various Contributors who have granted The Regents of the
8 University of Michigan the right to modify and/or distribute such Contributions.
10 You may redistribute, use, and create derivate works of HOOMD-blue, in source
11 and binary forms, provided you abide by the following conditions:
13 * Redistributions of source code must retain the above copyright notice, this
14 list of conditions, and the following disclaimer both in the code and
15 prominently in any materials provided with the distribution.
17 * Redistributions in binary form must reproduce the above copyright notice, this
18 list of conditions, and the following disclaimer in the documentation and/or
19 other materials provided with the distribution.
21 * All publications and presentations based on HOOMD-blue, including any reports
22 or published results obtained, in whole or in part, with HOOMD-blue, will
23 acknowledge its use according to the terms posted at the time of submission on:
24 http://codeblue.umich.edu/hoomd-blue/citations.html
26 * Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
27 http://codeblue.umich.edu/hoomd-blue/
29 * Apart from the above required attributions, neither the name of the copyright
30 holder nor the names of HOOMD-blue's contributors may be used to endorse or
31 promote products derived from this software without specific prior written
32 permission.
34 Disclaimer
36 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
37 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
38 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
39 WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.
41 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
42 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
43 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
44 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
45 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
46 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
47 ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 // Maintainer: jglaser
52 /*! \file CommunicatorGPU.cc
53 \brief Implements the CommunicatorGPU class
56 #ifdef ENABLE_MPI
57 #ifdef ENABLE_CUDA
59 #include "CommunicatorGPU.h"
60 #include "Profiler.h"
61 #include "System.h"
63 #include <boost/python.hpp>
64 #include <algorithm>
65 #include <functional>
67 //! Constructor
68 CommunicatorGPU::CommunicatorGPU(boost::shared_ptr<SystemDefinition> sysdef,
69 boost::shared_ptr<DomainDecomposition> decomposition)
70 : Communicator(sysdef, decomposition),
71 m_max_stages(1),
72 m_num_stages(0),
73 m_comm_mask(0),
74 m_bond_comm(*this, m_sysdef->getBondData()),
75 m_angle_comm(*this, m_sysdef->getAngleData()),
76 m_dihedral_comm(*this, m_sysdef->getDihedralData()),
77 m_improper_comm(*this, m_sysdef->getImproperData())
79 // default value
80 #ifndef ENABLE_MPI_CUDA
81 m_mapped_ghost_recv = true;
82 m_mapped_ghost_send = true;
83 #else
84 // do not use host buffers with CUDA-aware MPI
85 m_mapped_ghost_recv = false;
86 m_mapped_ghost_send = false;
87 #endif
89 // allocate memory
90 allocateBuffers();
 92 // initialize communication stages
93 initializeCommunicationStages();
95 // Initialize cache configuration
96 gpu_communicator_initialize_cache_config();
 98 // create a ModernGPU context
99 m_mgpu_context = mgpu::CreateCudaDeviceAttachStream(0);
101 GPUArray<unsigned int> begin(NEIGH_MAX,m_exec_conf,true);
102 m_begin.swap(begin);
104 GPUArray<unsigned int> end(NEIGH_MAX,m_exec_conf,true);
105 m_end.swap(end);
107 // create cuda event
108 // cudaEventCreate(&m_event, cudaEventBlockingSync | cudaEventDisableTiming);
109 cudaEventCreate(&m_event, cudaEventDisableTiming);
111 #ifndef ENABLE_MPI_CUDA
112 // set up autotuners to determine whether to use mapped memory (boolean values)
113 std::vector<unsigned int> valid_params(2);
114 valid_params[0] = 0; valid_params[1] = 1;
116 m_tuner_ghost_recv.reset(new Autotuner(valid_params, 5, 100000, "comm_ghost_recv_mapped", this->m_exec_conf));
117 m_tuner_ghost_send.reset(new Autotuner(valid_params, 5, 100000, "comm_ghost_send_mapped", this->m_exec_conf));
119 // synchronize autotuner results across ranks
120 m_tuner_ghost_recv->setSync(true);
121 m_tuner_ghost_send->setSync(true);
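// The tuners benchmark both parameter values above (presumably 0 = device
// staging buffers, 1 = mapped host memory, following the boolean convention
// noted in the comment) and settle on the faster one; setSync(true) makes all
// ranks adopt the same result so sender and receiver use consistent buffer types.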
122 #endif
125 //! Destructor
126 CommunicatorGPU::~CommunicatorGPU()
128 m_exec_conf->msg->notice(5) << "Destroying CommunicatorGPU";
129 cudaEventDestroy(m_event);
132 void CommunicatorGPU::allocateBuffers()
134 #ifndef ENABLE_MPI_CUDA
135 // mapped buffers for particle migration
136 bool mapped = true;
137 #else
138 bool mapped = false;
139 #endif
142 * Particle migration
144 GPUVector<pdata_element> gpu_sendbuf(m_exec_conf,mapped);
145 m_gpu_sendbuf.swap(gpu_sendbuf);
147 GPUVector<pdata_element> gpu_recvbuf(m_exec_conf,mapped);
148 m_gpu_recvbuf.swap(gpu_recvbuf);
 150 // Communication flags for every particle sent
151 GPUVector<unsigned int> comm_flags(m_exec_conf);
152 m_comm_flags.swap(comm_flags);
154 // Key for every particle sent
155 GPUVector<unsigned int> send_keys(m_exec_conf);
156 m_send_keys.swap(send_keys);
159 * Ghost communication
162 GPUVector<unsigned int> tag_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
163 m_tag_ghost_sendbuf.swap(tag_ghost_sendbuf);
165 GPUVector<unsigned int> tag_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
166 m_tag_ghost_recvbuf.swap(tag_ghost_recvbuf);
168 GPUVector<Scalar4> pos_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
169 m_pos_ghost_sendbuf.swap(pos_ghost_sendbuf);
171 GPUVector<Scalar4> pos_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
172 m_pos_ghost_recvbuf.swap(pos_ghost_recvbuf);
174 GPUVector<Scalar4> vel_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
175 m_vel_ghost_sendbuf.swap(vel_ghost_sendbuf);
177 GPUVector<Scalar4> vel_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
178 m_vel_ghost_recvbuf.swap(vel_ghost_recvbuf);
180 GPUVector<Scalar> charge_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
181 m_charge_ghost_sendbuf.swap(charge_ghost_sendbuf);
183 GPUVector<Scalar> charge_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
184 m_charge_ghost_recvbuf.swap(charge_ghost_recvbuf);
186 GPUVector<Scalar> diameter_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
187 m_diameter_ghost_sendbuf.swap(diameter_ghost_sendbuf);
189 GPUVector<Scalar> diameter_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
190 m_diameter_ghost_recvbuf.swap(diameter_ghost_recvbuf);
192 GPUVector<Scalar4> orientation_ghost_sendbuf(m_exec_conf,m_mapped_ghost_send);
193 m_orientation_ghost_sendbuf.swap(orientation_ghost_sendbuf);
195 GPUVector<Scalar4> orientation_ghost_recvbuf(m_exec_conf,m_mapped_ghost_recv);
196 m_orientation_ghost_recvbuf.swap(orientation_ghost_recvbuf);
198 #ifndef ENABLE_MPI_CUDA
 199 // initialize standby buffers (opposite mapped setting)
200 GPUVector<unsigned int> tag_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
201 m_tag_ghost_sendbuf_alt.swap(tag_ghost_sendbuf_alt);
203 GPUVector<unsigned int> tag_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
204 m_tag_ghost_recvbuf_alt.swap(tag_ghost_recvbuf_alt);
206 GPUVector<Scalar4> pos_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
207 m_pos_ghost_sendbuf_alt.swap(pos_ghost_sendbuf_alt);
209 GPUVector<Scalar4> pos_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
210 m_pos_ghost_recvbuf_alt.swap(pos_ghost_recvbuf_alt);
212 GPUVector<Scalar4> vel_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
213 m_vel_ghost_sendbuf_alt.swap(vel_ghost_sendbuf_alt);
215 GPUVector<Scalar4> vel_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
216 m_vel_ghost_recvbuf_alt.swap(vel_ghost_recvbuf_alt);
218 GPUVector<Scalar> charge_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
219 m_charge_ghost_sendbuf_alt.swap(charge_ghost_sendbuf_alt);
221 GPUVector<Scalar> charge_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
222 m_charge_ghost_recvbuf_alt.swap(charge_ghost_recvbuf_alt);
224 GPUVector<Scalar> diameter_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
225 m_diameter_ghost_sendbuf_alt.swap(diameter_ghost_sendbuf_alt);
227 GPUVector<Scalar> diameter_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
228 m_diameter_ghost_recvbuf_alt.swap(diameter_ghost_recvbuf_alt);
230 GPUVector<Scalar4> orientation_ghost_sendbuf_alt(m_exec_conf,!m_mapped_ghost_send);
231 m_orientation_ghost_sendbuf_alt.swap(orientation_ghost_sendbuf_alt);
233 GPUVector<Scalar4> orientation_ghost_recvbuf_alt(m_exec_conf,!m_mapped_ghost_recv);
234 m_orientation_ghost_recvbuf_alt.swap(orientation_ghost_recvbuf_alt);
235 #endif
237 GPUVector<unsigned int> ghost_begin(m_exec_conf,true);
238 m_ghost_begin.swap(ghost_begin);
240 GPUVector<unsigned int> ghost_end(m_exec_conf,true);
241 m_ghost_end.swap(ghost_end);
243 GPUVector<unsigned int> ghost_plan(m_exec_conf);
244 m_ghost_plan.swap(ghost_plan);
246 GPUVector<uint2> ghost_idx_adj(m_exec_conf);
247 m_ghost_idx_adj.swap(ghost_idx_adj);
249 GPUVector<unsigned int> ghost_neigh(m_exec_conf);
250 m_ghost_neigh.swap(ghost_neigh);
252 GPUVector<unsigned int> neigh_counts(m_exec_conf);
253 m_neigh_counts.swap(neigh_counts);
256 void CommunicatorGPU::initializeCommunicationStages()
258 // sanity check for user input
259 if (m_max_stages == 0)
261 m_exec_conf->msg->warning()
262 << "Maximum number of communication stages needs to be greater than zero. Assuming one."
263 << std::endl;
264 m_max_stages = 1;
267 if (m_max_stages > 3)
269 m_exec_conf->msg->warning()
270 << "Maximum number of communication stages too large. Assuming three."
271 << std::endl;
272 m_max_stages = 3;
 275 // access neighbors and adjacency array
276 ArrayHandle<unsigned int> h_adj_mask(m_adj_mask, access_location::host, access_mode::read);
278 Index3D di= m_decomposition->getDomainIndexer();
280 // number of stages in every communication step
281 m_num_stages = 0;
283 m_comm_mask.clear();
284 m_comm_mask.resize(m_max_stages,0);
286 const unsigned int mask_east = 1 << 2 | 1 << 5 | 1 << 8 | 1 << 11
287 | 1 << 14 | 1 << 17 | 1 << 20 | 1 << 23 | 1 << 26;
288 const unsigned int mask_west = mask_east >> 2;
289 const unsigned int mask_north = 1 << 6 | 1 << 7 | 1 << 8 | 1 << 15
290 | 1 << 16 | 1 << 17 | 1 << 24 | 1 << 25 | 1 << 26;
291 const unsigned int mask_south = mask_north >> 6;
292 const unsigned int mask_up = 1 << 18 | 1 << 19 | 1 << 20 | 1 << 21
293 | 1 << 22 | 1 << 23 | 1 << 24 | 1 << 25 | 1 << 26;
294 const unsigned int mask_down = mask_up >> 18;
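// The nine set bits of each mask above are consistent with an adjacency bit
// layout of bit = (dx+1) + 3*(dy+1) + 9*(dz+1) for neighbor offsets dx,dy,dz
// in {-1,0,+1}; e.g. mask_east collects every neighbor with dx = +1
// (bits 2,5,8,...,26). Sketch of how such a mask could be generated
// (illustration only, not used by the code below):
//
//     unsigned int mask_east_gen = 0;
//     for (int dz = -1; dz <= 1; ++dz)
//         for (int dy = -1; dy <= 1; ++dy)
//             mask_east_gen |= 1u << ((1+1) + 3*(dy+1) + 9*(dz+1)); // dx = +1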
296 // loop through neighbors to determine the communication stages
297 std::vector<unsigned int> neigh_flags(m_n_unique_neigh);
298 unsigned int max_stage = 0;
299 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ++ineigh)
301 int stage = 0;
302 int n = -1;
304 // determine stage
305 if (di.getW() > 1 && (n+1 < (int) m_max_stages)) n++;
306 if (h_adj_mask.data[ineigh] & (mask_east | mask_west)) stage = n;
307 if (di.getH() > 1 && (n+1 < (int) m_max_stages)) n++;
308 if (h_adj_mask.data[ineigh] & (mask_north | mask_south)) stage = n;
309 if (di.getD() > 1 && (n+1 < (int) m_max_stages)) n++;
310 if (h_adj_mask.data[ineigh] & (mask_up | mask_down)) stage = n;
312 assert(stage >= 0);
313 assert(n >= 0);
315 unsigned int mask = 0;
316 if (h_adj_mask.data[ineigh] & mask_east)
317 mask |= send_east;
318 if (h_adj_mask.data[ineigh] & mask_west)
319 mask |= send_west;
320 if (h_adj_mask.data[ineigh] & mask_north)
321 mask |= send_north;
322 if (h_adj_mask.data[ineigh] & mask_south)
323 mask |= send_south;
324 if (h_adj_mask.data[ineigh] & mask_up)
325 mask |= send_up;
326 if (h_adj_mask.data[ineigh] & mask_down)
327 mask |= send_down;
329 neigh_flags[ineigh] = mask;
331 // set communication flags for stage
332 m_comm_mask[stage] |= mask;
334 if (stage > (int)max_stage) max_stage = stage;
337 // number of communication stages
338 m_num_stages = max_stage + 1;
 340 // every direction occurs in one and only one stage
341 // number of communications per stage is constant or decreases with stage number
342 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
343 for (unsigned int jstage = istage+1; jstage < m_num_stages; ++jstage)
344 m_comm_mask[jstage] &= ~m_comm_mask[istage];
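// Worked example: with two stages, if stage 0 owns send_east|send_west and
// stage 1 accumulated send_east|send_west|send_north|send_south, the loop above
// clears the east/west bits from stage 1, leaving north/south only, so every
// direction is handled by exactly one stage.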
346 // access unique neighbors
347 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
349 // initialize stages array
350 m_stages.resize(m_n_unique_neigh,-1);
352 // assign stages to unique neighbors
353 for (unsigned int i= 0; i < m_n_unique_neigh; i++)
354 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
355 // compare adjacency masks of neighbors to the mask for this stage
356 if ((neigh_flags[i] & m_comm_mask[istage]) == neigh_flags[i])
358 m_stages[i] = istage;
359 break; // associate neighbor with stage of lowest index
 362 m_exec_conf->msg->notice(4) << "CommunicatorGPU: Using " << m_num_stages
363 << " communication stage(s)." << std::endl;
 366 //! Functor computing the destination rank for a particle selected for migration
367 struct get_migrate_key : public std::unary_function<const unsigned int, unsigned int >
369 const uint3 my_pos; //!< My domain decomposition position
370 const Index3D di; //!< Domain indexer
371 const unsigned int mask; //!< Mask of allowed directions
372 const unsigned int *h_cart_ranks; //!< Rank lookup table
374 //! Constructor
377 get_migrate_key(const uint3 _my_pos, const Index3D _di, const unsigned int _mask,
378 const unsigned int *_h_cart_ranks)
379 : my_pos(_my_pos), di(_di), mask(_mask), h_cart_ranks(_h_cart_ranks)
382 //! Generate key for a sent particle
383 unsigned int operator()(const unsigned int flags)
385 int ix, iy, iz;
386 ix = iy = iz = 0;
388 if ((flags & Communicator::send_east) && (mask & Communicator::send_east))
389 ix = 1;
390 else if ((flags & Communicator::send_west) && (mask & Communicator::send_west))
391 ix = -1;
393 if ((flags & Communicator::send_north) && (mask & Communicator::send_north))
394 iy = 1;
395 else if ((flags & Communicator::send_south) && (mask & Communicator::send_south))
396 iy = -1;
398 if ((flags & Communicator::send_up) && (mask & Communicator::send_up))
399 iz = 1;
400 else if ((flags & Communicator::send_down) && (mask & Communicator::send_down))
401 iz = -1;
403 // sanity check: particle has to be sent somewhere
404 assert(ix || iy || iz);
406 int i = my_pos.x;
407 int j = my_pos.y;
408 int k = my_pos.z;
410 i += ix;
411 if (i == (int)di.getW())
412 i = 0;
413 else if (i < 0)
414 i += di.getW();
416 j += iy;
417 if (j == (int)di.getH())
418 j = 0;
419 else if (j < 0)
420 j += di.getH();
422 k += iz;
423 if (k == (int)di.getD())
424 k = 0;
425 else if (k < 0)
426 k += di.getD();
428 return h_cart_ranks[di(i,j,k)];
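// Usage sketch (the actual call site is in migrateParticles() below): the
// functor maps a particle's communication flags to the MPI rank of the
// neighboring domain it should be sent to, wrapping periodically at the grid
// boundaries.
//
//     get_migrate_key key(my_pos, di, mask, h_cart_ranks.data);
//     unsigned int dest = key(Communicator::send_east); // rank of the +x neighbor
//
// If my_pos.x == di.getW()-1, the +x neighbor wraps around to grid column 0.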
433 //! Constructor
434 template<class group_data>
435 CommunicatorGPU::GroupCommunicatorGPU<group_data>::GroupCommunicatorGPU(CommunicatorGPU& gpu_comm, boost::shared_ptr<group_data> gdata)
436 : m_gpu_comm(gpu_comm), m_exec_conf(m_gpu_comm.m_exec_conf), m_gdata(gdata)
 438 // use mapped buffers to accelerate copying of data for host (non-CUDA-aware) MPI
439 #ifdef ENABLE_MPI_CUDA
440 bool mapped = false;
441 #else
442 bool mapped = true;
443 #endif
445 GPUVector<unsigned int> rank_mask(m_gpu_comm.m_exec_conf);
446 m_rank_mask.swap(rank_mask);
448 GPUVector<unsigned int> scratch(m_gpu_comm.m_exec_conf);
449 m_scan.swap(scratch);
451 GPUVector<rank_element_t> ranks_out(m_gpu_comm.m_exec_conf,mapped);
452 m_ranks_out.swap(ranks_out);
454 GPUVector<rank_element_t> ranks_sendbuf(m_gpu_comm.m_exec_conf,mapped);
455 m_ranks_sendbuf.swap(ranks_sendbuf);
457 GPUVector<rank_element_t> ranks_recvbuf(m_gpu_comm.m_exec_conf,mapped);
458 m_ranks_recvbuf.swap(ranks_recvbuf);
460 GPUVector<group_element_t> groups_out(m_gpu_comm.m_exec_conf,mapped);
461 m_groups_out.swap(groups_out);
463 GPUVector<unsigned int> rank_mask_out(m_gpu_comm.m_exec_conf,mapped);
464 m_rank_mask_out.swap(rank_mask_out);
466 GPUVector<group_element_t> groups_sendbuf(m_gpu_comm.m_exec_conf,mapped);
467 m_groups_sendbuf.swap(groups_sendbuf);
469 GPUVector<group_element_t> groups_recvbuf(m_gpu_comm.m_exec_conf,mapped);
470 m_groups_recvbuf.swap(groups_recvbuf);
472 GPUVector<group_element_t> groups_in(m_gpu_comm.m_exec_conf, mapped);
473 m_groups_in.swap(groups_in);
 475 // the size of the bit field must be greater than or equal to the group size
476 assert(sizeof(unsigned int)*8 >= group_data::size);
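// (Bit j of a group's rank mask refers to group member j, so a 32-bit mask
// supports groups of up to 32 members; bonds have 2, angles 3, dihedrals and
// impropers 4.)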
479 //! Migrate groups
480 template<class group_data>
481 void CommunicatorGPU::GroupCommunicatorGPU<group_data>::migrateGroups(bool incomplete)
483 if (m_gdata->getNGlobal())
485 m_exec_conf->msg->notice(7) << "GroupCommunicator<" << m_gdata->getName() << ">: migrate" << std::endl;
487 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, m_gdata->getName());
489 // resize bitmasks
490 m_rank_mask.resize(m_gdata->getN());
 492 // resize temporary array
493 m_scan.resize(m_gdata->getN());
495 unsigned int n_out_ranks;
497 ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::read);
498 ArrayHandle<typename group_data::members_t> d_members(m_gdata->getMembersArray(), access_location::device, access_mode::read);
499 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
500 ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::overwrite);
501 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
502 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::overwrite);
504 boost::shared_ptr<DomainDecomposition> decomposition = m_gpu_comm.m_pdata->getDomainDecomposition();
505 ArrayHandle<unsigned int> d_cart_ranks(decomposition->getCartRanks(), access_location::device, access_mode::read);
507 Index3D di = decomposition->getDomainIndexer();
508 uint3 my_pos = decomposition->getGridPos();
510 // mark groups that have members leaving this domain
511 gpu_mark_groups<group_data::size>(
512 m_gpu_comm.m_pdata->getN(),
513 d_comm_flags.data,
514 m_gdata->getN(),
515 d_members.data,
516 d_group_ranks.data,
517 d_rank_mask.data,
518 d_rtag.data,
519 d_scan.data,
520 n_out_ranks,
522 my_pos,
523 d_cart_ranks.data,
524 incomplete,
525 m_gpu_comm.m_mgpu_context);
527 if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
530 // resize output array
531 m_ranks_out.resize(n_out_ranks);
533 unsigned int n_out_groups;
535 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
536 ArrayHandle<typename group_data::members_t> d_members(m_gdata->getMembersArray(), access_location::device, access_mode::read);
537 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
538 ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::readwrite);
540 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
541 ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::readwrite);
543 ArrayHandle<rank_element_t> d_ranks_out(m_ranks_out, access_location::device, access_mode::overwrite);
545 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);
547 // scatter groups into output arrays according to scan result (d_scan), determine send groups and scan
548 gpu_scatter_ranks_and_mark_send_groups<group_data::size>(
549 m_gdata->getN(),
550 d_group_tag.data,
551 d_group_ranks.data,
552 d_rank_mask.data,
553 d_members.data,
554 d_rtag.data,
555 d_comm_flags.data,
556 d_scan.data,
557 n_out_groups,
558 d_ranks_out.data,
559 m_gpu_comm.m_mgpu_context);
561 if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
564 #if defined(ENABLE_MPI_CUDA) && 0
565 #else
566 // fill host send buffers on host
567 unsigned int my_rank = m_gpu_comm.m_exec_conf->getRank();
569 typedef std::multimap<unsigned int, rank_element_t> map_t;
570 map_t send_map;
573 // access output buffers
574 ArrayHandle<rank_element_t> h_ranks_out(m_ranks_out, access_location::host, access_mode::read);
575 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location:: host, access_mode::read);
577 for (unsigned int i = 0; i < n_out_ranks; ++i)
579 rank_element_t el = h_ranks_out.data[i];
580 typename group_data::ranks_t r = el.ranks;
581 unsigned int mask = el.mask;
583 if (incomplete)
584 // in initialization, send to all neighbors
585 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ++ineigh)
586 send_map.insert(std::make_pair(h_unique_neighbors.data[ineigh], el));
587 else
588 // send to other ranks owning the bonded group
589 for (unsigned int j = 0; j < group_data::size; ++j)
591 unsigned int rank = r.idx[j];
592 bool updated = mask & (1 << j);
593 // send out to ranks different from ours
594 if (rank != my_rank && !updated)
595 send_map.insert(std::make_pair(rank, el));
600 // resize send buffers
601 m_ranks_sendbuf.resize(send_map.size());
604 // access send buffers
605 ArrayHandle<rank_element_t> h_ranks_sendbuf(m_ranks_sendbuf, access_location::host, access_mode::overwrite);
607 // output send data sorted by rank
608 unsigned int n = 0;
609 for (typename map_t::iterator it = send_map.begin(); it != send_map.end(); ++it)
611 h_ranks_sendbuf.data[n] = it->second;
612 n++;
615 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
616 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::overwrite);
617 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::overwrite);
619 // Find start and end indices
620 for (unsigned int i = 0; i < m_gpu_comm.m_n_unique_neigh; ++i)
622 typename map_t::iterator lower = send_map.lower_bound(h_unique_neighbors.data[i]);
623 typename map_t::iterator upper = send_map.upper_bound(h_unique_neighbors.data[i]);
624 h_begin.data[i] = std::distance(send_map.begin(),lower);
625 h_end.data[i] = std::distance(send_map.begin(),upper);
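// Worked example: std::multimap iterates its entries in key (rank) order and the
// send buffer was filled in that same order above, so lower_bound/upper_bound
// yield contiguous [begin,end) index ranges per neighbor. With unique neighbors
// {3,7} and send_map = {3:a, 3:b, 7:c}, this gives h_begin = {0,2} and h_end = {2,3}.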
628 #endif
631 * communicate rank information (phase 1)
633 unsigned int n_send_groups[m_gpu_comm.m_n_unique_neigh];
634 unsigned int n_recv_groups[m_gpu_comm.m_n_unique_neigh];
635 unsigned int offs[m_gpu_comm.m_n_unique_neigh];
636 unsigned int n_recv_tot = 0;
639 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
640 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
641 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
643 unsigned int send_bytes = 0;
644 unsigned int recv_bytes = 0;
645 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");
647 // compute send counts
648 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
649 n_send_groups[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];
651 MPI_Request req[2*m_gpu_comm.m_n_unique_neigh];
652 MPI_Status stat[2*m_gpu_comm.m_n_unique_neigh];
654 unsigned int nreq = 0;
656 // loop over neighbors
657 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
659 // rank of neighbor processor
660 unsigned int neighbor = h_unique_neighbors.data[ineigh];
662 MPI_Isend(&n_send_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
663 MPI_Irecv(&n_recv_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
664 send_bytes += sizeof(unsigned int);
665 recv_bytes += sizeof(unsigned int);
666 } // end neighbor loop
668 MPI_Waitall(nreq, req, stat);
670 // sum up receive counts
671 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
673 if (ineigh == 0)
674 offs[ineigh] = 0;
675 else
676 offs[ineigh] = offs[ineigh-1] + n_recv_groups[ineigh-1];
678 n_recv_tot += n_recv_groups[ineigh];
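// (offs[] is the exclusive prefix sum of the receive counts and locates each
// neighbor's slot in the contiguous receive buffer: n_recv_groups = {4,0,2}
// yields offs = {0,4,4} and n_recv_tot = 6.)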
681 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
684 // Resize receive buffer
685 m_ranks_recvbuf.resize(n_recv_tot);
688 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
689 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
690 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
692 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf,"MPI send/recv");
694 #ifdef ENABLE_MPI_CUDA
695 ArrayHandle<rank_element_t> ranks_sendbuf_handle(m_ranks_sendbuf, access_location::device, access_mode::read);
696 ArrayHandle<rank_element_t> ranks_recvbuf_handle(m_ranks_recvbuf, access_location::device, access_mode::overwrite);
 698 // the MPI library may use a non-zero stream
699 cudaDeviceSynchronize();
700 #else
701 ArrayHandle<rank_element_t> ranks_sendbuf_handle(m_ranks_sendbuf, access_location::host, access_mode::read);
702 ArrayHandle<rank_element_t> ranks_recvbuf_handle(m_ranks_recvbuf, access_location::host, access_mode::overwrite);
703 #endif
705 std::vector<MPI_Request> reqs;
706 MPI_Request req;
708 unsigned int send_bytes = 0;
709 unsigned int recv_bytes = 0;
711 // loop over neighbors
712 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
714 // rank of neighbor processor
715 unsigned int neighbor = h_unique_neighbors.data[ineigh];
717 // exchange particle data
718 if (n_send_groups[ineigh])
720 MPI_Isend(ranks_sendbuf_handle.data+h_begin.data[ineigh],
721 n_send_groups[ineigh]*sizeof(rank_element_t),
722 MPI_BYTE,
723 neighbor,
725 m_gpu_comm.m_mpi_comm,
726 &req);
727 reqs.push_back(req);
729 send_bytes+= n_send_groups[ineigh]*sizeof(rank_element_t);
731 if (n_recv_groups[ineigh])
733 MPI_Irecv(ranks_recvbuf_handle.data+offs[ineigh],
734 n_recv_groups[ineigh]*sizeof(rank_element_t),
735 MPI_BYTE,
736 neighbor,
738 m_gpu_comm.m_mpi_comm,
739 &req);
740 reqs.push_back(req);
742 recv_bytes += n_recv_groups[ineigh]*sizeof(rank_element_t);
745 std::vector<MPI_Status> stats(reqs.size());
746 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
748 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
752 // access receive buffers
753 ArrayHandle<rank_element_t> d_ranks_recvbuf(m_ranks_recvbuf, access_location::device, access_mode::read);
754 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
755 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::read);
757 // update local rank information
758 gpu_update_ranks_table<group_data::size>(
759 m_gdata->getN(),
760 d_group_ranks.data,
761 d_group_rtag.data,
762 n_recv_tot,
763 d_ranks_recvbuf.data);
764 if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
767 // resize output buffer
768 m_groups_out.resize(n_out_groups);
769 m_rank_mask_out.resize(n_out_groups);
772 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
773 ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::read);
774 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
775 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);
776 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
777 ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::readwrite);
778 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);
779 ArrayHandle<group_element_t> d_groups_out(m_groups_out, access_location::device, access_mode::overwrite);
780 ArrayHandle<unsigned int> d_rank_mask_out(m_rank_mask_out, access_location::device, access_mode::overwrite);
781 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
782 ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::read);
784 // scatter groups to be sent into output buffer, mark groups that have no local members for removal
785 gpu_scatter_and_mark_groups_for_removal<group_data::size>(
786 m_gdata->getN(),
787 d_groups.data,
788 d_group_type.data,
789 d_group_tag.data,
790 d_group_rtag.data,
791 d_group_ranks.data,
792 d_rank_mask.data,
793 d_rtag.data,
794 d_comm_flags.data,
795 m_exec_conf->getRank(),
796 d_scan.data,
797 d_groups_out.data,
798 d_rank_mask_out.data);
799 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
802 unsigned int new_ngroups;
804 // access primary arrays to read from
805 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
806 ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::read);
807 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
808 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
810 // access alternate arrays to write to
811 ArrayHandle<typename group_data::members_t> d_groups_alt(m_gdata->getAltMembersArray(), access_location::device, access_mode::overwrite);
812 ArrayHandle<unsigned int> d_group_type_alt(m_gdata->getAltTypesArray(), access_location::device, access_mode::overwrite);
813 ArrayHandle<unsigned int> d_group_tag_alt(m_gdata->getAltTags(), access_location::device, access_mode::overwrite);
814 ArrayHandle<typename group_data::ranks_t> d_group_ranks_alt(m_gdata->getAltRanksArray(), access_location::device, access_mode::overwrite);
816 ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);
818 // access rtags
819 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);
821 unsigned int ngroups = m_gdata->getN();
823 // remove groups from local table
824 gpu_remove_groups(ngroups,
825 d_groups.data,
826 d_groups_alt.data,
827 d_group_type.data,
828 d_group_type_alt.data,
829 d_group_tag.data,
830 d_group_tag_alt.data,
831 d_group_ranks.data,
832 d_group_ranks_alt.data,
833 d_group_rtag.data,
834 new_ngroups,
835 d_scan.data,
836 m_gpu_comm.m_mgpu_context);
837 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
840 // resize alternate arrays to number of groups
841 GPUVector<typename group_data::members_t>& alt_groups_array = m_gdata->getAltMembersArray();
842 GPUVector<unsigned int>& alt_group_type_array = m_gdata->getAltTypesArray();
843 GPUVector<unsigned int>& alt_group_tag_array = m_gdata->getAltTags();
844 GPUVector<typename group_data::ranks_t>& alt_group_ranks_array = m_gdata->getAltRanksArray();
846 assert(new_ngroups <= m_gdata->getN());
847 alt_groups_array.resize(new_ngroups);
848 alt_group_type_array.resize(new_ngroups);
849 alt_group_tag_array.resize(new_ngroups);
850 alt_group_ranks_array.resize(new_ngroups);
852 // make alternate arrays current
853 m_gdata->swapMemberArrays();
854 m_gdata->swapTypeArrays();
855 m_gdata->swapTagArrays();
856 m_gdata->swapRankArrays();
858 assert(m_gdata->getN() == new_ngroups);
860 #if defined(ENABLE_MPI_CUDA) && 0
861 #else
862 // fill host send buffers on host
863 typedef std::multimap<unsigned int, group_element_t> group_map_t;
864 group_map_t group_send_map;
867 // access output buffers
868 ArrayHandle<group_element_t> h_groups_out(m_groups_out, access_location::host, access_mode::read);
869 ArrayHandle<unsigned int> h_rank_mask_out(m_rank_mask_out, access_location::host, access_mode::read);
871 for (unsigned int i = 0; i < n_out_groups; ++i)
873 group_element_t el = h_groups_out.data[i];
874 typename group_data::ranks_t ranks = el.ranks;
876 for (unsigned int j = 0; j < group_data::size; ++j)
878 unsigned int rank = ranks.idx[j];
879 // are we sending to this rank?
880 if (h_rank_mask_out.data[i] & (1 << j))
881 group_send_map.insert(std::make_pair(rank, el));
886 // resize send buffers
887 m_groups_sendbuf.resize(group_send_map.size());
890 // access send buffers
891 ArrayHandle<group_element_t> h_groups_sendbuf(m_groups_sendbuf, access_location::host, access_mode::overwrite);
893 // output send data sorted by rank
894 unsigned int n = 0;
895 for (typename group_map_t::iterator it = group_send_map.begin(); it != group_send_map.end(); ++it)
897 h_groups_sendbuf.data[n] = it->second;
898 n++;
901 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
902 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::overwrite);
903 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::overwrite);
905 // Find start and end indices
906 for (unsigned int i = 0; i < m_gpu_comm.m_n_unique_neigh; ++i)
908 typename group_map_t::iterator lower = group_send_map.lower_bound(h_unique_neighbors.data[i]);
909 typename group_map_t::iterator upper = group_send_map.upper_bound(h_unique_neighbors.data[i]);
910 h_begin.data[i] = std::distance(group_send_map.begin(),lower);
911 h_end.data[i] = std::distance(group_send_map.begin(),upper);
914 #endif
917 * communicate groups (phase 2)
920 n_recv_tot = 0;
922 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
923 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
924 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
926 unsigned int send_bytes = 0;
927 unsigned int recv_bytes = 0;
928 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");
930 // compute send counts
931 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
932 n_send_groups[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];
934 MPI_Request req[2*m_gpu_comm.m_n_unique_neigh];
935 MPI_Status stat[2*m_gpu_comm.m_n_unique_neigh];
937 unsigned int nreq = 0;
939 // loop over neighbors
940 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
942 // rank of neighbor processor
943 unsigned int neighbor = h_unique_neighbors.data[ineigh];
945 MPI_Isend(&n_send_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
946 MPI_Irecv(&n_recv_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, & req[nreq++]);
947 send_bytes += sizeof(unsigned int);
948 recv_bytes += sizeof(unsigned int);
949 } // end neighbor loop
951 MPI_Waitall(nreq, req, stat);
953 // sum up receive counts
954 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
956 if (ineigh == 0)
957 offs[ineigh] = 0;
958 else
959 offs[ineigh] = offs[ineigh-1] + n_recv_groups[ineigh-1];
961 n_recv_tot += n_recv_groups[ineigh];
964 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
967 // Resize receive buffer
968 m_groups_recvbuf.resize(n_recv_tot);
971 ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
972 ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
973 ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
975 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf,"MPI send/recv");
977 #if defined(ENABLE_MPI_CUDA) && 0
978 #else
979 ArrayHandle<group_element_t> groups_sendbuf_handle(m_groups_sendbuf, access_location::host, access_mode::read);
980 ArrayHandle<group_element_t> groups_recvbuf_handle(m_groups_recvbuf, access_location::host, access_mode::overwrite);
981 #endif
983 std::vector<MPI_Request> reqs;
984 MPI_Request req;
986 unsigned int send_bytes = 0;
987 unsigned int recv_bytes = 0;
989 // loop over neighbors
990 for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
992 // rank of neighbor processor
993 unsigned int neighbor = h_unique_neighbors.data[ineigh];
995 // exchange particle data
996 if (n_send_groups[ineigh])
998 MPI_Isend(groups_sendbuf_handle.data+h_begin.data[ineigh],
999 n_send_groups[ineigh]*sizeof(group_element_t),
1000 MPI_BYTE,
1001 neighbor,
1003 m_gpu_comm.m_mpi_comm,
1004 &req);
1005 reqs.push_back(req);
1007 send_bytes+= n_send_groups[ineigh]*sizeof(group_element_t);
1009 if (n_recv_groups[ineigh])
1011 MPI_Irecv(groups_recvbuf_handle.data+offs[ineigh],
1012 n_recv_groups[ineigh]*sizeof(group_element_t),
1013 MPI_BYTE,
1014 neighbor,
1016 m_gpu_comm.m_mpi_comm,
1017 &req);
1018 reqs.push_back(req);
1020 recv_bytes += n_recv_groups[ineigh]*sizeof(group_element_t);
1023 std::vector<MPI_Status> stats(reqs.size());
1024 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
1026 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1029 unsigned int n_recv_unique = 0;
1030 #if defined(ENABLE_MPI_CUDA) && 0
1031 #else
1033 ArrayHandle<group_element_t> h_groups_recvbuf(m_groups_recvbuf, access_location::host, access_mode::read);
 1035 // use a std::map (unique keys) to filter out duplicate groups in the input buffer
1036 typedef std::map<unsigned int, group_element_t> recv_map_t;
1037 recv_map_t recv_map;
1039 for (unsigned int recv_idx = 0; recv_idx < n_recv_tot; recv_idx++)
1041 group_element_t el = h_groups_recvbuf.data[recv_idx];
1042 unsigned int tag= el.group_tag;
1043 recv_map.insert(std::make_pair(tag, el));
1046 // resize input array of unique groups
1047 m_groups_in.resize(recv_map.size());
1049 // write out unique groups
1050 ArrayHandle<group_element_t> h_groups_in(m_groups_in, access_location::host, access_mode::overwrite);
1051 for (typename recv_map_t::iterator it = recv_map.begin(); it != recv_map.end(); ++it)
1052 h_groups_in.data[n_recv_unique++] = it->second;
1053 assert(n_recv_unique == recv_map.size());
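// Worked example of the deduplication above: if two neighboring ranks both sent
// the group with tag 42 (a group spanning several domains can be forwarded by
// more than one of them), the second copy is dropped because std::map::insert
// ignores an insert with an already existing key.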
1055 #endif
1057 unsigned int old_ngroups = m_gdata->getN();
1058 new_ngroups = old_ngroups + n_recv_unique;
 1060 // resize group arrays to accommodate additional groups (there can still be duplicates with local groups)
1061 GPUVector<typename group_data::members_t>& groups_array = m_gdata->getMembersArray();
1062 GPUVector<unsigned int>& group_type_array = m_gdata->getTypesArray();
1063 GPUVector<unsigned int>& group_tag_array = m_gdata->getTags();
1064 GPUVector<typename group_data::ranks_t>& group_ranks_array = m_gdata->getRanksArray();
1066 groups_array.resize(new_ngroups);
1067 group_type_array.resize(new_ngroups);
1068 group_tag_array.resize(new_ngroups);
1069 group_ranks_array.resize(new_ngroups);
1072 ArrayHandle<group_element_t> d_groups_in(m_groups_in, access_location::device, access_mode::read);
1073 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::readwrite);
1074 ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::readwrite);
1075 ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::readwrite);
1076 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
1077 ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);
1079 // get temp buffer
1080 ScopedAllocation<unsigned int> d_tmp(m_exec_conf->getCachedAllocator(), n_recv_unique);
1082 // add new groups, updating groups that are already present locally
1083 gpu_add_groups(old_ngroups,
1084 n_recv_unique,
1085 d_groups_in.data,
1086 d_groups.data,
1087 d_group_type.data,
1088 d_group_tag.data,
1089 d_group_ranks.data,
1090 d_group_rtag.data,
1091 new_ngroups,
1092 d_tmp.data,
1093 m_gpu_comm.m_mgpu_context);
1094 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1097 // resize arrays to final size
1098 groups_array.resize(new_ngroups);
1099 group_type_array.resize(new_ngroups);
1100 group_tag_array.resize(new_ngroups);
1101 group_ranks_array.resize(new_ngroups);
1103 // indicate that group table has changed
1104 m_gdata->setDirty();
1106 if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf);
1111 //! Mark ghost particles
1112 template<class group_data>
1113 void CommunicatorGPU::GroupCommunicatorGPU<group_data>::markGhostParticles(
1114 const GPUArray<unsigned int>& plans,
1115 unsigned int mask)
1117 if (m_gdata->getNGlobal())
1119 m_exec_conf->msg->notice(7) << "GroupCommunicator<" << m_gdata->getName() << ">: find incomplete groups" << std::endl;
1121 ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
1122 ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
1123 ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
1124 ArrayHandle<Scalar4> d_pos(m_gpu_comm.m_pdata->getPositions(), access_location::device, access_mode::read);
1125 ArrayHandle<unsigned int> d_plan(plans, access_location::device, access_mode::readwrite);
1127 boost::shared_ptr<DomainDecomposition> decomposition = m_gpu_comm.m_pdata->getDomainDecomposition();
1128 ArrayHandle<unsigned int> d_cart_ranks_inv(decomposition->getInverseCartRanks(), access_location::device, access_mode::read);
1129 Index3D di = decomposition->getDomainIndexer();
1130 uint3 my_pos = decomposition->getGridPos();
1132 gpu_mark_bonded_ghosts<group_data::size>(
1133 m_gdata->getN(),
1134 d_groups.data,
1135 d_group_ranks.data,
1136 d_pos.data,
1137 m_gpu_comm.m_pdata->getBox(),
1138 d_rtag.data,
1139 d_plan.data,
1141 my_pos,
1142 d_cart_ranks_inv.data,
1143 m_exec_conf->getRank(),
1144 mask);
1145 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1149 //! Transfer particles between neighboring domains
1150 void CommunicatorGPU::migrateParticles()
1152 m_exec_conf->msg->notice(7) << "CommunicatorGPU: migrate particles" << std::endl;
1154 // check if box is sufficiently large for communication
1155 Scalar3 L= m_pdata->getBox().getNearestPlaneDistance();
1156 const Index3D& di = m_decomposition->getDomainIndexer();
1157 if ((m_r_ghost >= L.x/Scalar(2.0) && di.getW() > 1) ||
1158 (m_r_ghost >= L.y/Scalar(2.0) && di.getH() > 1) ||
1159 (m_r_ghost >= L.z/Scalar(2.0) && di.getD() > 1))
1161 m_exec_conf->msg->error() << "Simulation box too small for domain decomposition." << std::endl;
1162 throw std::runtime_error("Error during communication");
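// (The check above requires the box to span more than two ghost widths along
// every decomposed direction; e.g. with m_r_ghost = 2.5 the decomposed extent
// must exceed 5.0, otherwise the ghost layers of opposite faces would overlap
// and the nearest-neighbor communication pattern would break down.)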
1165 if (m_prof)
1166 m_prof->push(m_exec_conf,"comm_migrate");
1168 if (m_last_flags[comm_flag::tag])
1170 // Reset reverse lookup tags of old ghost atoms
1171 ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::readwrite);
1172 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
1174 gpu_reset_rtags(m_pdata->getNGhosts(),
1175 d_tag.data + m_pdata->getN(),
1176 d_rtag.data);
1178 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1179 CHECK_CUDA_ERROR();
1182 // reset ghost particle number
1183 m_pdata->removeAllGhostParticles();
1185 // main communication loop
1186 for (unsigned int stage = 0; stage < m_num_stages; stage++)
1189 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
1190 ArrayHandle<unsigned int> d_comm_flag(m_pdata->getCommFlags(), access_location::device, access_mode::readwrite);
1192 assert(stage < m_comm_mask.size());
1194 // mark all particles which have left the box for sending (rtag=NOT_LOCAL)
1195 gpu_stage_particles(m_pdata->getN(),
1196 d_pos.data,
1197 d_comm_flag.data,
1198 m_pdata->getBox(),
1199 m_comm_mask[stage]);
1201 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1202 CHECK_CUDA_ERROR();
1207 * Bonded group communication, determine groups to be sent
1209 // Bonds
1210 m_bond_comm.migrateGroups(m_bonds_changed);
1211 m_bonds_changed = false;
1213 // Angles
1214 m_angle_comm.migrateGroups(m_angles_changed);
1215 m_angles_changed = false;
1217 // Dihedrals
1218 m_dihedral_comm.migrateGroups(m_dihedrals_changed);
1219 m_dihedrals_changed = false;
 1221 // Impropers
1222 m_improper_comm.migrateGroups(m_impropers_changed);
1223 m_impropers_changed = false;
1225 // fill send buffer
1226 m_pdata->removeParticlesGPU(m_gpu_sendbuf, m_comm_flags);
1228 const Index3D& di = m_decomposition->getDomainIndexer();
1229 // determine local particles that are to be sent to neighboring processors and fill send buffer
1230 uint3 mypos = m_decomposition->getGridPos();
1232 /* We need some better heuristics to decide whether to take the GPU or CPU code path */
1233 #if defined(ENABLE_MPI_CUDA) && 0
1235 // resize keys
1236 m_send_keys.resize(m_gpu_sendbuf.size());
1238 ArrayHandle<pdata_element> d_gpu_sendbuf(m_gpu_sendbuf, access_location::device, access_mode::readwrite);
1239 ArrayHandle<unsigned int> d_send_keys(m_send_keys, access_location::device, access_mode::overwrite);
1240 ArrayHandle<unsigned int> d_begin(m_begin, access_location::device, access_mode::overwrite);
1241 ArrayHandle<unsigned int> d_end(m_end, access_location::device, access_mode::overwrite);
1242 ArrayHandle<unsigned int> d_unique_neighbors(m_unique_neighbors, access_location::device, access_mode::read);
1243 ArrayHandle<unsigned int> d_comm_flags(m_comm_flags, access_location::device, access_mode::read);
1245 ArrayHandle<unsigned int> d_cart_ranks(m_decomposition->getCartRanks(), access_location::device, access_mode::read);
1247 // get temporary buffers
1248 unsigned int nsend = m_gpu_sendbuf.size();
1249 const CachedAllocator& alloc = m_exec_conf->getCachedAllocator();
1250 ScopedAllocation<pdata_element> d_in_copy(alloc, nsend);
1251 ScopedAllocation<unsigned int> d_tmp(alloc, nsend);
1253 gpu_sort_migrating_particles(m_gpu_sendbuf.size(),
1254 d_gpu_sendbuf.data,
1255 d_comm_flags.data,
1257 mypos,
1258 d_cart_ranks.data,
1259 d_send_keys.data,
1260 d_begin.data,
1261 d_end.data,
1262 d_unique_neighbors.data,
1263 m_n_unique_neigh,
1264 m_comm_mask[stage],
1265 m_mgpu_context,
1266 d_tmp.data,
1267 d_in_copy.data);
1269 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1270 CHECK_CUDA_ERROR();
1272 #else
1274 ArrayHandle<pdata_element> h_gpu_sendbuf(m_gpu_sendbuf, access_location::host, access_mode::readwrite);
1275 ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::overwrite);
1276 ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::overwrite);
1277 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1278 ArrayHandle<unsigned int> h_comm_flags(m_comm_flags, access_location::host, access_mode::read);
1280 ArrayHandle<unsigned int> h_cart_ranks(m_pdata->getDomainDecomposition()->getCartRanks(), access_location::host,
1281 access_mode::read);
1282 typedef std::multimap<unsigned int,pdata_element> key_t;
1283 key_t keys;
1285 // generate keys
1286 get_migrate_key t(mypos, di, m_comm_mask[stage],h_cart_ranks.data);
1287 for (unsigned int i = 0; i < m_comm_flags.size(); ++i)
1288 keys.insert(std::pair<unsigned int, pdata_element>(t(h_comm_flags.data[i]),h_gpu_sendbuf.data[i]));
1290 // Find start and end indices
1291 for (unsigned int i = 0; i < m_n_unique_neigh; ++i)
1293 key_t::iterator lower = keys.lower_bound(h_unique_neighbors.data[i]);
1294 key_t::iterator upper = keys.upper_bound(h_unique_neighbors.data[i]);
1295 h_begin.data[i] = std::distance(keys.begin(),lower);
1296 h_end.data[i] = std::distance(keys.begin(),upper);
1299 // sort send buffer
1300 unsigned int i = 0;
1301 for (key_t::iterator it = keys.begin(); it != keys.end(); ++it)
1302 h_gpu_sendbuf.data[i++] = it->second;
1304 #endif
1306 unsigned int n_send_ptls[m_n_unique_neigh];
1307 unsigned int n_recv_ptls[m_n_unique_neigh];
1308 unsigned int offs[m_n_unique_neigh];
1309 unsigned int n_recv_tot = 0;
1312 ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::read);
1313 ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::read);
1314 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1316 unsigned int send_bytes = 0;
1317 unsigned int recv_bytes = 0;
1318 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
1320 // compute send counts
1321 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1322 n_send_ptls[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];
1324 MPI_Request req[2*m_n_unique_neigh];
1325 MPI_Status stat[2*m_n_unique_neigh];
1327 unsigned int nreq = 0;
1329 // loop over neighbors
1330 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1332 if (m_stages[ineigh] != (int) stage)
1334 // skip neighbor if not participating in this communication stage
1335 n_send_ptls[ineigh] = 0;
1336 n_recv_ptls[ineigh] = 0;
1337 continue;
1340 // rank of neighbor processor
1341 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1343 MPI_Isend(&n_send_ptls[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1344 MPI_Irecv(&n_recv_ptls[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1345 send_bytes += sizeof(unsigned int);
1346 recv_bytes += sizeof(unsigned int);
1347 } // end neighbor loop
1349 MPI_Waitall(nreq, req, stat);
1351 // sum up receive counts
1352 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1354 if (ineigh == 0)
1355 offs[ineigh] = 0;
1356 else
1357 offs[ineigh] = offs[ineigh-1] + n_recv_ptls[ineigh-1];
1359 n_recv_tot += n_recv_ptls[ineigh];
1362 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1365 // Resize receive buffer
1366 m_gpu_recvbuf.resize(n_recv_tot);
1369 ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::read);
1370 ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::read);
1371 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1373 if (m_prof) m_prof->push(m_exec_conf,"MPI send/recv");
1375 #if defined(ENABLE_MPI_CUDA) && 0
1376 ArrayHandle<pdata_element> gpu_sendbuf_handle(m_gpu_sendbuf, access_location::device, access_mode::read);
1377 ArrayHandle<pdata_element> gpu_recvbuf_handle(m_gpu_recvbuf, access_location::device, access_mode::overwrite);
1378 #else
1379 ArrayHandle<pdata_element> gpu_sendbuf_handle(m_gpu_sendbuf, access_location::host, access_mode::read);
1380 ArrayHandle<pdata_element> gpu_recvbuf_handle(m_gpu_recvbuf, access_location::host, access_mode::overwrite);
1381 #endif
1383 std::vector<MPI_Request> reqs;
1384 MPI_Request req;
1386 unsigned int send_bytes = 0;
1387 unsigned int recv_bytes = 0;
1389 // loop over neighbors
1390 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1392 // rank of neighbor processor
1393 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1395 // exchange particle data
1396 if (n_send_ptls[ineigh])
1398 MPI_Isend(gpu_sendbuf_handle.data+h_begin.data[ineigh],
1399 n_send_ptls[ineigh]*sizeof(pdata_element),
1400 MPI_BYTE,
1401 neighbor,
1403 m_mpi_comm,
1404 &req);
1405 reqs.push_back(req);
1407 send_bytes+= n_send_ptls[ineigh]*sizeof(pdata_element);
1409 if (n_recv_ptls[ineigh])
1411 MPI_Irecv(gpu_recvbuf_handle.data+offs[ineigh],
1412 n_recv_ptls[ineigh]*sizeof(pdata_element),
1413 MPI_BYTE,
1414 neighbor,
1416 m_mpi_comm,
1417 &req);
1418 reqs.push_back(req);
1420 recv_bytes += n_recv_ptls[ineigh]*sizeof(pdata_element);
1423 std::vector<MPI_Status> stats(reqs.size());
1424 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
1426 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1430 ArrayHandle<pdata_element> d_gpu_recvbuf(m_gpu_recvbuf, access_location::device, access_mode::readwrite);
1431 const BoxDim shifted_box = getShiftedBox();
1433 // Apply boundary conditions
1434 gpu_wrap_particles(n_recv_tot,
1435 d_gpu_recvbuf.data,
1436 shifted_box);
1437 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1438 CHECK_CUDA_ERROR();
1441 // remove particles that were sent and fill particle data with received particles
1442 m_pdata->addParticlesGPU(m_gpu_recvbuf);
1444 } // end communication stage
1446 if (m_prof) m_prof->pop(m_exec_conf);
1449 //! Build a ghost particle list, exchange ghost particle data with neighboring processors
1450 void CommunicatorGPU::exchangeGhosts()
1452 if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_exch");
1454 m_exec_conf->msg->notice(7) << "CommunicatorGPU: ghost exchange" << std::endl;
 1456 // the ghost layer must be at least m_r_ghost wide along every lattice direction
1457 Scalar3 ghost_fraction = m_r_ghost/m_pdata->getBox().getNearestPlaneDistance();
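// (ghost_fraction expresses the ghost width as a fraction of the box in each
// direction: e.g. m_r_ghost = 2.5 with a nearest plane distance of 25 gives 0.1,
// i.e. particles in the outermost 10% of the box along that direction become
// candidates for the ghost exchange plan computed below.)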
1459 // resize arrays
1460 m_n_send_ghosts.resize(m_num_stages);
1461 m_n_recv_ghosts.resize(m_num_stages);
1463 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
1465 m_n_send_ghosts[istage].resize(m_n_unique_neigh);
1466 m_n_recv_ghosts[istage].resize(m_n_unique_neigh);
1469 m_n_send_ghosts_tot.resize(m_num_stages);
1470 m_n_recv_ghosts_tot.resize(m_num_stages);
1471 m_ghost_offs.resize(m_num_stages);
1472 for (unsigned int istage = 0; istage < m_num_stages; ++istage)
1473 m_ghost_offs[istage].resize(m_n_unique_neigh);
1475 m_ghost_begin.resize(m_n_unique_neigh*m_num_stages);
1476 m_ghost_end.resize(m_n_unique_neigh*m_num_stages);
1478 m_idx_offs.resize(m_num_stages);
1480 // get requested ghost fields
1481 CommFlags flags = getFlags();
1483 // main communication loop
1484 for (unsigned int stage = 0; stage < m_num_stages; stage++)
1486 // make room for plans
1487 m_ghost_plan.resize(m_pdata->getN()+m_pdata->getNGhosts());
1490 // compute plans for all particles, including already received ghosts
1491 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
1492 ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::overwrite);
1494 gpu_make_ghost_exchange_plan(d_ghost_plan.data,
1495 m_pdata->getN()+m_pdata->getNGhosts(),
1496 d_pos.data,
1497 m_pdata->getBox(),
1498 ghost_fraction,
1499 m_comm_mask[stage]);
1501 if (m_exec_conf->isCUDAErrorCheckingEnabled())
1502 CHECK_CUDA_ERROR();
 1505 // mark particles that are members of incomplete bonded groups as ghosts
1507 // bonds
1508 m_bond_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1510 // angles
1511 m_angle_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1513 // dihedrals
1514 m_dihedral_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1516 // impropers
1517 m_improper_comm.markGhostParticles(m_ghost_plan,m_comm_mask[stage]);
1519 // resize temporary number of neighbors array
1520 m_neigh_counts.resize(m_pdata->getN()+m_pdata->getNGhosts());
1523 ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::read);
1524 ArrayHandle<unsigned int> d_adj_mask(m_adj_mask, access_location::device, access_mode::read);
1525 ArrayHandle<unsigned int> d_neigh_counts(m_neigh_counts, access_location::device, access_mode::overwrite);
 1527 // count the number of neighbor ranks (total and per particle) that the ghost particles are sent to
1528 m_n_send_ghosts_tot[stage] =
1529 gpu_exchange_ghosts_count_neighbors(
1530 m_pdata->getN()+m_pdata->getNGhosts(),
1531 d_ghost_plan.data,
1532 d_adj_mask.data,
1533 d_neigh_counts.data,
1534 m_n_unique_neigh,
1535 m_mgpu_context);
1537 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1540 // compute offset into ghost idx list
1541 m_idx_offs[stage] = 0;
1542 for (unsigned int i = 0; i < stage; ++i)
1543 m_idx_offs[stage] += m_n_send_ghosts_tot[i];
1545 // compute maximum send buf size
1546 unsigned int n_max = 0;
1547 for (unsigned int istage = 0; istage <= stage; ++istage)
1548 if (m_n_send_ghosts_tot[istage] > n_max) n_max = m_n_send_ghosts_tot[istage];
1550 // make room for ghost indices and neighbor ranks
1551 m_ghost_idx_adj.resize(m_idx_offs[stage] + m_n_send_ghosts_tot[stage]);
1552 m_ghost_neigh.resize(m_idx_offs[stage] + m_n_send_ghosts_tot[stage]);
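1553 // one entry per ghost copy sent in this stage, appended after the entries of the previous stages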
1554 if (flags[comm_flag::tag]) m_tag_ghost_sendbuf.resize(n_max);
1555 if (flags[comm_flag::position]) m_pos_ghost_sendbuf.resize(n_max);
1556 if (flags[comm_flag::velocity]) m_vel_ghost_sendbuf.resize(n_max);
1557 if (flags[comm_flag::charge]) m_charge_ghost_sendbuf.resize(n_max);
1558 if (flags[comm_flag::diameter]) m_diameter_ghost_sendbuf.resize(n_max);
1559 if (flags[comm_flag::orientation]) m_orientation_ghost_sendbuf.resize(n_max);
1562 ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::read);
1563 ArrayHandle<unsigned int> d_adj_mask(m_adj_mask, access_location::device, access_mode::read);
1564 ArrayHandle<unsigned int> d_neigh_counts(m_neigh_counts, access_location::device, access_mode::read);
1565 ArrayHandle<unsigned int> d_unique_neighbors(m_unique_neighbors, access_location::device, access_mode::read);
1567 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
1569 ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::overwrite);
1570 ArrayHandle<unsigned int> d_ghost_neigh(m_ghost_neigh, access_location::device, access_mode::overwrite);
1571 ArrayHandle<unsigned int> d_ghost_begin(m_ghost_begin, access_location::device, access_mode::overwrite);
1572 ArrayHandle<unsigned int> d_ghost_end(m_ghost_end, access_location::device, access_mode::overwrite);
1574 // fill ghost send list and compute start and end indices per unique neighbor in list
1575 gpu_exchange_ghosts_make_indices(
1576 m_pdata->getN() + m_pdata->getNGhosts(),
1577 d_ghost_plan.data,
1578 d_tag.data,
1579 d_adj_mask.data,
1580 d_unique_neighbors.data,
1581 d_neigh_counts.data,
1582 d_ghost_idx_adj.data + m_idx_offs[stage],
1583 d_ghost_neigh.data + m_idx_offs[stage],
1584 d_ghost_begin.data + stage*m_n_unique_neigh,
1585 d_ghost_end.data + stage*m_n_unique_neigh,
1586 m_n_unique_neigh,
1587 m_n_send_ghosts_tot[stage],
1588 m_comm_mask[stage],
1589 m_mgpu_context);
1591 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1595 // access particle data
1596 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
1597 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
1598 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::read);
1599 ArrayHandle<Scalar> d_charge(m_pdata->getCharges(), access_location::device, access_mode::read);
1600 ArrayHandle<Scalar> d_diameter(m_pdata->getDiameters(), access_location::device, access_mode::read);
1601 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::read);
1602 ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::read);
1604 // access ghost send indices
1605 ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::read);
1607 // access output buffers
1608 ArrayHandle<unsigned int> d_tag_ghost_sendbuf(m_tag_ghost_sendbuf, access_location::device, access_mode::overwrite);
1609 ArrayHandle<Scalar4> d_pos_ghost_sendbuf(m_pos_ghost_sendbuf, access_location::device, access_mode::overwrite);
1610 ArrayHandle<Scalar4> d_vel_ghost_sendbuf(m_vel_ghost_sendbuf, access_location::device, access_mode::overwrite);
1611 ArrayHandle<Scalar> d_charge_ghost_sendbuf(m_charge_ghost_sendbuf, access_location::device, access_mode::overwrite);
1612 ArrayHandle<Scalar> d_diameter_ghost_sendbuf(m_diameter_ghost_sendbuf, access_location::device, access_mode::overwrite);
1613 ArrayHandle<Scalar4> d_orientation_ghost_sendbuf(m_orientation_ghost_sendbuf, access_location::device, access_mode::overwrite);
1615 const BoxDim& global_box = m_pdata->getGlobalBox();
1616 const Index3D& di = m_pdata->getDomainDecomposition()->getDomainIndexer();
1617 uint3 my_pos = m_pdata->getDomainDecomposition()->getGridPos();
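1618 // the grid position and global box are passed so that ghost coordinates can be wrapped across the global periodic boundary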
1619 // Pack ghosts into send buffers
1620 gpu_exchange_ghosts_pack(
1621 m_n_send_ghosts_tot[stage],
1622 d_ghost_idx_adj.data + m_idx_offs[stage],
1623 d_tag.data,
1624 d_pos.data,
1625 d_vel.data,
1626 d_charge.data,
1627 d_diameter.data,
1628 d_orientation.data,
1629 d_tag_ghost_sendbuf.data,
1630 d_pos_ghost_sendbuf.data,
1631 d_vel_ghost_sendbuf.data,
1632 d_charge_ghost_sendbuf.data,
1633 d_diameter_ghost_sendbuf.data,
1634 d_orientation_ghost_sendbuf.data,
1635 flags[comm_flag::tag],
1636 flags[comm_flag::position],
1637 flags[comm_flag::velocity],
1638 flags[comm_flag::charge],
1639 flags[comm_flag::diameter],
1640 flags[comm_flag::orientation],
1642 my_pos,
1643 global_box);
1645 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
1649 * Ghost particle communication
1651 m_n_recv_ghosts_tot[stage] = 0;
1653 unsigned int send_bytes = 0;
1654 unsigned int recv_bytes = 0;
1657 ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
1658 ArrayHandle<unsigned int> h_ghost_end(m_ghost_end, access_location::host, access_mode::read);
1659 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1661 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
1663 // compute send counts
1664 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1665 m_n_send_ghosts[stage][ineigh] = h_ghost_end.data[ineigh+stage*m_n_unique_neigh]
1666 - h_ghost_begin.data[ineigh+stage*m_n_unique_neigh];
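1667 // handshake: exchange the per-neighbor ghost counts so every rank knows how many ghosts it will receive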
1668 MPI_Request req[2*m_n_unique_neigh];
1669 MPI_Status stat[2*m_n_unique_neigh];
1671 unsigned int nreq = 0;
1673 // loop over neighbors
1674 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1676 if (m_stages[ineigh] != (int) stage)
1678 // skip neighbor if not participating in this communication stage
1679 m_n_send_ghosts[stage][ineigh] = 0;
1680 m_n_recv_ghosts[stage][ineigh] = 0;
1681 continue;
1684 // rank of neighbor processor
1685 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1687 MPI_Isend(&m_n_send_ghosts[stage][ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1688 MPI_Irecv(&m_n_recv_ghosts[stage][ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, & req[nreq++]);
1690 send_bytes += sizeof(unsigned int);
1691 recv_bytes += sizeof(unsigned int);
1694 MPI_Waitall(nreq, req, stat);
1696 // compute receive offsets and total up receive counts
1697 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1699 if (ineigh == 0)
1700 m_ghost_offs[stage][ineigh] = 0;
1701 else
1702 m_ghost_offs[stage][ineigh] = m_ghost_offs[stage][ineigh-1] + m_n_recv_ghosts[stage][ineigh-1];
1704 m_n_recv_ghosts_tot[stage] += m_n_recv_ghosts[stage][ineigh];
1707 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1710 n_max = 0;
1711 // compute maximum number of received ghosts
1712 for (unsigned int istage = 0; istage <= stage; ++istage)
1713 if (m_n_recv_ghosts_tot[istage] > n_max) n_max = m_n_recv_ghosts_tot[istage];
1715 if (flags[comm_flag::tag]) m_tag_ghost_recvbuf.resize(n_max);
1716 if (flags[comm_flag::position]) m_pos_ghost_recvbuf.resize(n_max);
1717 if (flags[comm_flag::velocity]) m_vel_ghost_recvbuf.resize(n_max);
1718 if (flags[comm_flag::charge]) m_charge_ghost_recvbuf.resize(n_max);
1719 if (flags[comm_flag::diameter]) m_diameter_ghost_recvbuf.resize(n_max);
1720 if (flags[comm_flag::orientation]) m_orientation_ghost_recvbuf.resize(n_max);
1722 // index at which the first ghost particle received in this stage will be placed
1723 unsigned int first_idx = m_pdata->getN()+m_pdata->getNGhosts();
1725 // update number of ghost particles
1726 m_pdata->addGhostParticles(m_n_recv_ghosts_tot[stage]);
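1727 // with CUDA-aware MPI (ENABLE_MPI_CUDA) receives are posted directly into the particle data arrays at offset first_idx;
1728 // otherwise host staging buffers are filled by MPI and copied into the particle data on the device afterwards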
1729 unsigned int offs = 0;
1730 #ifdef ENABLE_MPI_CUDA
1731 // recv buffers, write directly into particle data arrays
1732 offs = first_idx;
1733 ArrayHandle<unsigned int> tag_ghost_recvbuf_handle(m_pdata->getTags(), access_location::device, access_mode::readwrite);
1734 ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
1735 ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
1736 ArrayHandle<Scalar> charge_ghost_recvbuf_handle(m_pdata->getCharges(), access_location::device, access_mode::readwrite);
1737 ArrayHandle<Scalar> diameter_ghost_recvbuf_handle(m_pdata->getDiameters(), access_location::device, access_mode::readwrite);
1738 ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
1740 // send buffers
1741 ArrayHandle<unsigned int> tag_ghost_sendbuf_handle(m_tag_ghost_sendbuf, access_location::device, access_mode::read);
1742 ArrayHandle<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::device, access_mode::read);
1743 ArrayHandle<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::device, access_mode::read);
1744 ArrayHandle<Scalar> charge_ghost_sendbuf_handle(m_charge_ghost_sendbuf, access_location::device, access_mode::read);
1745 ArrayHandle<Scalar> diameter_ghost_sendbuf_handle(m_diameter_ghost_sendbuf, access_location::device, access_mode::read);
1746 ArrayHandle<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::device, access_mode::read);
1748 // MPI library may use non-zero stream
1749 cudaDeviceSynchronize();
1750 #else
1751 // recv buffers
1752 ArrayHandleAsync<unsigned int> tag_ghost_recvbuf_handle(m_tag_ghost_recvbuf, access_location::host, access_mode::overwrite);
1753 ArrayHandleAsync<Scalar4> pos_ghost_recvbuf_handle(m_pos_ghost_recvbuf, access_location::host, access_mode::overwrite);
1754 ArrayHandleAsync<Scalar4> vel_ghost_recvbuf_handle(m_vel_ghost_recvbuf, access_location::host, access_mode::overwrite);
1755 ArrayHandleAsync<Scalar> charge_ghost_recvbuf_handle(m_charge_ghost_recvbuf, access_location::host, access_mode::overwrite);
1756 ArrayHandleAsync<Scalar> diameter_ghost_recvbuf_handle(m_diameter_ghost_recvbuf, access_location::host, access_mode::overwrite);
1757 ArrayHandleAsync<Scalar4> orientation_ghost_recvbuf_handle(m_orientation_ghost_recvbuf, access_location::host, access_mode::overwrite);
1758 // send buffers
1759 ArrayHandleAsync<unsigned int> tag_ghost_sendbuf_handle(m_tag_ghost_sendbuf, access_location::host, access_mode::read);
1760 ArrayHandleAsync<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::host, access_mode::read);
1761 ArrayHandleAsync<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::host, access_mode::read);
1762 ArrayHandleAsync<Scalar> charge_ghost_sendbuf_handle(m_charge_ghost_sendbuf, access_location::host, access_mode::read);
1763 ArrayHandleAsync<Scalar> diameter_ghost_sendbuf_handle(m_diameter_ghost_sendbuf, access_location::host, access_mode::read);
1764 ArrayHandleAsync<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::host, access_mode::read);
1766 // lump together into one synchronization call
1767 cudaDeviceSynchronize();
1768 #endif
1770 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
1771 ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
1772 ArrayHandle<unsigned int> h_ghost_end(m_ghost_end, access_location::host, access_mode::read);
1774 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
1776 std::vector<MPI_Request> reqs;
1777 MPI_Request req;
1779 // loop over neighbors
1780 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
1782 // rank of neighbor processor
1783 unsigned int neighbor = h_unique_neighbors.data[ineigh];
1785 if (flags[comm_flag::tag])
1787 // when sending/receiving 0 ptls, the send/recv buffer may be uninitialized
1788 if (m_n_send_ghosts[stage][ineigh])
1790 MPI_Isend(tag_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh+stage*m_n_unique_neigh],
1791 m_n_send_ghosts[stage][ineigh]*sizeof(unsigned int),
1792 MPI_BYTE,
1793 neighbor,
1795 m_mpi_comm,
1796 &req);
1797 reqs.push_back(req);
1799 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(unsigned int);
1800 if (m_n_recv_ghosts[stage][ineigh])
1802 MPI_Irecv(tag_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1803 m_n_recv_ghosts[stage][ineigh]*sizeof(unsigned int),
1804 MPI_BYTE,
1805 neighbor,
1807 m_mpi_comm,
1808 &req);
1809 reqs.push_back(req);
1811 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(unsigned int);
1814 if (flags[comm_flag::position])
1816 MPI_Request req;
1817 if (m_n_send_ghosts[stage][ineigh])
1819 MPI_Isend(pos_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1820 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
1821 MPI_BYTE,
1822 neighbor,
1824 m_mpi_comm,
1825 &req);
1826 reqs.push_back(req);
1828 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
1829 if (m_n_recv_ghosts[stage][ineigh])
1831 MPI_Irecv(pos_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1832 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
1833 MPI_BYTE,
1834 neighbor,
1836 m_mpi_comm,
1837 &req);
1838 reqs.push_back(req);
1840 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
1843 if (flags[comm_flag::velocity])
1845 if (m_n_send_ghosts[stage][ineigh])
1847 MPI_Isend(vel_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1848 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
1849 MPI_BYTE,
1850 neighbor,
1852 m_mpi_comm,
1853 &req);
1854 reqs.push_back(req);
1856 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
1857 if (m_n_recv_ghosts[stage][ineigh])
1859 MPI_Irecv(vel_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1860 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
1861 MPI_BYTE,
1862 neighbor,
1864 m_mpi_comm,
1865 &req);
1866 reqs.push_back(req);
1868 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
1871 if (flags[comm_flag::charge])
1873 if (m_n_send_ghosts[stage][ineigh])
1875 MPI_Isend(charge_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1876 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar),
1877 MPI_BYTE,
1878 neighbor,
1880 m_mpi_comm,
1881 &req);
1882 reqs.push_back(req);
1884 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar);
1885 if (m_n_recv_ghosts[stage][ineigh])
1887 MPI_Irecv(charge_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1888 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar),
1889 MPI_BYTE,
1890 neighbor,
1892 m_mpi_comm,
1893 &req);
1894 reqs.push_back(req);
1896 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar);
1899 if (flags[comm_flag::diameter])
1901 if (m_n_send_ghosts[stage][ineigh])
1903 MPI_Isend(diameter_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1904 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar),
1905 MPI_BYTE,
1906 neighbor,
1908 m_mpi_comm,
1909 &req);
1910 reqs.push_back(req);
1912 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar);
1913 if (m_n_recv_ghosts[stage][ineigh])
1915 MPI_Irecv(diameter_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1916 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar),
1917 MPI_BYTE,
1918 neighbor,
1920 m_mpi_comm,
1921 &req);
1922 reqs.push_back(req);
1924 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar);
1927 if (flags[comm_flag::orientation])
1929 if (m_n_send_ghosts[stage][ineigh])
1931 MPI_Isend(orientation_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
1932 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
1933 MPI_BYTE,
1934 neighbor,
1936 m_mpi_comm,
1937 &req);
1938 reqs.push_back(req);
1940 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
1941 if (m_n_recv_ghosts[stage][ineigh])
1943 MPI_Irecv(orientation_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
1944 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
1945 MPI_BYTE,
1946 neighbor,
1948 m_mpi_comm,
1949 &req);
1950 reqs.push_back(req);
1952 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
1954 } // end neighbor loop
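1955 // wait until all non-blocking sends and receives of this stage have completed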
1956 std::vector<MPI_Status> stats(reqs.size());
1957 MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());
1959 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
1960 } // end ArrayHandle scope
1962 #ifdef ENABLE_MPI_CUDA
1963 // MPI library may use non-zero stream
1964 cudaDeviceSynchronize();
1965 #else
1966 // only unpack in non-CUDA MPI builds
1968 // access receive buffers
1969 ArrayHandle<unsigned int> d_tag_ghost_recvbuf(m_tag_ghost_recvbuf, access_location::device, access_mode::read);
1970 ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
1971 ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
1972 ArrayHandle<Scalar> d_charge_ghost_recvbuf(m_charge_ghost_recvbuf, access_location::device, access_mode::read);
1973 ArrayHandle<Scalar> d_diameter_ghost_recvbuf(m_diameter_ghost_recvbuf, access_location::device, access_mode::read);
1974 ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
1975 // access particle data
1976 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::readwrite);
1977 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
1978 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
1979 ArrayHandle<Scalar> d_charge(m_pdata->getCharges(), access_location::device, access_mode::readwrite);
1980 ArrayHandle<Scalar> d_diameter(m_pdata->getDiameters(), access_location::device, access_mode::readwrite);
1981 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
1983 // copy recv buf into particle data
1984 gpu_exchange_ghosts_copy_buf(
1985 m_n_recv_ghosts_tot[stage],
1986 d_tag_ghost_recvbuf.data,
1987 d_pos_ghost_recvbuf.data,
1988 d_vel_ghost_recvbuf.data,
1989 d_charge_ghost_recvbuf.data,
1990 d_diameter_ghost_recvbuf.data,
1991 d_orientation_ghost_recvbuf.data,
1992 d_tag.data + first_idx,
1993 d_pos.data + first_idx,
1994 d_vel.data + first_idx,
1995 d_charge.data + first_idx,
1996 d_diameter.data + first_idx,
1997 d_orientation.data + first_idx,
1998 flags[comm_flag::tag],
1999 flags[comm_flag::position],
2000 flags[comm_flag::velocity],
2001 flags[comm_flag::charge],
2002 flags[comm_flag::diameter],
2003 flags[comm_flag::orientation]);
2005 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2007 #endif
2009 if (flags[comm_flag::tag])
2011 ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
2012 ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::readwrite);
2014 // update reverse-lookup table
2015 gpu_compute_ghost_rtags(first_idx,
2016 m_n_recv_ghosts_tot[stage],
2017 d_tag.data + first_idx,
2018 d_rtag.data);
2019 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2021 } // end main communication loop
2023 m_last_flags = flags;
2025 if (m_prof) m_prof->pop(m_exec_conf);
2027 // we have updated ghost particles, so notify subscribers about this
2028 m_pdata->notifyGhostParticleNumberChange();
2031 //! Begin ghost particle update
2032 void CommunicatorGPU::beginUpdateGhosts(unsigned int timestep)
2034 m_exec_conf->msg->notice(7) << "CommunicatorGPU: ghost update" << std::endl;
2036 if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_update");
2038 CommFlags flags = getFlags();
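2039 // only positions, velocities and orientations are updated here; tag, charge and diameter were already sent during the last exchange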
2040 // main communication loop
2041 for (unsigned int stage = 0; stage < m_num_stages; ++stage)
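2042 // the ghost send/recv tuners select between mapped and explicitly copied host buffers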
2043 #ifndef ENABLE_MPI_CUDA
2044 bool mapped_ghost_recv = m_tuner_ghost_recv->getParam();
2045 bool mapped_ghost_send = m_tuner_ghost_send->getParam();
2047 // switch between mapped and non-mapped host buffers if the tuner decision changed
2048 if (mapped_ghost_recv != m_mapped_ghost_recv)
2050 m_pos_ghost_recvbuf_alt.resize(m_pos_ghost_recvbuf.size());
2051 m_pos_ghost_recvbuf.swap(m_pos_ghost_recvbuf_alt);
2053 m_vel_ghost_recvbuf_alt.resize(m_vel_ghost_recvbuf.size());
2054 m_vel_ghost_recvbuf.swap(m_vel_ghost_recvbuf_alt);
2056 m_orientation_ghost_recvbuf_alt.resize(m_orientation_ghost_recvbuf.size());
2057 m_orientation_ghost_recvbuf.swap(m_orientation_ghost_recvbuf_alt);
2059 m_tag_ghost_recvbuf_alt.resize(m_tag_ghost_recvbuf.size());
2060 m_tag_ghost_recvbuf.swap(m_tag_ghost_recvbuf_alt);
2062 m_charge_ghost_recvbuf_alt.resize(m_charge_ghost_recvbuf.size());
2063 m_charge_ghost_recvbuf.swap(m_charge_ghost_recvbuf_alt);
2065 m_diameter_ghost_recvbuf_alt.resize(m_diameter_ghost_recvbuf.size());
2066 m_diameter_ghost_recvbuf.swap(m_diameter_ghost_recvbuf_alt);
2068 m_mapped_ghost_recv = mapped_ghost_recv;
2070 if (mapped_ghost_send != m_mapped_ghost_send)
2072 m_pos_ghost_sendbuf_alt.resize(m_pos_ghost_sendbuf.size());
2073 m_pos_ghost_sendbuf.swap(m_pos_ghost_sendbuf_alt);
2075 m_vel_ghost_sendbuf_alt.resize(m_vel_ghost_sendbuf.size());
2076 m_vel_ghost_sendbuf.swap(m_vel_ghost_sendbuf_alt);
2078 m_orientation_ghost_sendbuf_alt.resize(m_orientation_ghost_sendbuf.size());
2079 m_orientation_ghost_sendbuf.swap(m_orientation_ghost_sendbuf_alt);
2081 m_tag_ghost_sendbuf_alt.resize(m_tag_ghost_sendbuf.size());
2082 m_tag_ghost_sendbuf.swap(m_tag_ghost_sendbuf_alt);
2084 m_charge_ghost_sendbuf_alt.resize(m_charge_ghost_sendbuf.size());
2085 m_charge_ghost_sendbuf.swap(m_charge_ghost_sendbuf_alt);
2087 m_diameter_ghost_sendbuf_alt.resize(m_diameter_ghost_sendbuf.size());
2088 m_diameter_ghost_sendbuf.swap(m_diameter_ghost_sendbuf_alt);
2090 m_mapped_ghost_send = mapped_ghost_send;
2092 #endif
2094 if (m_prof) m_prof->push(m_exec_conf,"pack");
2096 // access particle data
2097 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
2098 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::read);
2099 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::read);
2101 // access ghost send indices
2102 ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::read);
2104 #ifndef ENABLE_MPI_CUDA
2105 // begin measurement of device-host transfer time
2106 m_tuner_ghost_send->begin();
2107 #endif
2109 // access output buffers
2110 ArrayHandle<unsigned int> d_tag_ghost_sendbuf(m_tag_ghost_sendbuf, access_location::device, access_mode::overwrite);
2111 ArrayHandle<Scalar4> d_pos_ghost_sendbuf(m_pos_ghost_sendbuf, access_location::device, access_mode::overwrite);
2112 ArrayHandle<Scalar4> d_vel_ghost_sendbuf(m_vel_ghost_sendbuf, access_location::device, access_mode::overwrite);
2113 ArrayHandle<Scalar4> d_orientation_ghost_sendbuf(m_orientation_ghost_sendbuf, access_location::device, access_mode::overwrite);
2115 const BoxDim& global_box = m_pdata->getGlobalBox();
2116 const Index3D& di = m_pdata->getDomainDecomposition()->getDomainIndexer();
2117 uint3 my_pos = m_pdata->getDomainDecomposition()->getGridPos();
2119 // Pack ghosts into send buffers
2120 gpu_exchange_ghosts_pack(
2121 m_n_send_ghosts_tot[stage],
2122 d_ghost_idx_adj.data + m_idx_offs[stage],
2123 NULL,
2124 d_pos.data,
2125 d_vel.data,
2126 NULL,
2127 NULL,
2128 d_orientation.data,
2129 NULL,
2130 d_pos_ghost_sendbuf.data,
2131 d_vel_ghost_sendbuf.data,
2132 NULL,
2133 NULL,
2134 d_orientation_ghost_sendbuf.data,
2135 false,
2136 flags[comm_flag::position],
2137 flags[comm_flag::velocity],
2138 false,
2139 false,
2140 flags[comm_flag::orientation],
2142 my_pos,
2143 global_box);
2145 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2147 if (m_prof) m_prof->pop(m_exec_conf);
2150 * Ghost particle communication
2153 // first ghost ptl index
2154 unsigned int first_idx = m_pdata->getN();
2156 // add ghosts received in previous stages to find this stage's first index
2157 for (unsigned int istage = 0; istage < stage; ++istage)
2158 first_idx += m_n_recv_ghosts_tot[istage];
2161 unsigned int offs = 0;
2162 // access particle data
2163 #ifdef ENABLE_MPI_CUDA
2164 offs = first_idx;
2165 // recv buffers, directly write into particle data arrays
2166 ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
2167 ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
2168 ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
2170 // send buffers
2171 ArrayHandle<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::device, access_mode::read);
2172 ArrayHandle<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::device, access_mode::read);
2173 ArrayHandle<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::device, access_mode::read);
2175 ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
2176 ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
2178 // MPI library may use non-zero stream
2179 cudaDeviceSynchronize();
2180 #else
2181 // recv buffers
2182 ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pos_ghost_recvbuf, access_location::host, access_mode::overwrite);
2183 ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_vel_ghost_recvbuf, access_location::host, access_mode::overwrite);
2184 ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_orientation_ghost_recvbuf, access_location::host, access_mode::overwrite);
2186 // send buffers
2187 ArrayHandleAsync<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::host, access_mode::read);
2188 ArrayHandleAsync<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::host, access_mode::read);
2189 ArrayHandleAsync<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::host, access_mode::read);
2191 // end measurement of device-host transfer time
2192 m_tuner_ghost_send->end();
2194 ArrayHandleAsync<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
2195 ArrayHandleAsync<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
2197 // lump together into one synchronization call
2198 cudaEventRecord(m_event);
2199 cudaEventSynchronize(m_event);
2200 #endif
2202 // access send buffers
2203 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
2205 m_reqs.clear();
2206 MPI_Request req;
2208 unsigned int send_bytes = 0;
2209 unsigned int recv_bytes = 0;
2211 // loop over neighbors
2212 for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
2214 // rank of neighbor processor
2215 unsigned int neighbor = h_unique_neighbors.data[ineigh];
2217 if (flags[comm_flag::position])
2219 if (m_n_send_ghosts[stage][ineigh])
2221 MPI_Isend(pos_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
2222 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
2223 MPI_BYTE,
2224 neighbor,
2226 m_mpi_comm,
2227 &req);
2228 m_reqs.push_back(req);
2230 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
2232 if (m_n_recv_ghosts[stage][ineigh])
2234 MPI_Irecv(pos_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
2235 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
2236 MPI_BYTE,
2237 neighbor,
2239 m_mpi_comm,
2240 &req);
2241 m_reqs.push_back(req);
2243 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
2246 if (flags[comm_flag::velocity])
2248 if (m_n_send_ghosts[stage][ineigh])
2250 MPI_Isend(vel_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
2251 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
2252 MPI_BYTE,
2253 neighbor,
2255 m_mpi_comm,
2256 &req);
2257 m_reqs.push_back(req);
2259 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
2261 if (m_n_recv_ghosts[stage][ineigh])
2263 MPI_Irecv(vel_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
2264 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
2265 MPI_BYTE,
2266 neighbor,
2268 m_mpi_comm,
2269 &req);
2270 m_reqs.push_back(req);
2272 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
2275 if (flags[comm_flag::orientation])
2277 if (m_n_send_ghosts[stage][ineigh])
2279 MPI_Isend(orientation_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
2280 m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
2281 MPI_BYTE,
2282 neighbor,
2284 m_mpi_comm,
2285 &req);
2286 m_reqs.push_back(req);
2288 send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);
2290 if (m_n_recv_ghosts[stage][ineigh])
2292 MPI_Irecv(orientation_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
2293 m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
2294 MPI_BYTE,
2295 neighbor,
2297 m_mpi_comm,
2298 &req);
2299 m_reqs.push_back(req);
2301 recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
2303 } // end neighbor loop
2305 if (m_num_stages == 1)
2307 // use non-blocking MPI
2308 m_comm_pending = true;
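2309 // the requests are completed later in finishUpdateGhosts(), allowing communication to overlap with computation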
2310 else
2312 // complete communication
2313 std::vector<MPI_Status> stats(m_reqs.size());
2314 MPI_Waitall(m_reqs.size(), &m_reqs.front(), &stats.front());
2317 if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
2318 } // end ArrayHandle scope
2321 if (!m_comm_pending)
2323 #ifndef ENABLE_MPI_CUDA
2324 // only unpack in non-CUDA MPI builds
2325 if (m_prof) m_prof->push(m_exec_conf,"unpack");
2327 // begin measurement of host-device transfer time for received ghosts
2328 m_tuner_ghost_recv->begin();
2330 // access receive buffers
2331 ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
2332 ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
2333 ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
2334 // access particle data
2335 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
2336 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
2337 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
2339 // copy recv buf into particle data
2340 gpu_exchange_ghosts_copy_buf(
2341 m_n_recv_ghosts_tot[stage],
2342 NULL,
2343 d_pos_ghost_recvbuf.data,
2344 d_vel_ghost_recvbuf.data,
2345 NULL,
2346 NULL,
2347 d_orientation_ghost_recvbuf.data,
2348 NULL,
2349 d_pos.data + first_idx,
2350 d_vel.data + first_idx,
2351 NULL,
2352 NULL,
2353 d_orientation.data + first_idx,
2354 false,
2355 flags[comm_flag::position],
2356 flags[comm_flag::velocity],
2357 false,
2358 false,
2359 flags[comm_flag::orientation]);
2361 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2363 // end measurement
2364 m_tuner_ghost_recv->end();
2366 if (m_prof) m_prof->pop(m_exec_conf);
2367 #endif
2369 } // end main communication loop
2371 if (m_prof) m_prof->pop(m_exec_conf);
2374 /*! Finish ghost update
2376 * \param timestep The time step
2378 void CommunicatorGPU::finishUpdateGhosts(unsigned int timestep)
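2379 // completes any non-blocking communication started in beginUpdateGhosts() and unpacks the received ghost data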
2380 if (m_comm_pending)
2382 m_comm_pending = false;
2384 if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_update");
2386 // complete communication
2387 if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
2388 std::vector<MPI_Status> stats(m_reqs.size());
2389 MPI_Waitall(m_reqs.size(), &m_reqs.front(), &stats.front());
2390 if (m_prof) m_prof->pop(m_exec_conf);
2392 #ifdef ENABLE_MPI_CUDA
2393 // MPI library may use non-zero stream
2394 cudaDeviceSynchronize();
2395 #else
2396 // only unpack in non-CUDA-MPI builds
2397 assert(m_num_stages == 1);
2398 unsigned int stage = 0;
2399 unsigned int first_idx = m_pdata->getN();
2400 CommFlags flags = m_last_flags;
2401 if (m_prof) m_prof->push(m_exec_conf,"unpack");
2403 // begin measurement of host-device transfer time for received ghosts
2404 m_tuner_ghost_recv->begin();
2406 // access receive buffers
2407 ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
2408 ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
2409 ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
2410 // access particle data
2411 ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
2412 ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
2413 ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);
2415 // copy recv buf into particle data
2416 gpu_exchange_ghosts_copy_buf(
2417 m_n_recv_ghosts_tot[stage],
2418 NULL,
2419 d_pos_ghost_recvbuf.data,
2420 d_vel_ghost_recvbuf.data,
2421 NULL,
2422 NULL,
2423 d_orientation_ghost_recvbuf.data,
2424 NULL,
2425 d_pos.data + first_idx,
2426 d_vel.data + first_idx,
2427 NULL,
2428 NULL,
2429 d_orientation.data + first_idx,
2430 false,
2431 flags[comm_flag::position],
2432 flags[comm_flag::velocity],
2433 false,
2434 false,
2435 flags[comm_flag::orientation]);
2437 if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
2439 // end measurement
2440 m_tuner_ghost_recv->end();
2442 if (m_prof) m_prof->pop(m_exec_conf);
2443 #endif
2445 if (m_prof) m_prof->pop(m_exec_conf);
2449 //! Export CommunicatorGPU class to Python
2450 void export_CommunicatorGPU()
2452 boost::python::class_<CommunicatorGPU, boost::python::bases<Communicator>, boost::shared_ptr<CommunicatorGPU>, boost::noncopyable>("CommunicatorGPU",
2453 boost::python::init<boost::shared_ptr<SystemDefinition>,
2454 boost::shared_ptr<DomainDecomposition> >())
2455 .def("setMaxStages",&CommunicatorGPU::setMaxStages)
2459 #endif // ENABLE_CUDA
2460 #endif // ENABLE_MPI