/*
Highly Optimized Object-oriented Many-particle Dynamics -- Blue Edition
(HOOMD-blue) Open Source Software License Copyright 2009-2014 The Regents of
the University of Michigan All rights reserved.

HOOMD-blue may contain modifications ("Contributions") provided, and to which
copyright is held, by various Contributors who have granted The Regents of the
University of Michigan the right to modify and/or distribute such Contributions.

You may redistribute, use, and create derivate works of HOOMD-blue, in source
and binary forms, provided you abide by the following conditions:

* Redistributions of source code must retain the above copyright notice, this
list of conditions, and the following disclaimer both in the code and
prominently in any materials provided with the distribution.

* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions, and the following disclaimer in the documentation and/or
other materials provided with the distribution.

* All publications and presentations based on HOOMD-blue, including any reports
or published results obtained, in whole or in part, with HOOMD-blue, will
acknowledge its use according to the terms posted at the time of submission on:
http://codeblue.umich.edu/hoomd-blue/citations.html

* Any electronic documents citing HOOMD-Blue will link to the HOOMD-Blue website:
http://codeblue.umich.edu/hoomd-blue/

* Apart from the above required attributions, neither the name of the copyright
holder nor the names of HOOMD-blue's contributors may be used to endorse or
promote products derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND/OR ANY
WARRANTIES THAT THIS SOFTWARE IS FREE OF INFRINGEMENT ARE DISCLAIMED.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Maintainer: jglaser

/*! \file CommunicatorGPU.cc
    \brief Implements the CommunicatorGPU class
*/

#include "CommunicatorGPU.h"

#include <boost/python.hpp>
CommunicatorGPU::CommunicatorGPU(boost::shared_ptr<SystemDefinition> sysdef,
                                 boost::shared_ptr<DomainDecomposition> decomposition)
    : Communicator(sysdef, decomposition),
      /* ... further member initializers elided in this excerpt ... */
      m_bond_comm(*this, m_sysdef->getBondData()),
      m_angle_comm(*this, m_sysdef->getAngleData()),
      m_dihedral_comm(*this, m_sysdef->getDihedralData()),
      m_improper_comm(*this, m_sysdef->getImproperData())
    {
    #ifndef ENABLE_MPI_CUDA
    m_mapped_ghost_recv = true;
    m_mapped_ghost_send = true;
    #else
    // do not use host buffers with CUDA-aware MPI
    m_mapped_ghost_recv = false;
    m_mapped_ghost_send = false;
    #endif

    // initialize communication stages
    initializeCommunicationStages();

    // Initialize cache configuration
    gpu_communicator_initialize_cache_config();

    // create a ModernGPU context
    m_mgpu_context = mgpu::CreateCudaDeviceAttachStream(0);

    GPUArray<unsigned int> begin(NEIGH_MAX, m_exec_conf, true);
    m_begin.swap(begin);

    GPUArray<unsigned int> end(NEIGH_MAX, m_exec_conf, true);
    m_end.swap(end);

    // cudaEventCreate(&m_event, cudaEventBlockingSync | cudaEventDisableTiming);
    cudaEventCreate(&m_event, cudaEventDisableTiming);

    #ifndef ENABLE_MPI_CUDA
    // set up autotuners to determine whether to use mapped memory (boolean values)
    std::vector<unsigned int> valid_params(2);
    valid_params[0] = 0; valid_params[1] = 1;

    m_tuner_ghost_recv.reset(new Autotuner(valid_params, 5, 100000, "comm_ghost_recv_mapped", this->m_exec_conf));
    m_tuner_ghost_send.reset(new Autotuner(valid_params, 5, 100000, "comm_ghost_send_mapped", this->m_exec_conf));

    // synchronize autotuner results across ranks
    m_tuner_ghost_recv->setSync(true);
    m_tuner_ghost_send->setSync(true);
    #endif
    }
CommunicatorGPU::~CommunicatorGPU()
    {
    m_exec_conf->msg->notice(5) << "Destroying CommunicatorGPU";
    cudaEventDestroy(m_event);
    }
void CommunicatorGPU::allocateBuffers()
    {
    #ifndef ENABLE_MPI_CUDA
    // mapped buffers for particle migration
    bool mapped = true;
    #else
    bool mapped = false;
    #endif

    GPUVector<pdata_element> gpu_sendbuf(m_exec_conf, mapped);
    m_gpu_sendbuf.swap(gpu_sendbuf);

    GPUVector<pdata_element> gpu_recvbuf(m_exec_conf, mapped);
    m_gpu_recvbuf.swap(gpu_recvbuf);

    // Communication flags for every particle sent
    GPUVector<unsigned int> comm_flags(m_exec_conf);
    m_comm_flags.swap(comm_flags);

    // Key for every particle sent
    GPUVector<unsigned int> send_keys(m_exec_conf);
    m_send_keys.swap(send_keys);
    /*
     * Ghost communication
     */

    GPUVector<unsigned int> tag_ghost_sendbuf(m_exec_conf, m_mapped_ghost_send);
    m_tag_ghost_sendbuf.swap(tag_ghost_sendbuf);

    GPUVector<unsigned int> tag_ghost_recvbuf(m_exec_conf, m_mapped_ghost_recv);
    m_tag_ghost_recvbuf.swap(tag_ghost_recvbuf);

    GPUVector<Scalar4> pos_ghost_sendbuf(m_exec_conf, m_mapped_ghost_send);
    m_pos_ghost_sendbuf.swap(pos_ghost_sendbuf);

    GPUVector<Scalar4> pos_ghost_recvbuf(m_exec_conf, m_mapped_ghost_recv);
    m_pos_ghost_recvbuf.swap(pos_ghost_recvbuf);

    GPUVector<Scalar4> vel_ghost_sendbuf(m_exec_conf, m_mapped_ghost_send);
    m_vel_ghost_sendbuf.swap(vel_ghost_sendbuf);

    GPUVector<Scalar4> vel_ghost_recvbuf(m_exec_conf, m_mapped_ghost_recv);
    m_vel_ghost_recvbuf.swap(vel_ghost_recvbuf);

    GPUVector<Scalar> charge_ghost_sendbuf(m_exec_conf, m_mapped_ghost_send);
    m_charge_ghost_sendbuf.swap(charge_ghost_sendbuf);

    GPUVector<Scalar> charge_ghost_recvbuf(m_exec_conf, m_mapped_ghost_recv);
    m_charge_ghost_recvbuf.swap(charge_ghost_recvbuf);

    GPUVector<Scalar> diameter_ghost_sendbuf(m_exec_conf, m_mapped_ghost_send);
    m_diameter_ghost_sendbuf.swap(diameter_ghost_sendbuf);

    GPUVector<Scalar> diameter_ghost_recvbuf(m_exec_conf, m_mapped_ghost_recv);
    m_diameter_ghost_recvbuf.swap(diameter_ghost_recvbuf);

    GPUVector<Scalar4> orientation_ghost_sendbuf(m_exec_conf, m_mapped_ghost_send);
    m_orientation_ghost_sendbuf.swap(orientation_ghost_sendbuf);

    GPUVector<Scalar4> orientation_ghost_recvbuf(m_exec_conf, m_mapped_ghost_recv);
    m_orientation_ghost_recvbuf.swap(orientation_ghost_recvbuf);
    #ifndef ENABLE_MPI_CUDA
    // initialize standby buffers (opposite mapped setting)
    GPUVector<unsigned int> tag_ghost_sendbuf_alt(m_exec_conf, !m_mapped_ghost_send);
    m_tag_ghost_sendbuf_alt.swap(tag_ghost_sendbuf_alt);

    GPUVector<unsigned int> tag_ghost_recvbuf_alt(m_exec_conf, !m_mapped_ghost_recv);
    m_tag_ghost_recvbuf_alt.swap(tag_ghost_recvbuf_alt);

    GPUVector<Scalar4> pos_ghost_sendbuf_alt(m_exec_conf, !m_mapped_ghost_send);
    m_pos_ghost_sendbuf_alt.swap(pos_ghost_sendbuf_alt);

    GPUVector<Scalar4> pos_ghost_recvbuf_alt(m_exec_conf, !m_mapped_ghost_recv);
    m_pos_ghost_recvbuf_alt.swap(pos_ghost_recvbuf_alt);

    GPUVector<Scalar4> vel_ghost_sendbuf_alt(m_exec_conf, !m_mapped_ghost_send);
    m_vel_ghost_sendbuf_alt.swap(vel_ghost_sendbuf_alt);

    GPUVector<Scalar4> vel_ghost_recvbuf_alt(m_exec_conf, !m_mapped_ghost_recv);
    m_vel_ghost_recvbuf_alt.swap(vel_ghost_recvbuf_alt);

    GPUVector<Scalar> charge_ghost_sendbuf_alt(m_exec_conf, !m_mapped_ghost_send);
    m_charge_ghost_sendbuf_alt.swap(charge_ghost_sendbuf_alt);

    GPUVector<Scalar> charge_ghost_recvbuf_alt(m_exec_conf, !m_mapped_ghost_recv);
    m_charge_ghost_recvbuf_alt.swap(charge_ghost_recvbuf_alt);

    GPUVector<Scalar> diameter_ghost_sendbuf_alt(m_exec_conf, !m_mapped_ghost_send);
    m_diameter_ghost_sendbuf_alt.swap(diameter_ghost_sendbuf_alt);

    GPUVector<Scalar> diameter_ghost_recvbuf_alt(m_exec_conf, !m_mapped_ghost_recv);
    m_diameter_ghost_recvbuf_alt.swap(diameter_ghost_recvbuf_alt);

    GPUVector<Scalar4> orientation_ghost_sendbuf_alt(m_exec_conf, !m_mapped_ghost_send);
    m_orientation_ghost_sendbuf_alt.swap(orientation_ghost_sendbuf_alt);

    GPUVector<Scalar4> orientation_ghost_recvbuf_alt(m_exec_conf, !m_mapped_ghost_recv);
    m_orientation_ghost_recvbuf_alt.swap(orientation_ghost_recvbuf_alt);
    #endif
    GPUVector<unsigned int> ghost_begin(m_exec_conf, true);
    m_ghost_begin.swap(ghost_begin);

    GPUVector<unsigned int> ghost_end(m_exec_conf, true);
    m_ghost_end.swap(ghost_end);

    GPUVector<unsigned int> ghost_plan(m_exec_conf);
    m_ghost_plan.swap(ghost_plan);

    GPUVector<uint2> ghost_idx_adj(m_exec_conf);
    m_ghost_idx_adj.swap(ghost_idx_adj);

    GPUVector<unsigned int> ghost_neigh(m_exec_conf);
    m_ghost_neigh.swap(ghost_neigh);

    GPUVector<unsigned int> neigh_counts(m_exec_conf);
    m_neigh_counts.swap(neigh_counts);
    }
void CommunicatorGPU::initializeCommunicationStages()
    {
    // sanity check for user input
    if (m_max_stages == 0)
        {
        m_exec_conf->msg->warning()
            << "Maximum number of communication stages needs to be greater than zero. Assuming one."
            << std::endl;
        m_max_stages = 1;
        }

    if (m_max_stages > 3)
        {
        m_exec_conf->msg->warning()
            << "Maximum number of communication stages too large. Assuming three."
            << std::endl;
        m_max_stages = 3;
        }

    // access neighbors and adjacency array
    ArrayHandle<unsigned int> h_adj_mask(m_adj_mask, access_location::host, access_mode::read);

    Index3D di = m_decomposition->getDomainIndexer();

    // number of stages in every communication step
    m_comm_mask.resize(m_max_stages, 0);

    const unsigned int mask_east = 1 << 2 | 1 << 5 | 1 << 8 | 1 << 11
        | 1 << 14 | 1 << 17 | 1 << 20 | 1 << 23 | 1 << 26;
    const unsigned int mask_west = mask_east >> 2;
    const unsigned int mask_north = 1 << 6 | 1 << 7 | 1 << 8 | 1 << 15
        | 1 << 16 | 1 << 17 | 1 << 24 | 1 << 25 | 1 << 26;
    const unsigned int mask_south = mask_north >> 6;
    const unsigned int mask_up = 1 << 18 | 1 << 19 | 1 << 20 | 1 << 21
        | 1 << 22 | 1 << 23 | 1 << 24 | 1 << 25 | 1 << 26;
    const unsigned int mask_down = mask_up >> 18;
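
    // Editorial note (illustrative, not from the original source): the masks above appear
    // to assume the usual 3x3x3 neighbor enumeration, in which the neighbor at offset
    // (ix,iy,iz), with each component in {-1,0,+1}, occupies bit
    //
    //     bit(ix,iy,iz) = (ix+1) + 3*(iy+1) + 9*(iz+1)
    //
    // Under that convention the east face (ix=+1) collects bits 2,5,8,...,26, which is
    // exactly mask_east, and shifting it right by 2 maps every (+1,iy,iz) bit onto the
    // corresponding (-1,iy,iz) bit, giving mask_west; the north/south and up/down pairs
    // follow the same pattern with shifts of 6 and 18.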
    // loop through neighbors to determine the communication stages
    std::vector<unsigned int> neigh_flags(m_n_unique_neigh);
    unsigned int max_stage = 0;
    for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ++ineigh)
        {
        int stage = -1;
        int n = -1;

        if (di.getW() > 1 && (n+1 < (int) m_max_stages)) n++;
        if (h_adj_mask.data[ineigh] & (mask_east | mask_west)) stage = n;
        if (di.getH() > 1 && (n+1 < (int) m_max_stages)) n++;
        if (h_adj_mask.data[ineigh] & (mask_north | mask_south)) stage = n;
        if (di.getD() > 1 && (n+1 < (int) m_max_stages)) n++;
        if (h_adj_mask.data[ineigh] & (mask_up | mask_down)) stage = n;

        unsigned int mask = 0;
        if (h_adj_mask.data[ineigh] & mask_east)
            mask |= send_east;
        if (h_adj_mask.data[ineigh] & mask_west)
            mask |= send_west;
        if (h_adj_mask.data[ineigh] & mask_north)
            mask |= send_north;
        if (h_adj_mask.data[ineigh] & mask_south)
            mask |= send_south;
        if (h_adj_mask.data[ineigh] & mask_up)
            mask |= send_up;
        if (h_adj_mask.data[ineigh] & mask_down)
            mask |= send_down;

        neigh_flags[ineigh] = mask;

        // set communication flags for stage
        m_comm_mask[stage] |= mask;

        if (stage > (int)max_stage) max_stage = stage;
        }
    // number of communication stages
    m_num_stages = max_stage + 1;

    // every direction occurs in one and only one stage
    // number of communications per stage is constant or decreases with stage number
    for (unsigned int istage = 0; istage < m_num_stages; ++istage)
        for (unsigned int jstage = istage+1; jstage < m_num_stages; ++jstage)
            m_comm_mask[jstage] &= ~m_comm_mask[istage];

    // access unique neighbors
    ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);

    // initialize stages array
    m_stages.resize(m_n_unique_neigh, -1);

    // assign stages to unique neighbors
    for (unsigned int i = 0; i < m_n_unique_neigh; i++)
        for (unsigned int istage = 0; istage < m_num_stages; ++istage)
            // compare adjacency masks of neighbors to the mask for this stage
            if ((neigh_flags[i] & m_comm_mask[istage]) == neigh_flags[i])
                {
                m_stages[i] = istage;
                break; // associate neighbor with stage of lowest index
                }

    m_exec_conf->msg->notice(4) << "CommunicatorGPU: Using " << m_num_stages
        << " communication stage(s)." << std::endl;
    }
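
/* Editorial note (illustrative sketch, not from the original source): as a concrete
   example of the stage assignment above, consider a decomposition split only along x
   and y (di.getW() > 1, di.getH() > 1, di.getD() == 1) with m_max_stages >= 2. East/west
   communication is then handled in stage 0 and north/south in stage 1; a particle
   leaving through a corner travels east or west in stage 0 and is forwarded north or
   south in stage 1 by the intermediate rank, so no rank has to message a diagonal
   neighbor directly. With m_max_stages == 1 all directions collapse into a single stage
   and the diagonal neighbors are messaged directly instead. */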
//! Select a particle for migration
struct get_migrate_key : public std::unary_function<const unsigned int, unsigned int>
    {
    const uint3 my_pos;                //!< My domain decomposition position
    const Index3D di;                  //!< Domain indexer
    const unsigned int mask;           //!< Mask of allowed directions
    const unsigned int *h_cart_ranks;  //!< Rank lookup table

    //! Constructor
    get_migrate_key(const uint3 _my_pos, const Index3D _di, const unsigned int _mask,
        const unsigned int *_h_cart_ranks)
        : my_pos(_my_pos), di(_di), mask(_mask), h_cart_ranks(_h_cart_ranks)
        { }

    //! Generate key for a sent particle
    unsigned int operator()(const unsigned int flags)
        {
        // take at most one step per axis, restricted to the allowed directions
        int ix, iy, iz;
        ix = iy = iz = 0;

        if ((flags & Communicator::send_east) && (mask & Communicator::send_east))
            ix = 1;
        else if ((flags & Communicator::send_west) && (mask & Communicator::send_west))
            ix = -1;

        if ((flags & Communicator::send_north) && (mask & Communicator::send_north))
            iy = 1;
        else if ((flags & Communicator::send_south) && (mask & Communicator::send_south))
            iy = -1;

        if ((flags & Communicator::send_up) && (mask & Communicator::send_up))
            iz = 1;
        else if ((flags & Communicator::send_down) && (mask & Communicator::send_down))
            iz = -1;

        // sanity check: particle has to be sent somewhere
        assert(ix || iy || iz);

        // compute destination grid coordinates with periodic wrapping
        int i = my_pos.x + ix;
        if (i == (int)di.getW())
            i = 0;
        else if (i < 0)
            i += di.getW();

        int j = my_pos.y + iy;
        if (j == (int)di.getH())
            j = 0;
        else if (j < 0)
            j += di.getH();

        int k = my_pos.z + iz;
        if (k == (int)di.getD())
            k = 0;
        else if (k < 0)
            k += di.getD();

        return h_cart_ranks[di(i,j,k)];
        }
    };
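
// Illustrative usage sketch (editorial addition, not from the original source): the
// functor above converts a particle's communication-flag word into the MPI rank of the
// domain it migrates to. A host-side caller uses it roughly like this (see the CPU
// fallback path in migrateParticles() below):
//
//     get_migrate_key key(my_pos, di, comm_mask, h_cart_ranks.data);
//     unsigned int dest_rank = key(particle_comm_flags);   // Communicator::send_* bits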
template<class group_data>
CommunicatorGPU::GroupCommunicatorGPU<group_data>::GroupCommunicatorGPU(CommunicatorGPU& gpu_comm, boost::shared_ptr<group_data> gdata)
    : m_gpu_comm(gpu_comm), m_exec_conf(m_gpu_comm.m_exec_conf), m_gdata(gdata)
    {
    // accelerate copying of data for host MPI
    #ifdef ENABLE_MPI_CUDA
    bool mapped = false;
    #else
    bool mapped = true;
    #endif

    GPUVector<unsigned int> rank_mask(m_gpu_comm.m_exec_conf);
    m_rank_mask.swap(rank_mask);

    GPUVector<unsigned int> scratch(m_gpu_comm.m_exec_conf);
    m_scan.swap(scratch);

    GPUVector<rank_element_t> ranks_out(m_gpu_comm.m_exec_conf, mapped);
    m_ranks_out.swap(ranks_out);

    GPUVector<rank_element_t> ranks_sendbuf(m_gpu_comm.m_exec_conf, mapped);
    m_ranks_sendbuf.swap(ranks_sendbuf);

    GPUVector<rank_element_t> ranks_recvbuf(m_gpu_comm.m_exec_conf, mapped);
    m_ranks_recvbuf.swap(ranks_recvbuf);

    GPUVector<group_element_t> groups_out(m_gpu_comm.m_exec_conf, mapped);
    m_groups_out.swap(groups_out);

    GPUVector<unsigned int> rank_mask_out(m_gpu_comm.m_exec_conf, mapped);
    m_rank_mask_out.swap(rank_mask_out);

    GPUVector<group_element_t> groups_sendbuf(m_gpu_comm.m_exec_conf, mapped);
    m_groups_sendbuf.swap(groups_sendbuf);

    GPUVector<group_element_t> groups_recvbuf(m_gpu_comm.m_exec_conf, mapped);
    m_groups_recvbuf.swap(groups_recvbuf);

    GPUVector<group_element_t> groups_in(m_gpu_comm.m_exec_conf, mapped);
    m_groups_in.swap(groups_in);

    // the size of the bit field must be larger than or equal to the group size
    assert(sizeof(unsigned int)*8 >= group_data::size);
    }
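
// Editorial note (illustrative, not from the original source): the assertion above is
// needed because the per-group rank masks (m_rank_mask and the mask field carried by
// rank_element_t) appear to use one bit per group member; the migrate path below tests
// bit j via 'mask & (1 << j)' to decide whether member j's owning rank already has
// up-to-date information, so a single unsigned int can only track groups with at most
// 8*sizeof(unsigned int) members. A dihedral (group_data::size == 4), for example, only
// ever uses the low four bits of its entry.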
template<class group_data>
void CommunicatorGPU::GroupCommunicatorGPU<group_data>::migrateGroups(bool incomplete)
    {
    if (m_gdata->getNGlobal())
        {
        m_exec_conf->msg->notice(7) << "GroupCommunicator<" << m_gdata->getName() << ">: migrate" << std::endl;

        if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, m_gdata->getName());

        m_rank_mask.resize(m_gdata->getN());

        // resize temporary array
        m_scan.resize(m_gdata->getN());

        unsigned int n_out_ranks;
            {
            ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::read);
            ArrayHandle<typename group_data::members_t> d_members(m_gdata->getMembersArray(), access_location::device, access_mode::read);
            ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
            ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::overwrite);

            boost::shared_ptr<DomainDecomposition> decomposition = m_gpu_comm.m_pdata->getDomainDecomposition();
            ArrayHandle<unsigned int> d_cart_ranks(decomposition->getCartRanks(), access_location::device, access_mode::read);

            Index3D di = decomposition->getDomainIndexer();
            uint3 my_pos = decomposition->getGridPos();

            // mark groups that have members leaving this domain
            gpu_mark_groups<group_data::size>(
                m_gpu_comm.m_pdata->getN(),
                /* ... further arguments elided in this excerpt ... */
                m_gpu_comm.m_mgpu_context);

            if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }

        // resize output array
        m_ranks_out.resize(n_out_ranks);

        unsigned int n_out_groups;
            {
            ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
            ArrayHandle<typename group_data::members_t> d_members(m_gdata->getMembersArray(), access_location::device, access_mode::read);
            ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::readwrite);

            ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::readwrite);

            ArrayHandle<rank_element_t> d_ranks_out(m_ranks_out, access_location::device, access_mode::overwrite);

            ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);

            // scatter groups into output arrays according to scan result (d_scan), determine send groups and scan
            gpu_scatter_ranks_and_mark_send_groups<group_data::size>(
                /* ... further arguments elided in this excerpt ... */
                m_gpu_comm.m_mgpu_context);

            if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }
        #if defined(ENABLE_MPI_CUDA) && 0
        // fill host send buffers on host
        unsigned int my_rank = m_gpu_comm.m_exec_conf->getRank();

        typedef std::multimap<unsigned int, rank_element_t> map_t;
        map_t send_map;

            {
            // access output buffers
            ArrayHandle<rank_element_t> h_ranks_out(m_ranks_out, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);

            for (unsigned int i = 0; i < n_out_ranks; ++i)
                {
                rank_element_t el = h_ranks_out.data[i];
                typename group_data::ranks_t r = el.ranks;
                unsigned int mask = el.mask;

                if (incomplete)
                    // in initialization, send to all neighbors
                    for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ++ineigh)
                        send_map.insert(std::make_pair(h_unique_neighbors.data[ineigh], el));
                else
                    // send to other ranks owning the bonded group
                    for (unsigned int j = 0; j < group_data::size; ++j)
                        {
                        unsigned int rank = r.idx[j];
                        bool updated = mask & (1 << j);
                        // send out to ranks different from ours
                        if (rank != my_rank && !updated)
                            send_map.insert(std::make_pair(rank, el));
                        }
                }
            }

        // resize send buffers
        m_ranks_sendbuf.resize(send_map.size());

            {
            // access send buffers
            ArrayHandle<rank_element_t> h_ranks_sendbuf(m_ranks_sendbuf, access_location::host, access_mode::overwrite);

            // output send data sorted by rank
            unsigned int n = 0;
            for (typename map_t::iterator it = send_map.begin(); it != send_map.end(); ++it)
                {
                h_ranks_sendbuf.data[n] = it->second;
                n++;
                }

            ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::overwrite);
            ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::overwrite);

            // Find start and end indices
            for (unsigned int i = 0; i < m_gpu_comm.m_n_unique_neigh; ++i)
                {
                typename map_t::iterator lower = send_map.lower_bound(h_unique_neighbors.data[i]);
                typename map_t::iterator upper = send_map.upper_bound(h_unique_neighbors.data[i]);
                h_begin.data[i] = std::distance(send_map.begin(), lower);
                h_end.data[i] = std::distance(send_map.begin(), upper);
                }
            }
        #endif
        /*
         * communicate rank information (phase 1)
         */
        unsigned int n_send_groups[m_gpu_comm.m_n_unique_neigh];
        unsigned int n_recv_groups[m_gpu_comm.m_n_unique_neigh];
        unsigned int offs[m_gpu_comm.m_n_unique_neigh];
        unsigned int n_recv_tot = 0;

            {
            ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);

            unsigned int send_bytes = 0;
            unsigned int recv_bytes = 0;
            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");

            // compute send counts
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                n_send_groups[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];

            MPI_Request req[2*m_gpu_comm.m_n_unique_neigh];
            MPI_Status stat[2*m_gpu_comm.m_n_unique_neigh];

            unsigned int nreq = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                {
                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                MPI_Isend(&n_send_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, &req[nreq++]);
                MPI_Irecv(&n_recv_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, &req[nreq++]);
                send_bytes += sizeof(unsigned int);
                recv_bytes += sizeof(unsigned int);
                } // end neighbor loop

            MPI_Waitall(nreq, req, stat);

            // sum up receive counts
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                {
                if (ineigh == 0)
                    offs[ineigh] = 0;
                else
                    offs[ineigh] = offs[ineigh-1] + n_recv_groups[ineigh-1];

                n_recv_tot += n_recv_groups[ineigh];
                }

            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf, 0, send_bytes+recv_bytes);
            }

        // Resize receive buffer
        m_ranks_recvbuf.resize(n_recv_tot);
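
        // Editorial note (illustrative, not from the original source): the block above is
        // the "exchange counts first" handshake used throughout this file. Each rank posts
        // one nonblocking send and one nonblocking receive of a single unsigned int per
        // unique neighbor, waits on all of them, and converts the received counts into an
        // exclusive prefix sum so that neighbor i's payload can later land at offset
        // offs[i] of one contiguous receive buffer:
        //
        //     offs[0] = 0;
        //     for (unsigned int i = 1; i < n_neigh; ++i)
        //         offs[i] = offs[i-1] + n_recv[i-1];
        //     // n_recv_tot == offs[n_neigh-1] + n_recv[n_neigh-1]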
            {
            ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);

            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");

            #ifdef ENABLE_MPI_CUDA
            ArrayHandle<rank_element_t> ranks_sendbuf_handle(m_ranks_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<rank_element_t> ranks_recvbuf_handle(m_ranks_recvbuf, access_location::device, access_mode::overwrite);

            // MPI library may use non-zero stream
            cudaDeviceSynchronize();
            #else
            ArrayHandle<rank_element_t> ranks_sendbuf_handle(m_ranks_sendbuf, access_location::host, access_mode::read);
            ArrayHandle<rank_element_t> ranks_recvbuf_handle(m_ranks_recvbuf, access_location::host, access_mode::overwrite);
            #endif

            std::vector<MPI_Request> reqs;
            MPI_Request req;

            unsigned int send_bytes = 0;
            unsigned int recv_bytes = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                {
                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                // exchange particle data
                if (n_send_groups[ineigh])
                    {
                    MPI_Isend(ranks_sendbuf_handle.data+h_begin.data[ineigh],
                        n_send_groups[ineigh]*sizeof(rank_element_t),
                        /* ... datatype, destination, and tag elided in this excerpt ... */
                        m_gpu_comm.m_mpi_comm,
                        &req);
                    reqs.push_back(req);
                    }
                send_bytes += n_send_groups[ineigh]*sizeof(rank_element_t);

                if (n_recv_groups[ineigh])
                    {
                    MPI_Irecv(ranks_recvbuf_handle.data+offs[ineigh],
                        n_recv_groups[ineigh]*sizeof(rank_element_t),
                        /* ... datatype, source, and tag elided in this excerpt ... */
                        m_gpu_comm.m_mpi_comm,
                        &req);
                    reqs.push_back(req);
                    }
                recv_bytes += n_recv_groups[ineigh]*sizeof(rank_element_t);
                }

            std::vector<MPI_Status> stats(reqs.size());
            MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());

            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf, 0, send_bytes+recv_bytes);
            }
            {
            // access receive buffers
            ArrayHandle<rank_element_t> d_ranks_recvbuf(m_ranks_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
            ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::read);

            // update local rank information
            gpu_update_ranks_table<group_data::size>(
                /* ... further arguments elided in this excerpt ... */
                d_ranks_recvbuf.data);

            if (m_gpu_comm.m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }

        // resize output buffer
        m_groups_out.resize(n_out_groups);
        m_rank_mask_out.resize(n_out_groups);
            {
            ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);
            ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_rank_mask(m_rank_mask, access_location::device, access_mode::readwrite);
            ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);
            ArrayHandle<group_element_t> d_groups_out(m_groups_out, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_rank_mask_out(m_rank_mask_out, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_comm_flags(m_gpu_comm.m_pdata->getCommFlags(), access_location::device, access_mode::read);

            // scatter groups to be sent into output buffer, mark groups that have no local members for removal
            gpu_scatter_and_mark_groups_for_removal<group_data::size>(
                /* ... further arguments elided in this excerpt ... */
                m_exec_conf->getRank(),
                /* ... */
                d_rank_mask_out.data);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }
        unsigned int new_ngroups;
            {
            // access primary arrays to read from
            ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::read);
            ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);

            // access alternate arrays to write to
            ArrayHandle<typename group_data::members_t> d_groups_alt(m_gdata->getAltMembersArray(), access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_group_type_alt(m_gdata->getAltTypesArray(), access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_group_tag_alt(m_gdata->getAltTags(), access_location::device, access_mode::overwrite);
            ArrayHandle<typename group_data::ranks_t> d_group_ranks_alt(m_gdata->getAltRanksArray(), access_location::device, access_mode::overwrite);

            ArrayHandle<unsigned int> d_scan(m_scan, access_location::device, access_mode::readwrite);

            ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);

            unsigned int ngroups = m_gdata->getN();

            // remove groups from local table
            gpu_remove_groups(ngroups,
                /* ... */
                d_group_type_alt.data,
                /* ... */
                d_group_tag_alt.data,
                /* ... */
                d_group_ranks_alt.data,
                /* ... further arguments elided in this excerpt ... */
                m_gpu_comm.m_mgpu_context);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }

        // resize alternate arrays to number of groups
        GPUVector<typename group_data::members_t>& alt_groups_array = m_gdata->getAltMembersArray();
        GPUVector<unsigned int>& alt_group_type_array = m_gdata->getAltTypesArray();
        GPUVector<unsigned int>& alt_group_tag_array = m_gdata->getAltTags();
        GPUVector<typename group_data::ranks_t>& alt_group_ranks_array = m_gdata->getAltRanksArray();

        assert(new_ngroups <= m_gdata->getN());
        alt_groups_array.resize(new_ngroups);
        alt_group_type_array.resize(new_ngroups);
        alt_group_tag_array.resize(new_ngroups);
        alt_group_ranks_array.resize(new_ngroups);

        // make alternate arrays current
        m_gdata->swapMemberArrays();
        m_gdata->swapTypeArrays();
        m_gdata->swapTagArrays();
        m_gdata->swapRankArrays();

        assert(m_gdata->getN() == new_ngroups);
        #if defined(ENABLE_MPI_CUDA) && 0
        // fill host send buffers on host
        typedef std::multimap<unsigned int, group_element_t> group_map_t;
        group_map_t group_send_map;

            {
            // access output buffers
            ArrayHandle<group_element_t> h_groups_out(m_groups_out, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_rank_mask_out(m_rank_mask_out, access_location::host, access_mode::read);

            for (unsigned int i = 0; i < n_out_groups; ++i)
                {
                group_element_t el = h_groups_out.data[i];
                typename group_data::ranks_t ranks = el.ranks;

                for (unsigned int j = 0; j < group_data::size; ++j)
                    {
                    unsigned int rank = ranks.idx[j];
                    // are we sending to this rank?
                    if (h_rank_mask_out.data[i] & (1 << j))
                        group_send_map.insert(std::make_pair(rank, el));
                    }
                }
            }

        // resize send buffers
        m_groups_sendbuf.resize(group_send_map.size());

            {
            // access send buffers
            ArrayHandle<group_element_t> h_groups_sendbuf(m_groups_sendbuf, access_location::host, access_mode::overwrite);

            // output send data sorted by rank
            unsigned int n = 0;
            for (typename group_map_t::iterator it = group_send_map.begin(); it != group_send_map.end(); ++it)
                {
                h_groups_sendbuf.data[n] = it->second;
                n++;
                }

            ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::overwrite);
            ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::overwrite);

            // Find start and end indices
            for (unsigned int i = 0; i < m_gpu_comm.m_n_unique_neigh; ++i)
                {
                typename group_map_t::iterator lower = group_send_map.lower_bound(h_unique_neighbors.data[i]);
                typename group_map_t::iterator upper = group_send_map.upper_bound(h_unique_neighbors.data[i]);
                h_begin.data[i] = std::distance(group_send_map.begin(), lower);
                h_end.data[i] = std::distance(group_send_map.begin(), upper);
                }
            }
        #endif
        /*
         * communicate groups (phase 2)
         */
        n_recv_tot = 0;
            {
            ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);

            unsigned int send_bytes = 0;
            unsigned int recv_bytes = 0;
            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");

            // compute send counts
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                n_send_groups[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];

            MPI_Request req[2*m_gpu_comm.m_n_unique_neigh];
            MPI_Status stat[2*m_gpu_comm.m_n_unique_neigh];

            unsigned int nreq = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                {
                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                MPI_Isend(&n_send_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, &req[nreq++]);
                MPI_Irecv(&n_recv_groups[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_gpu_comm.m_mpi_comm, &req[nreq++]);
                send_bytes += sizeof(unsigned int);
                recv_bytes += sizeof(unsigned int);
                } // end neighbor loop

            MPI_Waitall(nreq, req, stat);

            // sum up receive counts
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                {
                if (ineigh == 0)
                    offs[ineigh] = 0;
                else
                    offs[ineigh] = offs[ineigh-1] + n_recv_groups[ineigh-1];

                n_recv_tot += n_recv_groups[ineigh];
                }

            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf, 0, send_bytes+recv_bytes);
            }

        // Resize receive buffer
        m_groups_recvbuf.resize(n_recv_tot);
            {
            ArrayHandle<unsigned int> h_begin(m_gpu_comm.m_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_end(m_gpu_comm.m_end, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_gpu_comm.m_unique_neighbors, access_location::host, access_mode::read);

            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->push(m_exec_conf, "MPI send/recv");

            #if defined(ENABLE_MPI_CUDA) && 0
            /* ... device-buffer path elided in this excerpt ... */
            #else
            ArrayHandle<group_element_t> groups_sendbuf_handle(m_groups_sendbuf, access_location::host, access_mode::read);
            ArrayHandle<group_element_t> groups_recvbuf_handle(m_groups_recvbuf, access_location::host, access_mode::overwrite);
            #endif

            std::vector<MPI_Request> reqs;
            MPI_Request req;

            unsigned int send_bytes = 0;
            unsigned int recv_bytes = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_gpu_comm.m_n_unique_neigh; ineigh++)
                {
                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                // exchange particle data
                if (n_send_groups[ineigh])
                    {
                    MPI_Isend(groups_sendbuf_handle.data+h_begin.data[ineigh],
                        n_send_groups[ineigh]*sizeof(group_element_t),
                        /* ... datatype, destination, and tag elided in this excerpt ... */
                        m_gpu_comm.m_mpi_comm,
                        &req);
                    reqs.push_back(req);
                    }
                send_bytes += n_send_groups[ineigh]*sizeof(group_element_t);

                if (n_recv_groups[ineigh])
                    {
                    MPI_Irecv(groups_recvbuf_handle.data+offs[ineigh],
                        n_recv_groups[ineigh]*sizeof(group_element_t),
                        /* ... datatype, source, and tag elided in this excerpt ... */
                        m_gpu_comm.m_mpi_comm,
                        &req);
                    reqs.push_back(req);
                    }
                recv_bytes += n_recv_groups[ineigh]*sizeof(group_element_t);
                }

            std::vector<MPI_Status> stats(reqs.size());
            MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());

            if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf, 0, send_bytes+recv_bytes);
            }
        unsigned int n_recv_unique = 0;

        #if defined(ENABLE_MPI_CUDA) && 0
        /* ... device-side unpack path elided in this excerpt ... */
        #else
            {
            ArrayHandle<group_element_t> h_groups_recvbuf(m_groups_recvbuf, access_location::host, access_mode::read);

            // use a std::map, i.e. single-key, to filter out duplicate groups in input buffer
            typedef std::map<unsigned int, group_element_t> recv_map_t;
            recv_map_t recv_map;

            for (unsigned int recv_idx = 0; recv_idx < n_recv_tot; recv_idx++)
                {
                group_element_t el = h_groups_recvbuf.data[recv_idx];
                unsigned int tag = el.group_tag;
                recv_map.insert(std::make_pair(tag, el));
                }

            // resize input array of unique groups
            m_groups_in.resize(recv_map.size());

            // write out unique groups
            ArrayHandle<group_element_t> h_groups_in(m_groups_in, access_location::host, access_mode::overwrite);
            for (typename recv_map_t::iterator it = recv_map.begin(); it != recv_map.end(); ++it)
                h_groups_in.data[n_recv_unique++] = it->second;
            assert(n_recv_unique == recv_map.size());
            }
        #endif
        unsigned int old_ngroups = m_gdata->getN();
        new_ngroups = old_ngroups + n_recv_unique;

        // resize group arrays to accommodate additional groups (there can still be duplicates with local groups)
        GPUVector<typename group_data::members_t>& groups_array = m_gdata->getMembersArray();
        GPUVector<unsigned int>& group_type_array = m_gdata->getTypesArray();
        GPUVector<unsigned int>& group_tag_array = m_gdata->getTags();
        GPUVector<typename group_data::ranks_t>& group_ranks_array = m_gdata->getRanksArray();

        groups_array.resize(new_ngroups);
        group_type_array.resize(new_ngroups);
        group_tag_array.resize(new_ngroups);
        group_ranks_array.resize(new_ngroups);

            {
            ArrayHandle<group_element_t> d_groups_in(m_groups_in, access_location::device, access_mode::read);
            ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::readwrite);
            ArrayHandle<unsigned int> d_group_type(m_gdata->getTypesArray(), access_location::device, access_mode::readwrite);
            ArrayHandle<unsigned int> d_group_tag(m_gdata->getTags(), access_location::device, access_mode::readwrite);
            ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::readwrite);
            ArrayHandle<unsigned int> d_group_rtag(m_gdata->getRTags(), access_location::device, access_mode::readwrite);

            ScopedAllocation<unsigned int> d_tmp(m_exec_conf->getCachedAllocator(), n_recv_unique);

            // add new groups, updating groups that are already present locally
            gpu_add_groups(old_ngroups,
                /* ... further arguments elided in this excerpt ... */
                m_gpu_comm.m_mgpu_context);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }

        // resize arrays to final size
        groups_array.resize(new_ngroups);
        group_type_array.resize(new_ngroups);
        group_tag_array.resize(new_ngroups);
        group_ranks_array.resize(new_ngroups);

        // indicate that group table has changed
        m_gdata->setDirty();

        if (m_gpu_comm.m_prof) m_gpu_comm.m_prof->pop(m_exec_conf);
        }
    }
//! Mark ghost particles
template<class group_data>
void CommunicatorGPU::GroupCommunicatorGPU<group_data>::markGhostParticles(
    const GPUArray<unsigned int>& plans,
    unsigned int mask)
    {
    if (m_gdata->getNGlobal())
        {
        m_exec_conf->msg->notice(7) << "GroupCommunicator<" << m_gdata->getName() << ">: find incomplete groups" << std::endl;

        ArrayHandle<typename group_data::members_t> d_groups(m_gdata->getMembersArray(), access_location::device, access_mode::read);
        ArrayHandle<typename group_data::ranks_t> d_group_ranks(m_gdata->getRanksArray(), access_location::device, access_mode::read);
        ArrayHandle<unsigned int> d_rtag(m_gpu_comm.m_pdata->getRTags(), access_location::device, access_mode::read);
        ArrayHandle<Scalar4> d_pos(m_gpu_comm.m_pdata->getPositions(), access_location::device, access_mode::read);
        ArrayHandle<unsigned int> d_plan(plans, access_location::device, access_mode::readwrite);

        boost::shared_ptr<DomainDecomposition> decomposition = m_gpu_comm.m_pdata->getDomainDecomposition();
        ArrayHandle<unsigned int> d_cart_ranks_inv(decomposition->getInverseCartRanks(), access_location::device, access_mode::read);
        Index3D di = decomposition->getDomainIndexer();
        uint3 my_pos = decomposition->getGridPos();

        gpu_mark_bonded_ghosts<group_data::size>(
            /* ... further arguments elided in this excerpt ... */
            m_gpu_comm.m_pdata->getBox(),
            /* ... */
            d_cart_ranks_inv.data,
            m_exec_conf->getRank(),
            mask);

        if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
        }
    }
//! Transfer particles between neighboring domains
void CommunicatorGPU::migrateParticles()
    {
    m_exec_conf->msg->notice(7) << "CommunicatorGPU: migrate particles" << std::endl;

    // check if box is sufficiently large for communication
    Scalar3 L = m_pdata->getBox().getNearestPlaneDistance();
    const Index3D& di = m_decomposition->getDomainIndexer();
    if ((m_r_ghost >= L.x/Scalar(2.0) && di.getW() > 1) ||
        (m_r_ghost >= L.y/Scalar(2.0) && di.getH() > 1) ||
        (m_r_ghost >= L.z/Scalar(2.0) && di.getD() > 1))
        {
        m_exec_conf->msg->error() << "Simulation box too small for domain decomposition." << std::endl;
        throw std::runtime_error("Error during communication");
        }

    if (m_prof)
        m_prof->push(m_exec_conf, "comm_migrate");

    if (m_last_flags[comm_flag::tag])
        {
        // Reset reverse lookup tags of old ghost atoms
        ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::readwrite);
        ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);

        gpu_reset_rtags(m_pdata->getNGhosts(),
            d_tag.data + m_pdata->getN(),
            d_rtag.data);

        if (m_exec_conf->isCUDAErrorCheckingEnabled())
            CHECK_CUDA_ERROR();
        }

    // reset ghost particle number
    m_pdata->removeAllGhostParticles();
    // main communication loop
    for (unsigned int stage = 0; stage < m_num_stages; stage++)
        {
            {
            ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_comm_flag(m_pdata->getCommFlags(), access_location::device, access_mode::readwrite);

            assert(stage < m_comm_mask.size());

            // mark all particles which have left the box for sending (rtag=NOT_LOCAL)
            gpu_stage_particles(m_pdata->getN(),
                /* ... further arguments elided in this excerpt ... */
                m_comm_mask[stage]);

            if (m_exec_conf->isCUDAErrorCheckingEnabled())
                CHECK_CUDA_ERROR();
            }

        /*
         * Bonded group communication, determine groups to be sent
         */
        m_bond_comm.migrateGroups(m_bonds_changed);
        m_bonds_changed = false;

        m_angle_comm.migrateGroups(m_angles_changed);
        m_angles_changed = false;

        m_dihedral_comm.migrateGroups(m_dihedrals_changed);
        m_dihedrals_changed = false;

        m_improper_comm.migrateGroups(m_impropers_changed);
        m_impropers_changed = false;

        m_pdata->removeParticlesGPU(m_gpu_sendbuf, m_comm_flags);

        const Index3D& di = m_decomposition->getDomainIndexer();
        // determine local particles that are to be sent to neighboring processors and fill send buffer
        uint3 mypos = m_decomposition->getGridPos();

        /* We need some better heuristics to decide whether to take the GPU or CPU code path */
        #if defined(ENABLE_MPI_CUDA) && 0
        m_send_keys.resize(m_gpu_sendbuf.size());
            {
            ArrayHandle<pdata_element> d_gpu_sendbuf(m_gpu_sendbuf, access_location::device, access_mode::readwrite);
            ArrayHandle<unsigned int> d_send_keys(m_send_keys, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_begin(m_begin, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_end(m_end, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_unique_neighbors(m_unique_neighbors, access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_comm_flags(m_comm_flags, access_location::device, access_mode::read);

            ArrayHandle<unsigned int> d_cart_ranks(m_decomposition->getCartRanks(), access_location::device, access_mode::read);

            // get temporary buffers
            unsigned int nsend = m_gpu_sendbuf.size();
            const CachedAllocator& alloc = m_exec_conf->getCachedAllocator();
            ScopedAllocation<pdata_element> d_in_copy(alloc, nsend);
            ScopedAllocation<unsigned int> d_tmp(alloc, nsend);

            gpu_sort_migrating_particles(m_gpu_sendbuf.size(),
                /* ... further arguments elided in this excerpt ... */
                d_unique_neighbors.data,
                /* ... further arguments elided in this excerpt ... */);

            if (m_exec_conf->isCUDAErrorCheckingEnabled())
                CHECK_CUDA_ERROR();
            }
        #else
            {
            ArrayHandle<pdata_element> h_gpu_sendbuf(m_gpu_sendbuf, access_location::host, access_mode::readwrite);
            ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::overwrite);
            ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::overwrite);
            ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_comm_flags(m_comm_flags, access_location::host, access_mode::read);

            ArrayHandle<unsigned int> h_cart_ranks(m_pdata->getDomainDecomposition()->getCartRanks(), access_location::host,
                access_mode::read);

            typedef std::multimap<unsigned int, pdata_element> key_t;
            key_t keys;

            // generate destination-rank keys for all send particles
            get_migrate_key t(mypos, di, m_comm_mask[stage], h_cart_ranks.data);
            for (unsigned int i = 0; i < m_comm_flags.size(); ++i)
                keys.insert(std::pair<unsigned int, pdata_element>(t(h_comm_flags.data[i]), h_gpu_sendbuf.data[i]));

            // Find start and end indices
            for (unsigned int i = 0; i < m_n_unique_neigh; ++i)
                {
                key_t::iterator lower = keys.lower_bound(h_unique_neighbors.data[i]);
                key_t::iterator upper = keys.upper_bound(h_unique_neighbors.data[i]);
                h_begin.data[i] = std::distance(keys.begin(), lower);
                h_end.data[i] = std::distance(keys.begin(), upper);
                }

            // write out the rank-sorted send buffer
            unsigned int i = 0;
            for (key_t::iterator it = keys.begin(); it != keys.end(); ++it)
                h_gpu_sendbuf.data[i++] = it->second;
            }
        #endif
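
        // Editorial note (illustrative, not from the original source): the CPU fallback
        // above is effectively a bucket sort by destination rank. std::multimap keeps
        // entries ordered by key, so after inserting every outgoing particle keyed by the
        // rank returned from get_migrate_key, lower_bound/upper_bound per unique neighbor
        // yield contiguous [begin,end) index ranges into the rank-sorted send buffer that
        // the MPI exchange below can address directly.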
        unsigned int n_send_ptls[m_n_unique_neigh];
        unsigned int n_recv_ptls[m_n_unique_neigh];
        unsigned int offs[m_n_unique_neigh];
        unsigned int n_recv_tot = 0;

            {
            ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);

            unsigned int send_bytes = 0;
            unsigned int recv_bytes = 0;
            if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");

            // compute send counts
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                n_send_ptls[ineigh] = h_end.data[ineigh] - h_begin.data[ineigh];

            MPI_Request req[2*m_n_unique_neigh];
            MPI_Status stat[2*m_n_unique_neigh];

            unsigned int nreq = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                {
                if (m_stages[ineigh] != (int) stage)
                    {
                    // skip neighbor if not participating in this communication stage
                    n_send_ptls[ineigh] = 0;
                    n_recv_ptls[ineigh] = 0;
                    continue;
                    }

                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                MPI_Isend(&n_send_ptls[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, &req[nreq++]);
                MPI_Irecv(&n_recv_ptls[ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, &req[nreq++]);
                send_bytes += sizeof(unsigned int);
                recv_bytes += sizeof(unsigned int);
                } // end neighbor loop

            MPI_Waitall(nreq, req, stat);

            // sum up receive counts
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                {
                if (ineigh == 0)
                    offs[ineigh] = 0;
                else
                    offs[ineigh] = offs[ineigh-1] + n_recv_ptls[ineigh-1];

                n_recv_tot += n_recv_ptls[ineigh];
                }

            if (m_prof) m_prof->pop(m_exec_conf, 0, send_bytes+recv_bytes);
            }

        // Resize receive buffer
        m_gpu_recvbuf.resize(n_recv_tot);
            {
            ArrayHandle<unsigned int> h_begin(m_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_end(m_end, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);

            if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");

            #if defined(ENABLE_MPI_CUDA) && 0
            ArrayHandle<pdata_element> gpu_sendbuf_handle(m_gpu_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<pdata_element> gpu_recvbuf_handle(m_gpu_recvbuf, access_location::device, access_mode::overwrite);
            #else
            ArrayHandle<pdata_element> gpu_sendbuf_handle(m_gpu_sendbuf, access_location::host, access_mode::read);
            ArrayHandle<pdata_element> gpu_recvbuf_handle(m_gpu_recvbuf, access_location::host, access_mode::overwrite);
            #endif

            std::vector<MPI_Request> reqs;
            MPI_Request req;

            unsigned int send_bytes = 0;
            unsigned int recv_bytes = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                {
                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                // exchange particle data
                if (n_send_ptls[ineigh])
                    {
                    MPI_Isend(gpu_sendbuf_handle.data+h_begin.data[ineigh],
                        n_send_ptls[ineigh]*sizeof(pdata_element),
                        /* ... datatype, destination, tag, and communicator elided in this excerpt ... */
                        &req);
                    reqs.push_back(req);
                    }
                send_bytes += n_send_ptls[ineigh]*sizeof(pdata_element);

                if (n_recv_ptls[ineigh])
                    {
                    MPI_Irecv(gpu_recvbuf_handle.data+offs[ineigh],
                        n_recv_ptls[ineigh]*sizeof(pdata_element),
                        /* ... datatype, source, tag, and communicator elided in this excerpt ... */
                        &req);
                    reqs.push_back(req);
                    }
                recv_bytes += n_recv_ptls[ineigh]*sizeof(pdata_element);
                }

            std::vector<MPI_Status> stats(reqs.size());
            MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());

            if (m_prof) m_prof->pop(m_exec_conf, 0, send_bytes+recv_bytes);
            }
            {
            ArrayHandle<pdata_element> d_gpu_recvbuf(m_gpu_recvbuf, access_location::device, access_mode::readwrite);
            const BoxDim shifted_box = getShiftedBox();

            // Apply boundary conditions
            gpu_wrap_particles(n_recv_tot,
                d_gpu_recvbuf.data,
                shifted_box);

            if (m_exec_conf->isCUDAErrorCheckingEnabled())
                CHECK_CUDA_ERROR();
            }

        // remove particles that were sent and fill particle data with received particles
        m_pdata->addParticlesGPU(m_gpu_recvbuf);

        } // end communication stage

    if (m_prof) m_prof->pop(m_exec_conf);
    }
//! Build a ghost particle list, exchange ghost particle data with neighboring processors
void CommunicatorGPU::exchangeGhosts()
    {
    if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_exch");

    m_exec_conf->msg->notice(7) << "CommunicatorGPU: ghost exchange" << std::endl;

    // the ghost layer must be at least m_r_ghost wide along every lattice direction
    Scalar3 ghost_fraction = m_r_ghost/m_pdata->getBox().getNearestPlaneDistance();

    m_n_send_ghosts.resize(m_num_stages);
    m_n_recv_ghosts.resize(m_num_stages);

    for (unsigned int istage = 0; istage < m_num_stages; ++istage)
        {
        m_n_send_ghosts[istage].resize(m_n_unique_neigh);
        m_n_recv_ghosts[istage].resize(m_n_unique_neigh);
        }

    m_n_send_ghosts_tot.resize(m_num_stages);
    m_n_recv_ghosts_tot.resize(m_num_stages);
    m_ghost_offs.resize(m_num_stages);
    for (unsigned int istage = 0; istage < m_num_stages; ++istage)
        m_ghost_offs[istage].resize(m_n_unique_neigh);

    m_ghost_begin.resize(m_n_unique_neigh*m_num_stages);
    m_ghost_end.resize(m_n_unique_neigh*m_num_stages);

    m_idx_offs.resize(m_num_stages);

    // get requested ghost fields
    CommFlags flags = getFlags();
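
    // Editorial note (illustrative, not from the original source): ghost_fraction expresses
    // the ghost-layer width as a fraction of the local box along each lattice direction.
    // For example, with m_r_ghost = 2.5 and a nearest-plane distance of 25 along x,
    // ghost_fraction.x == 0.1, i.e. particles in the outermost 10% slab of the local box
    // on that side are candidates for being sent as ghosts to the corresponding neighbor.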
    // main communication loop
    for (unsigned int stage = 0; stage < m_num_stages; stage++)
        {
        // make room for plans
        m_ghost_plan.resize(m_pdata->getN()+m_pdata->getNGhosts());

            {
            // compute plans for all particles, including already received ghosts
            ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::overwrite);

            gpu_make_ghost_exchange_plan(d_ghost_plan.data,
                m_pdata->getN()+m_pdata->getNGhosts(),
                /* ... further arguments elided in this excerpt ... */
                m_comm_mask[stage]);

            if (m_exec_conf->isCUDAErrorCheckingEnabled())
                CHECK_CUDA_ERROR();
            }

        // mark particles that are members of incomplete bonded groups as ghost
        m_bond_comm.markGhostParticles(m_ghost_plan, m_comm_mask[stage]);
        m_angle_comm.markGhostParticles(m_ghost_plan, m_comm_mask[stage]);
        m_dihedral_comm.markGhostParticles(m_ghost_plan, m_comm_mask[stage]);
        m_improper_comm.markGhostParticles(m_ghost_plan, m_comm_mask[stage]);

        // resize temporary number of neighbors array
        m_neigh_counts.resize(m_pdata->getN()+m_pdata->getNGhosts());
            {
            ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_adj_mask(m_adj_mask, access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_neigh_counts(m_neigh_counts, access_location::device, access_mode::overwrite);

            // count number of neighbors (total and per particle) the ghost ptls are sent to
            m_n_send_ghosts_tot[stage] =
                gpu_exchange_ghosts_count_neighbors(
                    m_pdata->getN()+m_pdata->getNGhosts(),
                    /* ... further arguments elided in this excerpt ... */
                    d_neigh_counts.data,
                    /* ... further arguments elided in this excerpt ... */);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }

        // compute offset into ghost idx list
        m_idx_offs[stage] = 0;
        for (unsigned int i = 0; i < stage; ++i)
            m_idx_offs[stage] += m_n_send_ghosts_tot[i];

        // compute maximum send buf size
        unsigned int n_max = 0;
        for (unsigned int istage = 0; istage <= stage; ++istage)
            if (m_n_send_ghosts_tot[istage] > n_max) n_max = m_n_send_ghosts_tot[istage];

        // make room for ghost indices and neighbor ranks
        m_ghost_idx_adj.resize(m_idx_offs[stage] + m_n_send_ghosts_tot[stage]);
        m_ghost_neigh.resize(m_idx_offs[stage] + m_n_send_ghosts_tot[stage]);

        if (flags[comm_flag::tag]) m_tag_ghost_sendbuf.resize(n_max);
        if (flags[comm_flag::position]) m_pos_ghost_sendbuf.resize(n_max);
        if (flags[comm_flag::velocity]) m_vel_ghost_sendbuf.resize(n_max);
        if (flags[comm_flag::charge]) m_charge_ghost_sendbuf.resize(n_max);
        if (flags[comm_flag::diameter]) m_diameter_ghost_sendbuf.resize(n_max);
        if (flags[comm_flag::orientation]) m_orientation_ghost_sendbuf.resize(n_max);
            {
            ArrayHandle<unsigned int> d_ghost_plan(m_ghost_plan, access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_adj_mask(m_adj_mask, access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_neigh_counts(m_neigh_counts, access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_unique_neighbors(m_unique_neighbors, access_location::device, access_mode::read);

            ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);

            ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_ghost_neigh(m_ghost_neigh, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_ghost_begin(m_ghost_begin, access_location::device, access_mode::overwrite);
            ArrayHandle<unsigned int> d_ghost_end(m_ghost_end, access_location::device, access_mode::overwrite);

            //! Fill ghost send list and compute start and end indices per unique neighbor in list
            gpu_exchange_ghosts_make_indices(
                m_pdata->getN() + m_pdata->getNGhosts(),
                /* ... further arguments elided in this excerpt ... */
                d_unique_neighbors.data,
                d_neigh_counts.data,
                d_ghost_idx_adj.data + m_idx_offs[stage],
                d_ghost_neigh.data + m_idx_offs[stage],
                d_ghost_begin.data + stage*m_n_unique_neigh,
                d_ghost_end.data + stage*m_n_unique_neigh,
                /* ... */
                m_n_send_ghosts_tot[stage],
                /* ... further arguments elided in this excerpt ... */);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }
            {
            // access particle data
            ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::read);
            ArrayHandle<Scalar> d_charge(m_pdata->getCharges(), access_location::device, access_mode::read);
            ArrayHandle<Scalar> d_diameter(m_pdata->getDiameters(), access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::read);

            // access ghost send indices
            ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::read);

            // access output buffers
            ArrayHandle<unsigned int> d_tag_ghost_sendbuf(m_tag_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar4> d_pos_ghost_sendbuf(m_pos_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar4> d_vel_ghost_sendbuf(m_vel_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar> d_charge_ghost_sendbuf(m_charge_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar> d_diameter_ghost_sendbuf(m_diameter_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar4> d_orientation_ghost_sendbuf(m_orientation_ghost_sendbuf, access_location::device, access_mode::overwrite);

            const BoxDim& global_box = m_pdata->getGlobalBox();
            const Index3D& di = m_pdata->getDomainDecomposition()->getDomainIndexer();
            uint3 my_pos = m_pdata->getDomainDecomposition()->getGridPos();
            // Pack ghosts into send buffers
            gpu_exchange_ghosts_pack(
                m_n_send_ghosts_tot[stage],
                d_ghost_idx_adj.data + m_idx_offs[stage],
                /* ... */
                d_tag_ghost_sendbuf.data,
                d_pos_ghost_sendbuf.data,
                d_vel_ghost_sendbuf.data,
                d_charge_ghost_sendbuf.data,
                d_diameter_ghost_sendbuf.data,
                d_orientation_ghost_sendbuf.data,
                flags[comm_flag::tag],
                flags[comm_flag::position],
                flags[comm_flag::velocity],
                flags[comm_flag::charge],
                flags[comm_flag::diameter],
                flags[comm_flag::orientation],
                /* ... */);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }
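        // The pack kernel gathers, for every entry in this stage's slice of d_ghost_idx_adj, the
        // particle fields selected by the communication flags into the contiguous send buffers;
        // fields whose flag is unset are presumably skipped by the kernel.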
        /*
         * Ghost particle communication
         */

        m_n_recv_ghosts_tot[stage] = 0;

        unsigned int send_bytes = 0;
        unsigned int recv_bytes = 0;

            {
            ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_ghost_end(m_ghost_end, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);

            if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");

            // compute send counts
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                m_n_send_ghosts[stage][ineigh] = h_ghost_end.data[ineigh+stage*m_n_unique_neigh]
                    - h_ghost_begin.data[ineigh+stage*m_n_unique_neigh];
            MPI_Request req[2*m_n_unique_neigh];
            MPI_Status stat[2*m_n_unique_neigh];

            unsigned int nreq = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                {
                if (m_stages[ineigh] != (int) stage)
                    {
                    // skip neighbor if not participating in this communication stage
                    m_n_send_ghosts[stage][ineigh] = 0;
                    m_n_recv_ghosts[stage][ineigh] = 0;
                    continue;
                    }

                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                MPI_Isend(&m_n_send_ghosts[stage][ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, &req[nreq++]);
                MPI_Irecv(&m_n_recv_ghosts[stage][ineigh], 1, MPI_UNSIGNED, neighbor, 0, m_mpi_comm, &req[nreq++]);

                send_bytes += sizeof(unsigned int);
                recv_bytes += sizeof(unsigned int);
                }

            MPI_Waitall(nreq, req, stat);
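            // The ghost counts are exchanged before any payload so that each rank can size its
            // receive offsets and buffers below before posting the actual ghost-data messages.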
            // total up receive counts
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                {
                if (ineigh == 0)
                    m_ghost_offs[stage][ineigh] = 0;
                else
                    m_ghost_offs[stage][ineigh] = m_ghost_offs[stage][ineigh-1] + m_n_recv_ghosts[stage][ineigh-1];

                m_n_recv_ghosts_tot[stage] += m_n_recv_ghosts[stage][ineigh];
                }

            if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
            }
        // compute maximum number of received ghosts
        for (unsigned int istage = 0; istage <= stage; ++istage)
            if (m_n_recv_ghosts_tot[istage] > n_max) n_max = m_n_recv_ghosts_tot[istage];

        if (flags[comm_flag::tag]) m_tag_ghost_recvbuf.resize(n_max);
        if (flags[comm_flag::position]) m_pos_ghost_recvbuf.resize(n_max);
        if (flags[comm_flag::velocity]) m_vel_ghost_recvbuf.resize(n_max);
        if (flags[comm_flag::charge]) m_charge_ghost_recvbuf.resize(n_max);
        if (flags[comm_flag::diameter]) m_diameter_ghost_recvbuf.resize(n_max);
        if (flags[comm_flag::orientation]) m_orientation_ghost_recvbuf.resize(n_max);

        // first ghost ptl index
        unsigned int first_idx = m_pdata->getN()+m_pdata->getNGhosts();

        // update number of ghost particles
        m_pdata->addGhostParticles(m_n_recv_ghosts_tot[stage]);

        unsigned int offs = 0;
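        // From here on there are two build variants: with ENABLE_MPI_CUDA (CUDA-aware MPI) the
        // ghost data is received directly into the device-resident particle data arrays starting
        // at first_idx, so no separate unpack is needed; otherwise the data is staged through
        // host receive buffers and copied onto the device by gpu_exchange_ghosts_copy_buf below.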
            {
            #ifdef ENABLE_MPI_CUDA
            // recv buffers, write directly into particle data arrays
            ArrayHandle<unsigned int> tag_ghost_recvbuf_handle(m_pdata->getTags(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar> charge_ghost_recvbuf_handle(m_pdata->getCharges(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar> diameter_ghost_recvbuf_handle(m_pdata->getDiameters(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);

            ArrayHandle<unsigned int> tag_ghost_sendbuf_handle(m_tag_ghost_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar> charge_ghost_sendbuf_handle(m_charge_ghost_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar> diameter_ghost_sendbuf_handle(m_diameter_ghost_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::device, access_mode::read);

            // MPI library may use non-zero stream
            cudaDeviceSynchronize();
            #else
            ArrayHandleAsync<unsigned int> tag_ghost_recvbuf_handle(m_tag_ghost_recvbuf, access_location::host, access_mode::overwrite);
            ArrayHandleAsync<Scalar4> pos_ghost_recvbuf_handle(m_pos_ghost_recvbuf, access_location::host, access_mode::overwrite);
            ArrayHandleAsync<Scalar4> vel_ghost_recvbuf_handle(m_vel_ghost_recvbuf, access_location::host, access_mode::overwrite);
            ArrayHandleAsync<Scalar> charge_ghost_recvbuf_handle(m_charge_ghost_recvbuf, access_location::host, access_mode::overwrite);
            ArrayHandleAsync<Scalar> diameter_ghost_recvbuf_handle(m_diameter_ghost_recvbuf, access_location::host, access_mode::overwrite);
            ArrayHandleAsync<Scalar4> orientation_ghost_recvbuf_handle(m_orientation_ghost_recvbuf, access_location::host, access_mode::overwrite);

            ArrayHandleAsync<unsigned int> tag_ghost_sendbuf_handle(m_tag_ghost_sendbuf, access_location::host, access_mode::read);
            ArrayHandleAsync<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::host, access_mode::read);
            ArrayHandleAsync<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::host, access_mode::read);
            ArrayHandleAsync<Scalar> charge_ghost_sendbuf_handle(m_charge_ghost_sendbuf, access_location::host, access_mode::read);
            ArrayHandleAsync<Scalar> diameter_ghost_sendbuf_handle(m_diameter_ghost_sendbuf, access_location::host, access_mode::read);
            ArrayHandleAsync<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::host, access_mode::read);

            // lump together into one synchronization call
            cudaDeviceSynchronize();
            #endif
            ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_ghost_end(m_ghost_end, access_location::host, access_mode::read);

            if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");

            std::vector<MPI_Request> reqs;
            MPI_Request req;
            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                {
                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];
                if (flags[comm_flag::tag])
                    {
                    // when sending/receiving 0 ptls, the send/recv buffer may be uninitialized
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(tag_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh+stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(unsigned int),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(unsigned int);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(tag_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(unsigned int),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(unsigned int);
                    }
                if (flags[comm_flag::position])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(pos_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(pos_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
                    }
                if (flags[comm_flag::velocity])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(vel_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(vel_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
                    }
                if (flags[comm_flag::charge])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(charge_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(charge_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar);
                    }
                if (flags[comm_flag::diameter])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(diameter_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(diameter_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar);
                    }
                if (flags[comm_flag::orientation])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(orientation_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(orientation_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
                    }
                } // end neighbor loop
            std::vector<MPI_Status> stats(reqs.size());
            MPI_Waitall(reqs.size(), &reqs.front(), &stats.front());

            if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
            } // end ArrayHandle scope

        #ifdef ENABLE_MPI_CUDA
        // MPI library may use non-zero stream
        cudaDeviceSynchronize();
        #else
        // only unpack in non-CUDA MPI builds
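        // In the CUDA-aware MPI build the receive handles above alias the particle data arrays
        // directly, so the received ghosts are already in place; only the host-staged path needs
        // the explicit copy into the particle data that follows.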
            {
            // access receive buffers
            ArrayHandle<unsigned int> d_tag_ghost_recvbuf(m_tag_ghost_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar> d_charge_ghost_recvbuf(m_charge_ghost_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar> d_diameter_ghost_recvbuf(m_diameter_ghost_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);

            // access particle data
            ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar> d_charge(m_pdata->getCharges(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar> d_diameter(m_pdata->getDiameters(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);

            // copy recv buf into particle data
            gpu_exchange_ghosts_copy_buf(
                m_n_recv_ghosts_tot[stage],
                d_tag_ghost_recvbuf.data,
                d_pos_ghost_recvbuf.data,
                d_vel_ghost_recvbuf.data,
                d_charge_ghost_recvbuf.data,
                d_diameter_ghost_recvbuf.data,
                d_orientation_ghost_recvbuf.data,
                d_tag.data + first_idx,
                d_pos.data + first_idx,
                d_vel.data + first_idx,
                d_charge.data + first_idx,
                d_diameter.data + first_idx,
                d_orientation.data + first_idx,
                flags[comm_flag::tag],
                flags[comm_flag::position],
                flags[comm_flag::velocity],
                flags[comm_flag::charge],
                flags[comm_flag::diameter],
                flags[comm_flag::orientation]);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }
        #endif
        if (flags[comm_flag::tag])
            {
            ArrayHandle<unsigned int> d_tag(m_pdata->getTags(), access_location::device, access_mode::read);
            ArrayHandle<unsigned int> d_rtag(m_pdata->getRTags(), access_location::device, access_mode::readwrite);

            // update reverse-lookup table
            gpu_compute_ghost_rtags(first_idx,
                m_n_recv_ghosts_tot[stage],
                d_tag.data + first_idx,
                /* ... */);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }
        } // end main communication loop

    m_last_flags = flags;

    if (m_prof) m_prof->pop(m_exec_conf);

    // we have updated ghost particles, so notify subscribers about this
    m_pdata->notifyGhostParticleNumberChange();
    }
//! Perform ghosts update
void CommunicatorGPU::beginUpdateGhosts(unsigned int timestep)
    {
    m_exec_conf->msg->notice(7) << "CommunicatorGPU: ghost update" << std::endl;

    if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_update");

    CommFlags flags = getFlags();

    // main communication loop
    for (unsigned int stage = 0; stage < m_num_stages; ++stage)
        {
        #ifndef ENABLE_MPI_CUDA
        bool mapped_ghost_recv = m_tuner_ghost_recv->getParam();
        bool mapped_ghost_send = m_tuner_ghost_send->getParam();

        // switch between mapped and non-mapped
        if (mapped_ghost_recv != m_mapped_ghost_recv)
            {
            m_pos_ghost_recvbuf_alt.resize(m_pos_ghost_recvbuf.size());
            m_pos_ghost_recvbuf.swap(m_pos_ghost_recvbuf_alt);

            m_vel_ghost_recvbuf_alt.resize(m_vel_ghost_recvbuf.size());
            m_vel_ghost_recvbuf.swap(m_vel_ghost_recvbuf_alt);

            m_orientation_ghost_recvbuf_alt.resize(m_orientation_ghost_recvbuf.size());
            m_orientation_ghost_recvbuf.swap(m_orientation_ghost_recvbuf_alt);

            m_tag_ghost_recvbuf_alt.resize(m_tag_ghost_recvbuf.size());
            m_tag_ghost_recvbuf.swap(m_tag_ghost_recvbuf_alt);

            m_charge_ghost_recvbuf_alt.resize(m_charge_ghost_recvbuf.size());
            m_charge_ghost_recvbuf.swap(m_charge_ghost_recvbuf_alt);

            m_diameter_ghost_recvbuf_alt.resize(m_diameter_ghost_recvbuf.size());
            m_diameter_ghost_recvbuf.swap(m_diameter_ghost_recvbuf_alt);

            m_mapped_ghost_recv = mapped_ghost_recv;
            }

        if (mapped_ghost_send != m_mapped_ghost_send)
            {
            m_pos_ghost_sendbuf_alt.resize(m_pos_ghost_sendbuf.size());
            m_pos_ghost_sendbuf.swap(m_pos_ghost_sendbuf_alt);

            m_vel_ghost_sendbuf_alt.resize(m_vel_ghost_sendbuf.size());
            m_vel_ghost_sendbuf.swap(m_vel_ghost_sendbuf_alt);

            m_orientation_ghost_sendbuf_alt.resize(m_orientation_ghost_sendbuf.size());
            m_orientation_ghost_sendbuf.swap(m_orientation_ghost_sendbuf_alt);

            m_tag_ghost_sendbuf_alt.resize(m_tag_ghost_sendbuf.size());
            m_tag_ghost_sendbuf.swap(m_tag_ghost_sendbuf_alt);

            m_charge_ghost_sendbuf_alt.resize(m_charge_ghost_sendbuf.size());
            m_charge_ghost_sendbuf.swap(m_charge_ghost_sendbuf_alt);

            m_diameter_ghost_sendbuf_alt.resize(m_diameter_ghost_sendbuf.size());
            m_diameter_ghost_sendbuf.swap(m_diameter_ghost_sendbuf_alt);

            m_mapped_ghost_send = mapped_ghost_send;
            }
        #endif
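        // "Mapped" here appears to refer to host-mapped (pinned) staging buffers; the autotuners
        // m_tuner_ghost_recv/m_tuner_ghost_send select between the mapped and non-mapped variant,
        // and when the choice flips the active buffer is swapped with its _alt counterpart so
        // both allocations persist across timesteps.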
        if (m_prof) m_prof->push(m_exec_conf,"pack");

            {
            // access particle data
            ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::read);

            // access ghost send indices
            ArrayHandle<uint2> d_ghost_idx_adj(m_ghost_idx_adj, access_location::device, access_mode::read);

            #ifndef ENABLE_MPI_CUDA
            // begin measurement of device-host transfer time
            m_tuner_ghost_send->begin();
            #endif

            // access output buffers
            ArrayHandle<unsigned int> d_tag_ghost_sendbuf(m_tag_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar4> d_pos_ghost_sendbuf(m_pos_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar4> d_vel_ghost_sendbuf(m_vel_ghost_sendbuf, access_location::device, access_mode::overwrite);
            ArrayHandle<Scalar4> d_orientation_ghost_sendbuf(m_orientation_ghost_sendbuf, access_location::device, access_mode::overwrite);

            const BoxDim& global_box = m_pdata->getGlobalBox();
            const Index3D& di = m_pdata->getDomainDecomposition()->getDomainIndexer();
            uint3 my_pos = m_pdata->getDomainDecomposition()->getGridPos();
            // Pack ghosts into send buffers
            gpu_exchange_ghosts_pack(
                m_n_send_ghosts_tot[stage],
                d_ghost_idx_adj.data + m_idx_offs[stage],
                /* ... */
                d_pos_ghost_sendbuf.data,
                d_vel_ghost_sendbuf.data,
                /* ... */
                d_orientation_ghost_sendbuf.data,
                /* ... */
                flags[comm_flag::position],
                flags[comm_flag::velocity],
                /* ... */
                flags[comm_flag::orientation],
                /* ... */);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
            }

        if (m_prof) m_prof->pop(m_exec_conf);
        /*
         * Ghost particle communication
         */

        // first ghost ptl index
        unsigned int first_idx = m_pdata->getN();

        // total up ghosts received thus far
        for (unsigned int istage = 0; istage < stage; ++istage)
            first_idx += m_n_recv_ghosts_tot[istage];

        unsigned int offs = 0;
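        // Unlike exchangeGhosts(), the ghosts already exist here: first_idx points just past the
        // locally owned particles plus all ghosts received in earlier stages, so this stage's
        // refreshed data overwrites the ghost entries created for it during the exchange.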
        // access particle data
            {
            #ifdef ENABLE_MPI_CUDA
            // recv buffers, directly write into particle data arrays
            ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);

            ArrayHandle<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::device, access_mode::read);

            ArrayHandle<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
            ArrayHandle<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);

            // MPI library may use non-zero stream
            cudaDeviceSynchronize();
            #else
            ArrayHandle<Scalar4> pos_ghost_recvbuf_handle(m_pos_ghost_recvbuf, access_location::host, access_mode::overwrite);
            ArrayHandle<Scalar4> vel_ghost_recvbuf_handle(m_vel_ghost_recvbuf, access_location::host, access_mode::overwrite);
            ArrayHandle<Scalar4> orientation_ghost_recvbuf_handle(m_orientation_ghost_recvbuf, access_location::host, access_mode::overwrite);

            ArrayHandleAsync<Scalar4> pos_ghost_sendbuf_handle(m_pos_ghost_sendbuf, access_location::host, access_mode::read);
            ArrayHandleAsync<Scalar4> vel_ghost_sendbuf_handle(m_vel_ghost_sendbuf, access_location::host, access_mode::read);
            ArrayHandleAsync<Scalar4> orientation_ghost_sendbuf_handle(m_orientation_ghost_sendbuf, access_location::host, access_mode::read);

            // end measurement of device-host transfer time
            m_tuner_ghost_send->end();

            ArrayHandleAsync<unsigned int> h_unique_neighbors(m_unique_neighbors, access_location::host, access_mode::read);
            ArrayHandleAsync<unsigned int> h_ghost_begin(m_ghost_begin, access_location::host, access_mode::read);

            // lump together into one synchronization call
            cudaEventRecord(m_event);
            cudaEventSynchronize(m_event);
            #endif
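            // The ArrayHandleAsync accesses above presumably queue their device-to-host copies
            // asynchronously; recording and then synchronizing on a single event waits for all of
            // them at once instead of blocking once per array.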
            // access send buffers
            if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");

            /* ... */
            MPI_Request req;

            unsigned int send_bytes = 0;
            unsigned int recv_bytes = 0;

            // loop over neighbors
            for (unsigned int ineigh = 0; ineigh < m_n_unique_neigh; ineigh++)
                {
                // rank of neighbor processor
                unsigned int neighbor = h_unique_neighbors.data[ineigh];

                if (flags[comm_flag::position])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(pos_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        m_reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(pos_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        m_reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
                    }
                if (flags[comm_flag::velocity])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(vel_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        m_reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(vel_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        m_reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
                    }
                if (flags[comm_flag::orientation])
                    {
                    if (m_n_send_ghosts[stage][ineigh])
                        {
                        MPI_Isend(orientation_ghost_sendbuf_handle.data+h_ghost_begin.data[ineigh + stage*m_n_unique_neigh],
                            m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        m_reqs.push_back(req);
                        }
                    send_bytes += m_n_send_ghosts[stage][ineigh]*sizeof(Scalar4);

                    if (m_n_recv_ghosts[stage][ineigh])
                        {
                        MPI_Irecv(orientation_ghost_recvbuf_handle.data + m_ghost_offs[stage][ineigh] + offs,
                            m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4),
                            /* ... */);
                        m_reqs.push_back(req);
                        }
                    recv_bytes += m_n_recv_ghosts[stage][ineigh]*sizeof(Scalar4);
                    }
                } // end neighbor loop
            if (m_num_stages == 1)
                {
                // use non-blocking MPI
                m_comm_pending = true;
                }
            else
                {
                // complete communication
                std::vector<MPI_Status> stats(m_reqs.size());
                MPI_Waitall(m_reqs.size(), &m_reqs.front(), &stats.front());
                }

            if (m_prof) m_prof->pop(m_exec_conf,0,send_bytes+recv_bytes);
            } // end ArrayHandle scope
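        // With a single communication stage the requests are left pending here and completed in
        // finishUpdateGhosts(), so the caller can overlap the MPI transfer with other work between
        // the two calls; multi-stage runs wait immediately because later stages depend on the
        // ghosts received in earlier ones.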
        if (!m_comm_pending)
            {
            #ifndef ENABLE_MPI_CUDA
            // only unpack in non-CUDA MPI builds
            if (m_prof) m_prof->push(m_exec_conf,"unpack");

            // begin measurement of host-device transfer time for received ghosts
            m_tuner_ghost_recv->begin();

            // access receive buffers
            ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
            ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
            // access particle data
            ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
            ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);

            // copy recv buf into particle data
            gpu_exchange_ghosts_copy_buf(
                m_n_recv_ghosts_tot[stage],
                /* ... */
                d_pos_ghost_recvbuf.data,
                d_vel_ghost_recvbuf.data,
                /* ... */
                d_orientation_ghost_recvbuf.data,
                /* ... */
                d_pos.data + first_idx,
                d_vel.data + first_idx,
                /* ... */
                d_orientation.data + first_idx,
                /* ... */
                flags[comm_flag::position],
                flags[comm_flag::velocity],
                /* ... */
                flags[comm_flag::orientation]);

            if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();

            m_tuner_ghost_recv->end();

            if (m_prof) m_prof->pop(m_exec_conf);
            #endif
            }
        } // end main communication loop

    if (m_prof) m_prof->pop(m_exec_conf);
    }
/*! Finish ghost update
 *
 * \param timestep The time step
 */
void CommunicatorGPU::finishUpdateGhosts(unsigned int timestep)
    {
    m_comm_pending = false;

    if (m_prof) m_prof->push(m_exec_conf, "comm_ghost_update");

    // complete communication
    if (m_prof) m_prof->push(m_exec_conf, "MPI send/recv");
    std::vector<MPI_Status> stats(m_reqs.size());
    MPI_Waitall(m_reqs.size(), &m_reqs.front(), &stats.front());
    if (m_prof) m_prof->pop(m_exec_conf);

    #ifdef ENABLE_MPI_CUDA
    // MPI library may use non-zero stream
    cudaDeviceSynchronize();
    #else
    // only unpack in non-CUDA-MPI builds
    assert(m_num_stages == 1);
    unsigned int stage = 0;
    unsigned int first_idx = m_pdata->getN();
    CommFlags flags = m_last_flags;

    if (m_prof) m_prof->push(m_exec_conf,"unpack");

    // begin measurement of host-device transfer time for received ghosts
    m_tuner_ghost_recv->begin();

    // access receive buffers
    ArrayHandle<Scalar4> d_pos_ghost_recvbuf(m_pos_ghost_recvbuf, access_location::device, access_mode::read);
    ArrayHandle<Scalar4> d_vel_ghost_recvbuf(m_vel_ghost_recvbuf, access_location::device, access_mode::read);
    ArrayHandle<Scalar4> d_orientation_ghost_recvbuf(m_orientation_ghost_recvbuf, access_location::device, access_mode::read);
    // access particle data
    ArrayHandle<Scalar4> d_pos(m_pdata->getPositions(), access_location::device, access_mode::readwrite);
    ArrayHandle<Scalar4> d_vel(m_pdata->getVelocities(), access_location::device, access_mode::readwrite);
    ArrayHandle<Scalar4> d_orientation(m_pdata->getOrientationArray(), access_location::device, access_mode::readwrite);

    // copy recv buf into particle data
    gpu_exchange_ghosts_copy_buf(
        m_n_recv_ghosts_tot[stage],
        /* ... */
        d_pos_ghost_recvbuf.data,
        d_vel_ghost_recvbuf.data,
        /* ... */
        d_orientation_ghost_recvbuf.data,
        /* ... */
        d_pos.data + first_idx,
        d_vel.data + first_idx,
        /* ... */
        d_orientation.data + first_idx,
        /* ... */
        flags[comm_flag::position],
        flags[comm_flag::velocity],
        /* ... */
        flags[comm_flag::orientation]);

    if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();

    m_tuner_ghost_recv->end();

    if (m_prof) m_prof->pop(m_exec_conf);
    #endif

    if (m_prof) m_prof->pop(m_exec_conf);
    }
//! Export CommunicatorGPU class to python
void export_CommunicatorGPU()
    {
    boost::python::class_<CommunicatorGPU, boost::python::bases<Communicator>, boost::shared_ptr<CommunicatorGPU>, boost::noncopyable>("CommunicatorGPU",
        boost::python::init<boost::shared_ptr<SystemDefinition>,
            boost::shared_ptr<DomainDecomposition> >())
        .def("setMaxStages",&CommunicatorGPU::setMaxStages)
        /* ... */;
    }

#endif // ENABLE_CUDA
#endif // ENABLE_MPI