2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
28 #include "TOMS_transpose.h"
30 /**************************************************************************/
/* Return the block size used to divide n elements among n_pes
   processes: the ceiling of n / n_pes, so that every process but
   possibly the last holds a full block. */
static int transpose_mpi_get_block_size(int n, int n_pes)
{
     return ((n + n_pes - 1) / n_pes);
}
/* Compute the portion of a dimension of size n owned by process
   my_pe out of n_pes processes, using block distribution.

   On output, *local_n is the number of elements owned by my_pe and
   *local_start is the index of its first element.  The last active
   process gets the (possibly smaller) remainder block; processes
   beyond the last active one get zero elements.

   Note that n_pes is deliberately recomputed below as the number of
   processes that actually hold data (ceil(n / block_size)), which can
   be smaller than the communicator size. */
void transpose_mpi_get_local_size(int n, int my_pe, int n_pes,
				  int *local_n, int *local_start)
{
     int block_size;

     block_size = transpose_mpi_get_block_size(n, n_pes);
     n_pes = (n + block_size - 1) / block_size;  /* # of PEs that hold data */

     /* NOTE(review): this guard branch was reconstructed from a
	mangled source; processes past the last data-holding PE
	receive an empty range. */
     if (my_pe >= n_pes) {
	  *local_n = 0;
	  *local_start = 0;
     }
     else {
	  *local_start = my_pe * block_size;
	  if (my_pe == n_pes - 1)
	       *local_n = n - *local_start;  /* remainder block */
	  else
	       *local_n = block_size;
     }
}
/* Maximum of two values.  Beware: both arguments are evaluated twice;
   use only with side-effect-free expressions. */
#define MAX2(a,b) ((a) > (b) ? (a) : (b))

/* Return the minimum number of elements (not bytes) of local storage
   that process my_pe needs in order to transpose an nx x ny matrix
   distributed over n_pes processes: enough for both the pre-transpose
   (local_nx rows of ny) and post-transpose (local_ny rows of nx)
   layouts, and never less than 1 so that a valid buffer can always
   be allocated. */
int transpose_mpi_get_local_storage_size(int nx, int ny,
					 int my_pe, int n_pes)
{
     int local_nx, local_ny, local_x_start, local_y_start;

     transpose_mpi_get_local_size(nx, my_pe, n_pes, &local_nx, &local_x_start);
     transpose_mpi_get_local_size(ny, my_pe, n_pes, &local_ny, &local_y_start);

     return MAX2(1, MAX2(local_nx * ny, nx * local_ny));
}
/* Greatest common divisor of a and b via Euclid's algorithm.
   NOTE(review): the body was missing from the mangled source and has
   been reconstructed; assumes a > 0 and b > 0 (its only caller passes
   positive dimensions and block sizes). */
static int gcd(int a, int b)
{
     int r;
     do {
	  r = a % b;
	  a = b;
	  b = r;
     } while (r != 0);
     return a;
}
82 /**************************************************************************/
84 transpose_mpi_plan
transpose_mpi_create_plan(int nx
, int ny
,
85 MPI_Comm transpose_comm
)
89 int x_block_size
, local_nx
, local_x_start
;
90 int y_block_size
, local_ny
, local_y_start
;
91 transpose_mpi_exchange
*exchange
= 0;
92 int step
, send_block_size
= 0, recv_block_size
= 0, num_steps
= 0;
93 int **sched
, sched_npes
, sched_sortpe
, sched_sort_ascending
= 0;
94 int *perm_block_dest
= NULL
;
95 int num_perm_blocks
= 0, perm_block_size
= 0, perm_block
;
98 int *send_block_sizes
= 0, *send_block_offsets
= 0;
99 int *recv_block_sizes
= 0, *recv_block_offsets
= 0;
102 /* create a new "clone" communicator so that transpose
103 communications do not interfere with caller communications. */
104 MPI_Comm_dup(transpose_comm
, &comm
);
106 MPI_Comm_rank(comm
,&my_pe
);
107 MPI_Comm_size(comm
,&n_pes
);
109 /* work space for in-place transpose routine: */
110 move_size
= (nx
+ ny
) / 2;
111 move
= (char *) fftw_malloc(sizeof(char) * move_size
);
113 x_block_size
= transpose_mpi_get_block_size(nx
,n_pes
);
114 transpose_mpi_get_local_size(nx
,my_pe
,n_pes
,&local_nx
,&local_x_start
);
115 y_block_size
= transpose_mpi_get_block_size(ny
,n_pes
);
116 transpose_mpi_get_local_size(ny
,my_pe
,n_pes
,&local_ny
,&local_y_start
);
118 /* allocate pre-computed post-transpose permutation: */
119 perm_block_size
= gcd(nx
,x_block_size
);
120 num_perm_blocks
= (nx
/ perm_block_size
) * local_ny
;
121 perm_block_dest
= (int *) fftw_malloc(sizeof(int) * num_perm_blocks
);
122 for (perm_block
= 0; perm_block
< num_perm_blocks
; ++perm_block
)
123 perm_block_dest
[perm_block
] = num_perm_blocks
;
125 /* allocate block sizes/offsets arrays for out-of-place transpose: */
126 send_block_sizes
= (int *) fftw_malloc(n_pes
* sizeof(int));
127 send_block_offsets
= (int *) fftw_malloc(n_pes
* sizeof(int));
128 recv_block_sizes
= (int *) fftw_malloc(n_pes
* sizeof(int));
129 recv_block_offsets
= (int *) fftw_malloc(n_pes
* sizeof(int));
130 for (step
= 0; step
< n_pes
; ++step
)
131 send_block_sizes
[step
] = send_block_offsets
[step
] =
132 recv_block_sizes
[step
] = recv_block_offsets
[step
] = 0;
134 if (local_nx
> 0 || local_ny
> 0) {
138 for (pe
= 0; pe
< n_pes
; ++pe
) {
139 int pe_nx
, pe_x_start
, pe_ny
, pe_y_start
;
141 transpose_mpi_get_local_size(nx
,pe
,n_pes
,
143 transpose_mpi_get_local_size(ny
,pe
,n_pes
,
146 if (pe_nx
== 0 && pe_ny
== 0) {
150 else if (pe_nx
* y_block_size
!= pe_ny
* x_block_size
151 && pe_nx
!= 0 && pe_ny
!= 0) {
152 if (sched_sortpe
!= -1)
153 fftw_mpi_die("BUG: More than one PE needs sorting!\n");
155 sched_sort_ascending
=
156 pe_nx
* y_block_size
> pe_ny
* x_block_size
;
160 sched
= make_comm_schedule(sched_npes
);
162 MPI_Comm_free(&comm
);
166 if (sched_sortpe
!= -1) {
167 sort_comm_schedule(sched
,sched_npes
,sched_sortpe
);
168 if (!sched_sort_ascending
)
169 invert_comm_schedule(sched
,sched_npes
);
172 send_block_size
= local_nx
* y_block_size
;
173 recv_block_size
= local_ny
* x_block_size
;
175 num_steps
= sched_npes
;
177 exchange
= (transpose_mpi_exchange
*)
178 fftw_malloc(num_steps
* sizeof(transpose_mpi_exchange
));
180 free_comm_schedule(sched
,sched_npes
);
181 MPI_Comm_free(&comm
);
185 for (step
= 0; step
< sched_npes
; ++step
) {
187 int dest_nx
, dest_x_start
;
188 int dest_ny
, dest_y_start
;
189 int num_perm_blocks_received
, num_perm_rows_received
;
191 exchange
[step
].dest_pe
= dest_pe
=
192 exchange
[step
].block_num
= sched
[my_pe
][step
];
194 if (exchange
[step
].block_num
== -1)
195 fftw_mpi_die("BUG: schedule ended too early.\n");
197 transpose_mpi_get_local_size(nx
,dest_pe
,n_pes
,
198 &dest_nx
,&dest_x_start
);
199 transpose_mpi_get_local_size(ny
,dest_pe
,n_pes
,
200 &dest_ny
,&dest_y_start
);
202 exchange
[step
].send_size
= local_nx
* dest_ny
;
203 exchange
[step
].recv_size
= dest_nx
* local_ny
;
205 send_block_sizes
[dest_pe
] = exchange
[step
].send_size
;
206 send_block_offsets
[dest_pe
] = dest_pe
* send_block_size
;
207 recv_block_sizes
[dest_pe
] = exchange
[step
].recv_size
;
208 recv_block_offsets
[dest_pe
] = dest_pe
* recv_block_size
;
210 /* Precompute the post-transpose permutation (ugh): */
211 if (exchange
[step
].recv_size
> 0) {
212 num_perm_blocks_received
=
213 exchange
[step
].recv_size
/ perm_block_size
;
214 num_perm_rows_received
= num_perm_blocks_received
/ local_ny
;
216 for (perm_block
= 0; perm_block
< num_perm_blocks_received
;
218 int old_block
, new_block
;
220 old_block
= perm_block
+ exchange
[step
].block_num
*
221 (recv_block_size
/ perm_block_size
);
222 new_block
= perm_block
% num_perm_rows_received
+
223 dest_x_start
/ perm_block_size
+
224 (perm_block
/ num_perm_rows_received
)
225 * (nx
/ perm_block_size
);
227 if (old_block
>= num_perm_blocks
||
228 new_block
>= num_perm_blocks
)
229 fftw_mpi_die("bad block index in permutation!");
231 perm_block_dest
[old_block
] = new_block
;
236 free_comm_schedule(sched
,sched_npes
);
238 } /* if (local_nx > 0 || local_ny > 0) */
240 p
= (transpose_mpi_plan
)
241 fftw_malloc(sizeof(transpose_mpi_plan_struct
));
244 MPI_Comm_free(&comm
);
249 p
->nx
= nx
; p
->ny
= ny
;
250 p
->local_nx
= local_nx
; p
->local_ny
= local_ny
;
252 p
->my_pe
= my_pe
; p
->n_pes
= n_pes
;
254 p
->exchange
= exchange
;
255 p
->send_block_size
= send_block_size
;
256 p
->recv_block_size
= recv_block_size
;
257 p
->num_steps
= num_steps
;
259 p
->perm_block_dest
= perm_block_dest
;
260 p
->num_perm_blocks
= num_perm_blocks
;
261 p
->perm_block_size
= perm_block_size
;
264 p
->move_size
= move_size
;
266 p
->send_block_sizes
= send_block_sizes
;
267 p
->send_block_offsets
= send_block_offsets
;
268 p
->recv_block_sizes
= recv_block_sizes
;
269 p
->recv_block_offsets
= recv_block_offsets
;
271 p
->all_blocks_equal
= send_block_size
* n_pes
* n_pes
== nx
* ny
&&
272 recv_block_size
* n_pes
* n_pes
== nx
* ny
;
273 if (p
->all_blocks_equal
)
274 for (step
= 0; step
< n_pes
; ++step
)
275 if (send_block_sizes
[step
] != send_block_size
||
276 recv_block_sizes
[step
] != recv_block_size
) {
277 p
->all_blocks_equal
= 0;
280 if (nx
% n_pes
== 0 && ny
% n_pes
== 0 && !p
->all_blocks_equal
)
281 fftw_mpi_die("n_pes divided dimensions but blocks are unequal!");
283 /* Set the type constant for passing to the MPI routines;
284 here, we assume that TRANSPOSE_EL_TYPE is one of the
285 floating-point types. */
286 if (sizeof(TRANSPOSE_EL_TYPE
) == sizeof(double))
287 p
->el_type
= MPI_DOUBLE
;
288 else if (sizeof(TRANSPOSE_EL_TYPE
) == sizeof(float))
289 p
->el_type
= MPI_FLOAT
;
291 fftw_mpi_die("Unknown TRANSPOSE_EL_TYPE!\n");
296 /**************************************************************************/
298 void transpose_mpi_destroy_plan(transpose_mpi_plan p
)
302 fftw_free(p
->exchange
);
303 if (p
->perm_block_dest
)
304 fftw_free(p
->perm_block_dest
);
307 if (p
->send_block_sizes
)
308 fftw_free(p
->send_block_sizes
);
309 if (p
->send_block_offsets
)
310 fftw_free(p
->send_block_offsets
);
311 if (p
->recv_block_sizes
)
312 fftw_free(p
->recv_block_sizes
);
313 if (p
->recv_block_offsets
)
314 fftw_free(p
->recv_block_offsets
);
315 MPI_Comm_free(&p
->comm
);
320 /**************************************************************************/
322 static void exchange_elements(TRANSPOSE_EL_TYPE
*buf1
,
323 TRANSPOSE_EL_TYPE
*buf2
, int n
)
326 TRANSPOSE_EL_TYPE t0
,t1
,t2
,t3
;
328 for (i
= 0; i
< (n
& 3); ++i
) {
333 for (; i
< n
; i
+= 4) {
339 buf1
[i
+1] = buf2
[i
+1];
340 buf1
[i
+2] = buf2
[i
+2];
341 buf1
[i
+3] = buf2
[i
+3];
349 static void do_permutation(TRANSPOSE_EL_TYPE
*data
,
350 int *perm_block_dest
,
356 /* Perform the permutation in the perm_block_dest array, following
357 the cycles around and *changing* the perm_block_dest array
358 to reflect the permutations that have already been performed.
359 At the end of this routine, we change the perm_block_dest
360 array back to its original state. (ugh) */
362 for (start_block
= 0; start_block
< num_perm_blocks
; ++start_block
) {
363 int cur_block
= start_block
;
364 int new_block
= perm_block_dest
[start_block
];
366 while (new_block
> 0 && new_block
< num_perm_blocks
&&
367 new_block
!= start_block
) {
368 exchange_elements(data
+ perm_block_size
*start_block
,
369 data
+ perm_block_size
*new_block
,
371 perm_block_dest
[cur_block
] = -1 - new_block
;
372 cur_block
= new_block
;
373 new_block
= perm_block_dest
[cur_block
];
376 if (new_block
== start_block
)
377 perm_block_dest
[cur_block
] = -1 - new_block
;
380 /* reset the permutation array (ugh): */
381 for (start_block
= 0; start_block
< num_perm_blocks
; ++start_block
)
382 perm_block_dest
[start_block
] = -1 - perm_block_dest
[start_block
];
385 TRANSPOSE_EL_TYPE
*transpose_allocate_send_buf(transpose_mpi_plan p
,
388 TRANSPOSE_EL_TYPE
*send_buf
= 0;
390 /* allocate the send buffer: */
391 if (p
->send_block_size
> 0) {
392 send_buf
= (TRANSPOSE_EL_TYPE
*)
393 fftw_malloc(p
->send_block_size
* el_size
394 * sizeof(TRANSPOSE_EL_TYPE
));
396 fftw_mpi_die("Out of memory!\n");
402 void transpose_in_place_local(transpose_mpi_plan p
,
403 int el_size
, TRANSPOSE_EL_TYPE
*local_data
,
404 transpose_in_place_which which
)
407 case BEFORE_TRANSPOSE
:
409 TOMS_transpose_2d(local_data
,
411 p
->move
, p
->move_size
);
413 TOMS_transpose_2d_arbitrary(local_data
,
416 p
->move
, p
->move_size
);
418 case AFTER_TRANSPOSE
:
419 do_permutation(local_data
, p
->perm_block_dest
,
420 p
->num_perm_blocks
, p
->perm_block_size
* el_size
);
425 /**************************************************************************/
427 static void local_transpose_copy(TRANSPOSE_EL_TYPE
*src
,
428 TRANSPOSE_EL_TYPE
*dest
,
429 int el_size
, int nx
, int ny
)
434 for (x
= 0; x
< nx
; ++x
)
435 for (y
= 0; y
< ny
; ++y
)
436 dest
[y
* nx
+ x
] = src
[x
* ny
+ y
];
437 else if (el_size
== 2)
438 for (x
= 0; x
< nx
; ++x
)
439 for (y
= 0; y
< ny
; ++y
) {
440 dest
[y
* (2 * nx
) + 2*x
] = src
[x
* (2 * ny
) + 2*y
];
441 dest
[y
* (2 * nx
) + 2*x
+ 1] = src
[x
* (2 * ny
) + 2*y
+ 1];
444 for (x
= 0; x
< nx
; ++x
)
445 for (y
= 0; y
< ny
; ++y
)
446 memcpy(&dest
[y
* (el_size
*nx
) + (el_size
*x
)],
447 &src
[x
* (el_size
*ny
) + (el_size
*y
)],
448 el_size
* sizeof(TRANSPOSE_EL_TYPE
));
452 /* Out-of-place version of transpose_mpi (or rather, in place using
454 static void transpose_mpi_out_of_place(transpose_mpi_plan p
, int el_size
,
455 TRANSPOSE_EL_TYPE
*local_data
,
456 TRANSPOSE_EL_TYPE
*work
)
458 local_transpose_copy(local_data
, work
, el_size
, p
->local_nx
, p
->ny
);
460 if (p
->all_blocks_equal
)
461 MPI_Alltoall(work
, p
->send_block_size
* el_size
, p
->el_type
,
462 local_data
, p
->recv_block_size
* el_size
, p
->el_type
,
465 int i
, n_pes
= p
->n_pes
;
467 for (i
= 0; i
< n_pes
; ++i
) {
468 p
->send_block_sizes
[i
] *= el_size
;
469 p
->recv_block_sizes
[i
] *= el_size
;
470 p
->send_block_offsets
[i
] *= el_size
;
471 p
->recv_block_offsets
[i
] *= el_size
;
473 MPI_Alltoallv(work
, p
->send_block_sizes
, p
->send_block_offsets
,
475 local_data
, p
->recv_block_sizes
, p
->recv_block_offsets
,
478 for (i
= 0; i
< n_pes
; ++i
) {
479 p
->send_block_sizes
[i
] /= el_size
;
480 p
->recv_block_sizes
[i
] /= el_size
;
481 p
->send_block_offsets
[i
] /= el_size
;
482 p
->recv_block_offsets
[i
] /= el_size
;
486 do_permutation(local_data
, p
->perm_block_dest
, p
->num_perm_blocks
,
487 p
->perm_block_size
* el_size
);
490 /**************************************************************************/
492 void transpose_mpi(transpose_mpi_plan p
, int el_size
,
493 TRANSPOSE_EL_TYPE
*local_data
,
494 TRANSPOSE_EL_TYPE
*work
)
496 /* if local_data and work are both NULL, we have no way of knowing
497 whether we should use in-place or out-of-place transpose routine;
498 if we guess wrong, MPI_Alltoall will block. We prevent this
499 by making sure that transpose_mpi_get_local_storage_size returns
501 if (!local_data
&& !work
)
502 fftw_mpi_die("local_data and work are both NULL!");
505 transpose_mpi_out_of_place(p
, el_size
, local_data
, work
);
506 else if (p
->local_nx
> 0 || p
->local_ny
> 0) {
508 TRANSPOSE_EL_TYPE
*send_buf
= transpose_allocate_send_buf(p
,el_size
);
510 transpose_in_place_local(p
, el_size
, local_data
, BEFORE_TRANSPOSE
);
512 for (step
= 0; step
< p
->num_steps
; ++step
) {
513 transpose_finish_exchange_step(p
, step
- 1);
515 transpose_start_exchange_step(p
, el_size
, local_data
,
516 send_buf
, step
, TRANSPOSE_SYNC
);
519 transpose_finish_exchange_step(p
, step
- 1);
521 transpose_in_place_local(p
, el_size
, local_data
, AFTER_TRANSPOSE
);
525 } /* if (local_nx > 0 || local_ny > 0) */
528 /**************************************************************************/
530 /* non-blocking routines for overlapping communication and computation: */
/* Select the non-blocking send used by the async exchange: a
   synchronous-mode Issend guarantees the send buffer is safe to reuse
   only after the receiver has started receiving.  The visible source
   had both ISEND definitions unconditionally (the #else/#endif lines
   were lost in extraction); restored here. */
#define USE_SYNCHRONOUS_ISEND 1
#if USE_SYNCHRONOUS_ISEND
#define ISEND MPI_Issend
#else
#define ISEND MPI_Isend
#endif
539 void transpose_get_send_block(transpose_mpi_plan p
, int step
,
540 int *block_y_start
, int *block_ny
)
542 if (p
->local_nx
> 0) {
544 p
->send_block_size
/ p
->local_nx
* p
->exchange
[step
].block_num
;
545 *block_ny
= p
->exchange
[step
].send_size
/ p
->local_nx
;
553 void transpose_start_exchange_step(transpose_mpi_plan p
,
555 TRANSPOSE_EL_TYPE
*local_data
,
556 TRANSPOSE_EL_TYPE
*send_buf
,
558 transpose_sync_type sync_type
)
560 if (p
->local_nx
> 0 || p
->local_ny
> 0) {
561 transpose_mpi_exchange
*exchange
= p
->exchange
;
562 int block
= exchange
[step
].block_num
;
563 int send_block_size
= p
->send_block_size
;
564 int recv_block_size
= p
->recv_block_size
;
566 if (exchange
[step
].dest_pe
!= p
->my_pe
) {
568 /* first, copy to send buffer: */
569 if (exchange
[step
].send_size
> 0)
571 local_data
+ el_size
*send_block_size
*block
,
572 el_size
* exchange
[step
].send_size
*
573 sizeof(TRANSPOSE_EL_TYPE
));
576 if (exchange[step].send_size > 0) { \
578 exchange[step].send_size * el_size, \
580 exchange[step].dest_pe, 0, \
585 p
->request
[0] = MPI_REQUEST_NULL
;
586 p
->request
[1] = MPI_REQUEST_NULL
;
588 if (sync_type
== TRANSPOSE_ASYNC
) {
589 /* Note that we impose an ordering on the sends and
590 receives (lower pe sends first) so that we won't
591 have deadlock if Isend & Irecv are blocking in some
592 MPI implementation: */
594 if (p
->my_pe
< exchange
[step
].dest_pe
)
597 if (exchange
[step
].recv_size
> 0) {
598 MPI_Irecv(local_data
+ el_size
*recv_block_size
*block
,
599 exchange
[step
].recv_size
* el_size
,
601 exchange
[step
].dest_pe
, MPI_ANY_TAG
,
606 if (p
->my_pe
> exchange
[step
].dest_pe
)
609 else /* (sync_type == TRANSPOSE_SYNC) */ {
612 MPI_Sendrecv(send_buf
,
613 exchange
[step
].send_size
* el_size
,
615 exchange
[step
].dest_pe
, 0,
617 local_data
+ el_size
*recv_block_size
*block
,
618 exchange
[step
].recv_size
* el_size
,
620 exchange
[step
].dest_pe
, MPI_ANY_TAG
,
625 else if (exchange
[step
].recv_size
> 0 &&
626 recv_block_size
!= send_block_size
)
627 memmove(local_data
+ el_size
*recv_block_size
*block
,
628 local_data
+ el_size
*send_block_size
*block
,
629 exchange
[step
].recv_size
* el_size
*
630 sizeof(TRANSPOSE_EL_TYPE
));
634 void transpose_finish_exchange_step(transpose_mpi_plan p
, int step
)
636 if ((p
->local_nx
> 0 || p
->local_ny
> 0) && step
>= 0
637 && p
->exchange
[step
].dest_pe
!= p
->my_pe
) {
638 MPI_Status status
[2];
640 MPI_Waitall(2,p
->request
,status
);