changed reading hint
[gromacs/adressmacs.git] / src / fftw / transpose_mpi.c
blob62f4153f5812d0cc6514dff6a82d0a0a6104f322
1 /*
2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 #include <stdlib.h>
21 #include <string.h>
23 #include <mpi.h>
25 #include <fftw_mpi.h>
27 #include "sched.h"
28 #include "TOMS_transpose.h"
30 /**************************************************************************/
/* Return the number of rows assigned to each processor: the smallest
   block size such that n_pes blocks of this size cover all n rows
   (i.e. ceil(n / n_pes)). */
static int transpose_mpi_get_block_size(int n, int n_pes)
{
    int rounded = n + n_pes - 1;  /* round up before the integer divide */
    return rounded / n_pes;
}
/* Compute the chunk of an n-row dimension owned by processor my_pe when
   the rows are dealt out in contiguous blocks of the standard block size.
   On return, *local_start is the first row owned and *local_n the number
   of rows owned (0/0 for processors past the end of the data). */
void transpose_mpi_get_local_size(int n, int my_pe, int n_pes,
                                  int *local_n, int *local_start)
{
    int bsize = transpose_mpi_get_block_size(n, n_pes);
    /* number of processors that actually receive a (possibly partial) block */
    int active_pes = (n + bsize - 1) / bsize;

    if (my_pe >= active_pes) {
        /* this processor holds no rows at all */
        *local_n = 0;
        *local_start = 0;
        return;
    }

    *local_start = my_pe * bsize;
    /* the last active processor gets whatever is left over */
    *local_n = (my_pe == active_pes - 1) ? n - *local_start : bsize;
}
#define MAX2(a,b) ((a) > (b) ? (a) : (b))

/* Return the number of elements (not bytes) that must be allocated on
   this processor to hold the data both before the transpose (local_nx
   rows of ny) and after it (local_ny rows of nx).  Never returns less
   than 1, so callers always get a non-NULL buffer and the in-place /
   out-of-place choice in transpose_mpi() stays unambiguous. */
int transpose_mpi_get_local_storage_size(int nx, int ny,
                                         int my_pe, int n_pes)
{
    int nx_local, ny_local, x0, y0;
    int before, after, size;

    transpose_mpi_get_local_size(nx, my_pe, n_pes, &nx_local, &x0);
    transpose_mpi_get_local_size(ny, my_pe, n_pes, &ny_local, &y0);

    before = nx_local * ny;   /* storage needed pre-transpose */
    after = nx * ny_local;    /* storage needed post-transpose */
    size = before > after ? before : after;
    if (size < 1)
        size = 1;
    return size;
}
/* Greatest common divisor by Euclid's algorithm.
   Requires b > 0 (b == 0 would divide by zero in the original do-while
   form as well); callers pass a block size, which is always positive. */
static int gcd(int a, int b)
{
    while (b != 0) {
        int rem = a % b;
        a = b;
        b = rem;
    }
    return a;
}
82 /**************************************************************************/
/* Create a plan for transposing a distributed nx x ny matrix: each PE
   owns a contiguous block of rows before the transpose and a contiguous
   block of columns (rows of the transpose) afterwards.  The plan
   precomputes the pairwise communication schedule, the per-PE
   Alltoallv counts/offsets, and the post-transpose block permutation
   applied by do_permutation().  Returns 0 (NULL) if the schedule,
   exchange array, or plan struct cannot be allocated.
   NOTE(review): those early-failure returns do not free move,
   perm_block_dest, or the four block-size/offset arrays -- a
   pre-existing leak on error paths, left unchanged here. */
transpose_mpi_plan transpose_mpi_create_plan(int nx, int ny,
                                             MPI_Comm transpose_comm)
{
    transpose_mpi_plan p;
    int my_pe, n_pes, pe;
    int x_block_size, local_nx, local_x_start;
    int y_block_size, local_ny, local_y_start;
    transpose_mpi_exchange *exchange = 0;
    int step, send_block_size = 0, recv_block_size = 0, num_steps = 0;
    int **sched, sched_npes, sched_sortpe, sched_sort_ascending = 0;
    int *perm_block_dest = NULL;
    int num_perm_blocks = 0, perm_block_size = 0, perm_block;
    char *move = NULL;
    int move_size = 0;
    int *send_block_sizes = 0, *send_block_offsets = 0;
    int *recv_block_sizes = 0, *recv_block_offsets = 0;
    MPI_Comm comm;

    /* create a new "clone" communicator so that transpose
       communications do not interfere with caller communications. */
    MPI_Comm_dup(transpose_comm, &comm);

    MPI_Comm_rank(comm, &my_pe);
    MPI_Comm_size(comm, &n_pes);

    /* work space for in-place transpose routine: */
    move_size = (nx + ny) / 2;
    move = (char *) fftw_malloc(sizeof(char) * move_size);

    x_block_size = transpose_mpi_get_block_size(nx, n_pes);
    transpose_mpi_get_local_size(nx, my_pe, n_pes, &local_nx, &local_x_start);
    y_block_size = transpose_mpi_get_block_size(ny, n_pes);
    transpose_mpi_get_local_size(ny, my_pe, n_pes, &local_ny, &local_y_start);

    /* allocate pre-computed post-transpose permutation: data moves in
       units of gcd(nx, x_block_size) elements.  Every destination is
       initialized to the out-of-range value num_perm_blocks, meaning
       "no move recorded for this block". */
    perm_block_size = gcd(nx, x_block_size);
    num_perm_blocks = (nx / perm_block_size) * local_ny;
    perm_block_dest = (int *) fftw_malloc(sizeof(int) * num_perm_blocks);
    for (perm_block = 0; perm_block < num_perm_blocks; ++perm_block)
        perm_block_dest[perm_block] = num_perm_blocks;

    /* allocate block sizes/offsets arrays for out-of-place transpose: */
    send_block_sizes = (int *) fftw_malloc(n_pes * sizeof(int));
    send_block_offsets = (int *) fftw_malloc(n_pes * sizeof(int));
    recv_block_sizes = (int *) fftw_malloc(n_pes * sizeof(int));
    recv_block_offsets = (int *) fftw_malloc(n_pes * sizeof(int));
    for (step = 0; step < n_pes; ++step)
        send_block_sizes[step] = send_block_offsets[step] =
            recv_block_sizes[step] = recv_block_offsets[step] = 0;

    if (local_nx > 0 || local_ny > 0) {

        /* Count the PEs that actually hold data, and locate the (at
           most one) PE whose send and receive block volumes differ --
           that PE's exchanges must happen in a particular order, which
           constrains the schedule below. */
        sched_npes = n_pes;
        sched_sortpe = -1;
        for (pe = 0; pe < n_pes; ++pe) {
            int pe_nx, pe_x_start, pe_ny, pe_y_start;

            transpose_mpi_get_local_size(nx, pe, n_pes,
                                         &pe_nx, &pe_x_start);
            transpose_mpi_get_local_size(ny, pe, n_pes,
                                         &pe_ny, &pe_y_start);

            if (pe_nx == 0 && pe_ny == 0) {
                /* later PEs hold nothing either; shrink the schedule */
                sched_npes = pe;
                break;
            }
            else if (pe_nx * y_block_size != pe_ny * x_block_size
                     && pe_nx != 0 && pe_ny != 0) {
                if (sched_sortpe != -1)
                    fftw_mpi_die("BUG: More than one PE needs sorting!\n");
                sched_sortpe = pe;
                sched_sort_ascending =
                    pe_nx * y_block_size > pe_ny * x_block_size;
            }
        }

        sched = make_comm_schedule(sched_npes);
        if (!sched) {
            MPI_Comm_free(&comm);
            return 0;
        }

        if (sched_sortpe != -1) {
            sort_comm_schedule(sched, sched_npes, sched_sortpe);
            if (!sched_sort_ascending)
                invert_comm_schedule(sched, sched_npes);
        }

        send_block_size = local_nx * y_block_size;
        recv_block_size = local_ny * x_block_size;

        num_steps = sched_npes;

        exchange = (transpose_mpi_exchange *)
            fftw_malloc(num_steps * sizeof(transpose_mpi_exchange));
        if (!exchange) {
            free_comm_schedule(sched, sched_npes);
            MPI_Comm_free(&comm);
            return 0;
        }

        /* For each communication step: record the partner PE, the
           send/receive volumes, the Alltoallv counts/offsets, and the
           contribution to the post-transpose permutation. */
        for (step = 0; step < sched_npes; ++step) {
            int dest_pe;
            int dest_nx, dest_x_start;
            int dest_ny, dest_y_start;
            int num_perm_blocks_received, num_perm_rows_received;

            /* block_num doubles as both the partner's rank and the
               index of the block exchanged with it */
            exchange[step].dest_pe = dest_pe =
                exchange[step].block_num = sched[my_pe][step];

            if (exchange[step].block_num == -1)
                fftw_mpi_die("BUG: schedule ended too early.\n");

            transpose_mpi_get_local_size(nx, dest_pe, n_pes,
                                         &dest_nx, &dest_x_start);
            transpose_mpi_get_local_size(ny, dest_pe, n_pes,
                                         &dest_ny, &dest_y_start);

            exchange[step].send_size = local_nx * dest_ny;
            exchange[step].recv_size = dest_nx * local_ny;

            send_block_sizes[dest_pe] = exchange[step].send_size;
            send_block_offsets[dest_pe] = dest_pe * send_block_size;
            recv_block_sizes[dest_pe] = exchange[step].recv_size;
            recv_block_offsets[dest_pe] = dest_pe * recv_block_size;

            /* Precompute the post-transpose permutation (ugh): */
            if (exchange[step].recv_size > 0) {
                num_perm_blocks_received =
                    exchange[step].recv_size / perm_block_size;
                num_perm_rows_received = num_perm_blocks_received / local_ny;

                for (perm_block = 0; perm_block < num_perm_blocks_received;
                     ++perm_block) {
                    int old_block, new_block;

                    old_block = perm_block + exchange[step].block_num *
                        (recv_block_size / perm_block_size);
                    new_block = perm_block % num_perm_rows_received +
                        dest_x_start / perm_block_size +
                        (perm_block / num_perm_rows_received)
                        * (nx / perm_block_size);

                    if (old_block >= num_perm_blocks ||
                        new_block >= num_perm_blocks)
                        fftw_mpi_die("bad block index in permutation!");

                    perm_block_dest[old_block] = new_block;
                }
            }
        }

        free_comm_schedule(sched, sched_npes);

    } /* if (local_nx > 0 || local_ny > 0) */

    p = (transpose_mpi_plan)
        fftw_malloc(sizeof(transpose_mpi_plan_struct));
    if (!p) {
        fftw_free(exchange);
        MPI_Comm_free(&comm);
        return 0;
    }

    p->comm = comm;
    p->nx = nx; p->ny = ny;
    p->local_nx = local_nx; p->local_ny = local_ny;

    p->my_pe = my_pe; p->n_pes = n_pes;

    p->exchange = exchange;
    p->send_block_size = send_block_size;
    p->recv_block_size = recv_block_size;
    p->num_steps = num_steps;

    p->perm_block_dest = perm_block_dest;
    p->num_perm_blocks = num_perm_blocks;
    p->perm_block_size = perm_block_size;

    p->move = move;
    p->move_size = move_size;

    p->send_block_sizes = send_block_sizes;
    p->send_block_offsets = send_block_offsets;
    p->recv_block_sizes = recv_block_sizes;
    p->recv_block_offsets = recv_block_offsets;

    /* all_blocks_equal means the plain MPI_Alltoall path can be used
       instead of MPI_Alltoallv in the out-of-place transpose */
    p->all_blocks_equal = send_block_size * n_pes * n_pes == nx * ny &&
        recv_block_size * n_pes * n_pes == nx * ny;
    if (p->all_blocks_equal)
        for (step = 0; step < n_pes; ++step)
            if (send_block_sizes[step] != send_block_size ||
                recv_block_sizes[step] != recv_block_size) {
                p->all_blocks_equal = 0;
                break;
            }

    if (nx % n_pes == 0 && ny % n_pes == 0 && !p->all_blocks_equal)
        fftw_mpi_die("n_pes divided dimensions but blocks are unequal!");

    /* Set the type constant for passing to the MPI routines;
       here, we assume that TRANSPOSE_EL_TYPE is one of the
       floating-point types. */
    if (sizeof(TRANSPOSE_EL_TYPE) == sizeof(double))
        p->el_type = MPI_DOUBLE;
    else if (sizeof(TRANSPOSE_EL_TYPE) == sizeof(float))
        p->el_type = MPI_FLOAT;
    else
        fftw_mpi_die("Unknown TRANSPOSE_EL_TYPE!\n");

    return p;
}
296 /**************************************************************************/
298 void transpose_mpi_destroy_plan(transpose_mpi_plan p)
300 if (p) {
301 if (p->exchange)
302 fftw_free(p->exchange);
303 if (p->perm_block_dest)
304 fftw_free(p->perm_block_dest);
305 if (p->move)
306 fftw_free(p->move);
307 if (p->send_block_sizes)
308 fftw_free(p->send_block_sizes);
309 if (p->send_block_offsets)
310 fftw_free(p->send_block_offsets);
311 if (p->recv_block_sizes)
312 fftw_free(p->recv_block_sizes);
313 if (p->recv_block_offsets)
314 fftw_free(p->recv_block_offsets);
315 MPI_Comm_free(&p->comm);
316 fftw_free(p);
320 /**************************************************************************/
322 static void exchange_elements(TRANSPOSE_EL_TYPE *buf1,
323 TRANSPOSE_EL_TYPE *buf2, int n)
325 int i;
326 TRANSPOSE_EL_TYPE t0,t1,t2,t3;
328 for (i = 0; i < (n & 3); ++i) {
329 t0 = buf1[i];
330 buf1[i] = buf2[i];
331 buf2[i] = t0;
333 for (; i < n; i += 4) {
334 t0 = buf1[i];
335 t1 = buf1[i+1];
336 t2 = buf1[i+2];
337 t3 = buf1[i+3];
338 buf1[i] = buf2[i];
339 buf1[i+1] = buf2[i+1];
340 buf1[i+2] = buf2[i+2];
341 buf1[i+3] = buf2[i+3];
342 buf2[i] = t0;
343 buf2[i+1] = t1;
344 buf2[i+2] = t2;
345 buf2[i+3] = t3;
/* Apply the block permutation described by perm_block_dest to data,
   in place: block i (perm_block_size elements starting at
   data + i*perm_block_size) ends up where perm_block_dest[i] says.
   Blocks already moved are marked by storing -1 - dest (always
   negative), which is what the "new_block > 0" test below skips;
   the marks are undone at the end so the plan can be reused. */
static void do_permutation(TRANSPOSE_EL_TYPE *data,
                           int *perm_block_dest,
                           int num_perm_blocks,
                           int perm_block_size)
{
    int start_block;

    /* Perform the permutation in the perm_block_dest array, following
       the cycles around and *changing* the perm_block_dest array
       to reflect the permutations that have already been performed.
       At the end of this routine, we change the perm_block_dest
       array back to its original state. (ugh) */

    for (start_block = 0; start_block < num_perm_blocks; ++start_block) {
        int cur_block = start_block;
        int new_block = perm_block_dest[start_block];

        /* follow this cycle; negative or out-of-range destinations mean
           "already done" or "no move recorded" respectively */
        while (new_block > 0 && new_block < num_perm_blocks &&
               new_block != start_block) {
            /* keep the cycle's running contents in start_block's slot
               and drop the displaced block into place */
            exchange_elements(data + perm_block_size*start_block,
                              data + perm_block_size*new_block,
                              perm_block_size);
            perm_block_dest[cur_block] = -1 - new_block;  /* mark visited */
            cur_block = new_block;
            new_block = perm_block_dest[cur_block];
        }

        if (new_block == start_block)
            perm_block_dest[cur_block] = -1 - new_block;  /* close the cycle */
    }

    /* reset the permutation array (ugh): */
    for (start_block = 0; start_block < num_perm_blocks; ++start_block)
        perm_block_dest[start_block] = -1 - perm_block_dest[start_block];
}
385 TRANSPOSE_EL_TYPE *transpose_allocate_send_buf(transpose_mpi_plan p,
386 int el_size)
388 TRANSPOSE_EL_TYPE *send_buf = 0;
390 /* allocate the send buffer: */
391 if (p->send_block_size > 0) {
392 send_buf = (TRANSPOSE_EL_TYPE *)
393 fftw_malloc(p->send_block_size * el_size
394 * sizeof(TRANSPOSE_EL_TYPE));
395 if (!send_buf)
396 fftw_mpi_die("Out of memory!\n");
399 return send_buf;
402 void transpose_in_place_local(transpose_mpi_plan p,
403 int el_size, TRANSPOSE_EL_TYPE *local_data,
404 transpose_in_place_which which)
406 switch (which) {
407 case BEFORE_TRANSPOSE:
408 if (el_size == 1)
409 TOMS_transpose_2d(local_data,
410 p->local_nx, p->ny,
411 p->move, p->move_size);
412 else
413 TOMS_transpose_2d_arbitrary(local_data,
414 p->local_nx, p->ny,
415 el_size,
416 p->move, p->move_size);
417 break;
418 case AFTER_TRANSPOSE:
419 do_permutation(local_data, p->perm_block_dest,
420 p->num_perm_blocks, p->perm_block_size * el_size);
421 break;
425 /**************************************************************************/
427 static void local_transpose_copy(TRANSPOSE_EL_TYPE *src,
428 TRANSPOSE_EL_TYPE *dest,
429 int el_size, int nx, int ny)
431 int x, y;
433 if (el_size == 1)
434 for (x = 0; x < nx; ++x)
435 for (y = 0; y < ny; ++y)
436 dest[y * nx + x] = src[x * ny + y];
437 else if (el_size == 2)
438 for (x = 0; x < nx; ++x)
439 for (y = 0; y < ny; ++y) {
440 dest[y * (2 * nx) + 2*x] = src[x * (2 * ny) + 2*y];
441 dest[y * (2 * nx) + 2*x + 1] = src[x * (2 * ny) + 2*y + 1];
443 else
444 for (x = 0; x < nx; ++x)
445 for (y = 0; y < ny; ++y)
446 memcpy(&dest[y * (el_size*nx) + (el_size*x)],
447 &src[x * (el_size*ny) + (el_size*y)],
448 el_size * sizeof(TRANSPOSE_EL_TYPE));
/* Out-of-place version of transpose_mpi (or rather, in place using
   a scratch array): */
static void transpose_mpi_out_of_place(transpose_mpi_plan p, int el_size,
                                       TRANSPOSE_EL_TYPE *local_data,
                                       TRANSPOSE_EL_TYPE *work)
{
    /* transpose the local chunk into the scratch array; the all-to-all
       below then scatters it back into local_data */
    local_transpose_copy(local_data, work, el_size, p->local_nx, p->ny);

    if (p->all_blocks_equal)
        MPI_Alltoall(work, p->send_block_size * el_size, p->el_type,
                     local_data, p->recv_block_size * el_size, p->el_type,
                     p->comm);
    else {
        int i, n_pes = p->n_pes;

        /* counts/offsets are stored in element units; scale them to
           el_size scalars for MPI, then scale back afterwards.
           NOTE(review): this temporary mutation makes the plan
           non-reentrant during the call. */
        for (i = 0; i < n_pes; ++i) {
            p->send_block_sizes[i] *= el_size;
            p->recv_block_sizes[i] *= el_size;
            p->send_block_offsets[i] *= el_size;
            p->recv_block_offsets[i] *= el_size;
        }
        MPI_Alltoallv(work, p->send_block_sizes, p->send_block_offsets,
                      p->el_type,
                      local_data, p->recv_block_sizes, p->recv_block_offsets,
                      p->el_type,
                      p->comm);
        for (i = 0; i < n_pes; ++i) {
            p->send_block_sizes[i] /= el_size;
            p->recv_block_sizes[i] /= el_size;
            p->send_block_offsets[i] /= el_size;
            p->recv_block_offsets[i] /= el_size;
        }
    }

    /* reorder the received blocks into the final transposed layout */
    do_permutation(local_data, p->perm_block_dest, p->num_perm_blocks,
                   p->perm_block_size * el_size);
}
490 /**************************************************************************/
/* Top-level distributed transpose.  If a scratch array `work` is
   supplied, use the collective (Alltoall) out-of-place path; otherwise
   run the truly in-place path: local transpose, then one pairwise
   exchange per schedule step, then the final block permutation. */
void transpose_mpi(transpose_mpi_plan p, int el_size,
                   TRANSPOSE_EL_TYPE *local_data,
                   TRANSPOSE_EL_TYPE *work)
{
    /* if local_data and work are both NULL, we have no way of knowing
       whether we should use in-place or out-of-place transpose routine;
       if we guess wrong, MPI_Alltoall will block. We prevent this
       by making sure that transpose_mpi_get_local_storage_size returns
       at least 1. */
    if (!local_data && !work)
        fftw_mpi_die("local_data and work are both NULL!");

    if (work)
        transpose_mpi_out_of_place(p, el_size, local_data, work);
    else if (p->local_nx > 0 || p->local_ny > 0) {
        int step;
        TRANSPOSE_EL_TYPE *send_buf = transpose_allocate_send_buf(p, el_size);

        transpose_in_place_local(p, el_size, local_data, BEFORE_TRANSPOSE);

        for (step = 0; step < p->num_steps; ++step) {
            /* wait for the previous step (no-op when step - 1 < 0) */
            transpose_finish_exchange_step(p, step - 1);

            transpose_start_exchange_step(p, el_size, local_data,
                                          send_buf, step, TRANSPOSE_SYNC);
        }

        /* drain the last exchange before permuting */
        transpose_finish_exchange_step(p, step - 1);

        transpose_in_place_local(p, el_size, local_data, AFTER_TRANSPOSE);

        if (send_buf)
            fftw_free(send_buf);
    } /* if (local_nx > 0 || local_ny > 0) */
}
528 /**************************************************************************/
530 /* non-blocking routines for overlapping communication and computation: */
/* When USE_SYNCHRONOUS_ISEND is nonzero, nonblocking sends use
   MPI_Issend (synchronous mode: completes only once the receive has
   started) instead of MPI_Isend. */
#define USE_SYNCHRONOUS_ISEND 1
#if USE_SYNCHRONOUS_ISEND
#define ISEND MPI_Issend
#else
#define ISEND MPI_Isend
#endif
539 void transpose_get_send_block(transpose_mpi_plan p, int step,
540 int *block_y_start, int *block_ny)
542 if (p->local_nx > 0) {
543 *block_y_start =
544 p->send_block_size / p->local_nx * p->exchange[step].block_num;
545 *block_ny = p->exchange[step].send_size / p->local_nx;
547 else {
548 *block_y_start = 0;
549 *block_ny = 0;
/* Begin communication step `step`: send this PE's outgoing block (via
   send_buf) to exchange[step].dest_pe and receive that PE's block into
   local_data.  TRANSPOSE_ASYNC posts nonblocking operations completed
   later by transpose_finish_exchange_step(); TRANSPOSE_SYNC does a
   blocking MPI_Sendrecv.  A self-exchange degenerates to an optional
   memmove. */
void transpose_start_exchange_step(transpose_mpi_plan p,
                                   int el_size,
                                   TRANSPOSE_EL_TYPE *local_data,
                                   TRANSPOSE_EL_TYPE *send_buf,
                                   int step,
                                   transpose_sync_type sync_type)
{
    if (p->local_nx > 0 || p->local_ny > 0) {
        transpose_mpi_exchange *exchange = p->exchange;
        int block = exchange[step].block_num;
        int send_block_size = p->send_block_size;
        int recv_block_size = p->recv_block_size;

        if (exchange[step].dest_pe != p->my_pe) {

            /* first, copy to send buffer: */
            if (exchange[step].send_size > 0)
                memcpy(send_buf,
                       local_data + el_size*send_block_size*block,
                       el_size * exchange[step].send_size *
                       sizeof(TRANSPOSE_EL_TYPE));

/* Post the nonblocking send of send_buf into request[0]; a macro only
   so it can be issued either before or after the Irecv below. */
#define DO_ISEND \
            if (exchange[step].send_size > 0) { \
                ISEND(send_buf, \
                      exchange[step].send_size * el_size, \
                      p->el_type, \
                      exchange[step].dest_pe, 0, \
                      p->comm, \
                      &p->request[0]); \
            }

            /* defaults so finish's MPI_Waitall is safe even when one
               side of the exchange is empty */
            p->request[0] = MPI_REQUEST_NULL;
            p->request[1] = MPI_REQUEST_NULL;

            if (sync_type == TRANSPOSE_ASYNC) {
                /* Note that we impose an ordering on the sends and
                   receives (lower pe sends first) so that we won't
                   have deadlock if Isend & Irecv are blocking in some
                   MPI implementation: */

                if (p->my_pe < exchange[step].dest_pe)
                    DO_ISEND;

                if (exchange[step].recv_size > 0) {
                    MPI_Irecv(local_data + el_size*recv_block_size*block,
                              exchange[step].recv_size * el_size,
                              p->el_type,
                              exchange[step].dest_pe, MPI_ANY_TAG,
                              p->comm,
                              &p->request[1]);
                }

                if (p->my_pe > exchange[step].dest_pe)
                    DO_ISEND;
            }
            else /* (sync_type == TRANSPOSE_SYNC) */ {
                MPI_Status status;

                MPI_Sendrecv(send_buf,
                             exchange[step].send_size * el_size,
                             p->el_type,
                             exchange[step].dest_pe, 0,

                             local_data + el_size*recv_block_size*block,
                             exchange[step].recv_size * el_size,
                             p->el_type,
                             exchange[step].dest_pe, MPI_ANY_TAG,

                             p->comm, &status);
            }
        }
        else if (exchange[step].recv_size > 0 &&
                 recv_block_size != send_block_size)
            /* exchanging with ourselves: just slide the block to its
               receive position (regions may overlap, hence memmove) */
            memmove(local_data + el_size*recv_block_size*block,
                    local_data + el_size*send_block_size*block,
                    exchange[step].recv_size * el_size *
                    sizeof(TRANSPOSE_EL_TYPE));
    }
}
/* Complete the (possibly nonblocking) exchange begun at `step`: wait on
   both the send and receive requests posted by
   transpose_start_exchange_step().  A no-op for step < 0 (used for the
   first loop iteration), for PEs with no data, and for self-exchanges,
   which post no requests. */
void transpose_finish_exchange_step(transpose_mpi_plan p, int step)
{
    if ((p->local_nx > 0 || p->local_ny > 0) && step >= 0
        && p->exchange[step].dest_pe != p->my_pe) {
        MPI_Status status[2];

        /* unused slots hold MPI_REQUEST_NULL, which Waitall ignores */
        MPI_Waitall(2, p->request, status);
    }
}