2 * Copyright (c) 1997-1999 Massachusetts Institute of Technology
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
30 void perform_transpose_async(transpose_mpi_plan tp
, int el_size
,
31 TRANSPOSE_EL_TYPE
*data
,
32 int local_nx
, int local_x_start
, int ny
)
35 TRANSPOSE_EL_TYPE
*send_buf
;
38 for (i
= 0; i
< local_nx
* ny
* el_size
; ++i
)
41 send_buf
= transpose_allocate_send_buf(tp
,el_size
);
43 transpose_in_place_local(tp
, el_size
, data
, BEFORE_TRANSPOSE
);
45 for (step
= 0; step
< tp
->num_steps
; ++step
) {
46 int block_y_start
, block_ny
;
48 /* initialize block for step here (overlap initialization
49 and communication): */
50 transpose_get_send_block(tp
, step
,
51 &block_y_start
, &block_ny
);
52 for (i
= 0; i
< local_nx
; ++i
)
53 for (j
= block_y_start
; j
< block_y_start
+ block_ny
; ++j
)
54 for (el
= 0; el
< el_size
; ++el
)
55 data
[el
+el_size
*(i
+j
*local_nx
)]
56 = ny
* (i
+ local_x_start
) + j
;
58 transpose_finish_exchange_step(tp
, step
- 1);
59 transpose_start_exchange_step(tp
,el_size
,data
,send_buf
,step
,
63 transpose_finish_exchange_step(tp
, step
- 1);
65 transpose_in_place_local(tp
, el_size
, data
, AFTER_TRANSPOSE
);
71 void test_transpose_plan(int nx
, int ny
, int el_size
, int verbose
,
72 transpose_mpi_plan tp
,
73 TRANSPOSE_EL_TYPE
*data
,
74 TRANSPOSE_EL_TYPE
*work
,
79 int local_x_start
,local_nx
,local_y_start
,local_ny
, xbsize
,ybsize
;
81 MPI_Comm_rank(MPI_COMM_WORLD
,&my_pe
);
82 MPI_Comm_size(MPI_COMM_WORLD
,&n_pes
);
84 transpose_mpi_get_local_size(nx
,my_pe
,n_pes
,
85 &local_nx
,&local_x_start
);
86 transpose_mpi_get_local_size(ny
,my_pe
,n_pes
,
87 &local_ny
,&local_y_start
);
90 perform_transpose_async(tp
,el_size
,data
,local_nx
,local_x_start
,ny
);
93 for (i
= 0; i
< local_nx
; ++i
)
94 for (j
= 0; j
< ny
; ++j
)
95 for (el
= 0; el
< el_size
; ++el
)
96 data
[el
+el_size
*(i
*ny
+j
)]
97 = ny
* (i
+ local_x_start
) + j
;
99 transpose_mpi(tp
,el_size
,data
,work
);
102 /* print out result matrix if it is small: */
103 if (verbose
&& local_ny
< 16 && nx
< 16) {
106 for (pe
= 0; pe
< n_pes
; ++pe
) {
107 MPI_Barrier(MPI_COMM_WORLD
);
109 printf("\nprocess %d result: \n", my_pe
);
110 for (j
= 0; j
< nx
; ++j
) {
111 for (i
= 0; i
< local_ny
; ++i
)
112 printf("%4.0f",data
[el_size
*(i
*nx
+ j
)]);
117 MPI_Barrier(MPI_COMM_WORLD
);
121 printf("Checking result on process %d...\n",my_pe
);
123 for (i
= 0; i
< local_ny
; ++i
)
124 for (j
= 0; j
< nx
; ++j
)
125 for (el
= 0; el
< el_size
; ++el
) {
126 if (data
[el
+el_size
*(i
*nx
+ j
)] !=
127 (TRANSPOSE_EL_TYPE
) (j
*ny
+ i
+local_y_start
)) {
129 "Error with x=%d, y=%d on process %d!\n"
130 " -- is %g rather than %g\n",
131 j
,i
+local_y_start
,my_pe
,
132 data
[el
+el_size
*(i
*nx
+ j
)],
133 (TRANSPOSE_EL_TYPE
) (j
*ny
+ i
+local_y_start
));
134 fftw_die("incorrect result.\n");
139 printf("Process %d okay!\n",my_pe
);
142 void test_transpose(int nx
, int ny
, int el_size
, int verbose
)
146 int local_x_start
,local_nx
,local_y_start
,local_ny
, xbsize
,ybsize
;
147 int total_local_size
;
148 TRANSPOSE_EL_TYPE
*data
, *work
;
149 transpose_mpi_plan tp
;
152 MPI_Comm_rank(MPI_COMM_WORLD
,&my_pe
);
153 MPI_Comm_size(MPI_COMM_WORLD
,&n_pes
);
156 printf("\nTesting transpose of %dx%d matrix (%d elements)...\n",
159 transpose_mpi_get_local_size(nx
,my_pe
,n_pes
,
160 &local_nx
,&local_x_start
);
161 transpose_mpi_get_local_size(ny
,my_pe
,n_pes
,
162 &local_ny
,&local_y_start
);
163 total_local_size
= transpose_mpi_get_local_storage_size(nx
,ny
,
166 if (total_local_size
== 0)
169 data
= (TRANSPOSE_EL_TYPE
*)
170 fftw_malloc(total_local_size
*el_size
*sizeof(TRANSPOSE_EL_TYPE
));
171 work
= (TRANSPOSE_EL_TYPE
*)
172 fftw_malloc(total_local_size
*el_size
*sizeof(TRANSPOSE_EL_TYPE
));
175 tp
= transpose_mpi_create_plan(nx
,ny
,MPI_COMM_WORLD
);
177 /* output plan data, one process at a time: */
179 for (j
= 0; j
< n_pes
; ++j
) {
181 printf("Plan for process %d:\n",j
);
182 printf(" nx = %d, ny = %d\n",tp
->nx
,tp
->ny
);
183 printf(" local_nx = %d, local_ny = %d\n",
184 tp
->local_nx
,tp
->local_ny
);
186 if (local_nx
> 0 || local_ny
> 0) {
187 printf(" send/recv block_size = %d/%d, "
189 tp
->send_block_size
, tp
->recv_block_size
,
191 for (i
= 0; i
< tp
->num_steps
; ++i
)
192 printf(" exchange[%d]: block = %d, dest_pe = %d, "
193 "send_size = %d, recv_size = %d\n",
194 i
,tp
->exchange
[i
].block_num
,
195 tp
->exchange
[i
].dest_pe
,
196 tp
->exchange
[i
].send_size
,
197 tp
->exchange
[i
].recv_size
);
198 printf(" perm_block_size = %d, num_perm_blocks = %d\n",
199 tp
->perm_block_size
, tp
->num_perm_blocks
);
201 for (i
= 0; i
< tp
->num_perm_blocks
; ++i
)
202 printf(" perm_block_dest[%d] = %d\n",
203 i
,tp
->perm_block_dest
[i
]);
209 MPI_Barrier(MPI_COMM_WORLD
);
212 /* testing blocking, in-place transpose */
213 if (verbose
&& my_pe
== 0)
214 printf("\nTesting blocking in-place transpose:\n");
215 test_transpose_plan(nx
,ny
,el_size
,verbose
,tp
,data
,NULL
,0);
216 MPI_Barrier(MPI_COMM_WORLD
);
218 /* Initialize data and check a second time, to make sure that
219 we can reuse plans:*/
220 if (verbose
&& my_pe
== 0) printf("\nTesting again:\n");
221 test_transpose_plan(nx
,ny
,el_size
,verbose
,tp
,data
,NULL
,0);
223 /* testing blocking, out-of-place transpose */
224 if (verbose
&& my_pe
== 0)
225 printf("\nTesting blocking out-of-place transpose:\n");
226 test_transpose_plan(nx
,ny
,el_size
,verbose
,tp
,data
,work
,0);
227 MPI_Barrier(MPI_COMM_WORLD
);
229 /* Initialize data and check a second time, to make sure that
230 we can reuse plans:*/
231 if (verbose
&& my_pe
== 0) printf("\nTesting again:\n");
232 test_transpose_plan(nx
,ny
,el_size
,verbose
,tp
,data
,work
,0);
234 /* Test non-blocking (asynchronous) transpose calls: */
235 if (verbose
&& my_pe
== 0) printf("\nTesting non-blocking transpose:\n");
237 test_transpose_plan(nx
,ny
,el_size
,verbose
,tp
,data
,NULL
,1);
238 MPI_Barrier(MPI_COMM_WORLD
);
240 /* Initialize data and check a second time, to make sure that
241 we can reuse plans:*/
242 if (verbose
&& my_pe
== 0) printf("\nTesting again:\n");
243 test_transpose_plan(nx
,ny
,el_size
,verbose
,tp
,data
,NULL
,1);
245 transpose_mpi_destroy_plan(tp
);
253 printf("Process %d okay!\n",my_pe
);
255 fftw_check_memory_leaks();
261 int main(int argc
, char **argv
)
268 MPI_Init(&argc
,&argv
);
270 fftw_die_hook
= fftw_mpi_die
;
273 MPI_Bcast(&seed
, 1, MPI_INT
, 0, MPI_COMM_WORLD
);
276 MPI_Comm_rank(MPI_COMM_WORLD
,&my_pe
);
278 /* only process 0 is guaranteed to have the correct args */
286 el_size
= atoi(argv
[3]);
288 /* broadcast args to other processes: */
289 MPI_Bcast(&nx
, 1, MPI_INT
, 0, MPI_COMM_WORLD
);
290 MPI_Bcast(&ny
, 1, MPI_INT
, 0, MPI_COMM_WORLD
);
291 MPI_Bcast(&el_size
, 1, MPI_INT
, 0, MPI_COMM_WORLD
);
293 if (nx
> 0 && ny
> 0 && el_size
> 0)
294 test_transpose(nx
,ny
,el_size
,1);
297 printf("Doing random tests (does not terminate)...\n");
299 test_transpose(rand() % 10 + 1, rand() % 10 + 1,
301 test_transpose(rand() % 100 + 1, rand() % 100 + 1,
303 test_transpose(rand() % 1000 + 1, rand() % 1000 + 1,