6 #define STANDARD_ERROR 2
8 #define STANDARD_OUTPUT 1
15 #define UP_EVEN(A) ((A)+abs((A)%2))
16 #define DOWN_EVEN(A) ((A) - abs((A)%2))
17 #define UP_ODD(A) ((A) + abs(((A)+1)%2))
18 #define DOWN_ODD(A) ((A) - abs(((A)+1)%2))
19 #define MIN(A,B) ((A)<(B)?(A):(B))
20 #define MAX(A,B) ((A)>(B)?(A):(B))
22 static int *y_curs_src
= NULL
;
23 static int *x_curs_src
= NULL
;
24 static int *y_curs_dst
= NULL
;
25 static int *x_curs_dst
= NULL
;
26 static int *x_peermask_src
= NULL
;
27 static int *x_peermask_dst
= NULL
;
28 static int *nbytes_src
= NULL
;
29 static int *nbytes_dst
= NULL
;
32 static MPI_Request
*x_recv
= NULL
, *x_send
= NULL
;
35 RSL_LITE_INIT_CYCLE ( int * Fcomm
,
36 int * xy0
, int * inout0
,
37 int * n3dR0
, int *n2dR0
, int * typesizeR0
,
38 int * n3dI0
, int *n2dI0
, int * typesizeI0
,
39 int * n3dD0
, int *n2dD0
, int * typesizeD0
,
40 int * n3dL0
, int *n2dL0
, int * typesizeL0
,
41 int * me0
, int * np0
, int * np_x0
, int * np_y0
,
43 int * ids0
, int * ide0
, int * jds0
, int * jde0
, int * kds0
, int * kde0
,
44 int * ips0
, int * ipe0
, int * jps0
, int * jpe0
, int * kps0
, int * kpe0
)
46 int n3dR
, n2dR
, typesizeR
;
47 int n3dI
, n2dI
, typesizeI
;
48 int n3dD
, n2dD
, typesizeD
;
49 int n3dL
, n2dL
, typesizeL
;
51 int me
, np
, np_x
, np_y
, np_dim
;
52 int ids
, ide
, jds
, jde
, kds
, kde
;
53 int ips
, ipe
, jps
, jpe
, kps
, kpe
;
54 int ips_send
, ipe_send
;
55 int npts
, i
, ii
, j
, jj
, m
, n
, ps
, pe
, ops
, ope
;
56 int Px
, Py
, P
, Q
, swap
, coords
[2] ;
58 MPI_Comm
*comm
, dummy_comm
;
62 *comm
= MPI_Comm_f2c( *Fcomm
) ;
65 inout
= *inout0
; /* 1 is in (uncycled to cycled) 0 is out */
66 n3dR
= *n3dR0
; n2dR
= *n2dR0
; typesizeR
= *typesizeR0
;
67 n3dI
= *n3dI0
; n2dI
= *n2dI0
; typesizeI
= *typesizeI0
;
68 n3dD
= *n3dD0
; n2dD
= *n2dD0
; typesizeD
= *typesizeD0
;
69 n3dL
= *n3dL0
; n2dL
= *n2dL0
; typesizeL
= *typesizeL0
;
70 me
= *me0
; np
= *np0
; np_x
= *np_x0
; np_y
= *np_y0
;
71 ids
= *ids0
-1 ; ide
= *ide0
-1 ; jds
= *jds0
-1 ; jde
= *jde0
-1 ; kds
= *kds0
-1 ; kde
= *kde0
-1 ;
72 ips
= *ips0
-1 ; ipe
= *ipe0
-1 ; jps
= *jps0
-1 ; jpe
= *jpe0
-1 ; kps
= *kps0
-1 ; kpe
= *kpe0
-1 ;
74 if ( nbytes_src
== NULL
) nbytes_src
= RSL_MALLOC ( int , np
) ;
75 if ( nbytes_dst
== NULL
) nbytes_dst
= RSL_MALLOC ( int , np
) ;
76 if ( x_curs_src
== NULL
) x_curs_src
= RSL_MALLOC ( int , np
) ;
77 if ( x_curs_dst
== NULL
) x_curs_dst
= RSL_MALLOC ( int , np
) ;
78 if ( x_peermask_src
== NULL
) x_peermask_src
= RSL_MALLOC ( int , np
) ;
79 if ( x_peermask_dst
== NULL
) x_peermask_dst
= RSL_MALLOC ( int , np
) ;
80 if ( x_recv
== NULL
) x_recv
= RSL_MALLOC ( MPI_Request
, np
) ;
81 if ( x_send
== NULL
) x_send
= RSL_MALLOC ( MPI_Request
, np
) ;
82 for ( i
= 0 ; i
< np
; i
++ ) { nbytes_src
[i
] = 0 ; x_curs_src
[i
] = 0 ; x_peermask_src
[i
] = 0 ; }
83 for ( i
= 0 ; i
< np
; i
++ ) { nbytes_dst
[i
] = 0 ; x_curs_dst
[i
] = 0 ; x_peermask_dst
[i
] = 0 ; }
85 if ( xy
== 1 ) { /* xy = 1, cycle in X, otherwise Y */
91 m
= (ide
-ids
+1)/np_dim
;
99 m
= (jde
-jds
+1)/np_dim
;
103 for ( i
= ps
; i
<= MIN(pe
,m
*np_dim
) ; i
++ ) {
104 ii
= (i
/n
) + (i
%n
)*m
;
105 jj
= (i
/m
) + (i
%m
)*n
;
107 TASK_FOR_POINT ( &ii
, &jps
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
108 min_subdomain
, min_subdomain
, &ierr
) ;
109 coords
[1] = Px
; coords
[0] = Py
;
110 MPI_Cart_rank( *comm
, coords
, &P
) ;
111 TASK_FOR_POINT ( &jj
, &jps
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
112 min_subdomain
, min_subdomain
, &ierr
) ;
113 coords
[1] = Px
; coords
[0] = Py
;
114 MPI_Cart_rank( *comm
, coords
, &Q
) ;
116 TASK_FOR_POINT ( &ips
, &ii
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
117 min_subdomain
, min_subdomain
, &ierr
) ;
118 coords
[1] = Px
; coords
[0] = Py
;
119 MPI_Cart_rank( *comm
, coords
, &P
) ;
120 TASK_FOR_POINT ( &ips
, &jj
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
121 min_subdomain
, min_subdomain
, &ierr
) ;
122 coords
[1] = Px
; coords
[0] = Py
;
123 MPI_Cart_rank( *comm
, coords
, &Q
) ;
125 if ( inout
== 0 ) { swap
= P
; P
= Q
; Q
= swap
; }
127 nbytes_src
[P
] += typesizeR
*(ope
-ops
+1)*(n3dR
*(kpe
-kps
+1)+n2dR
) +
128 typesizeI
*(ope
-ops
+1)*(n3dI
*(kpe
-kps
+1)+n2dI
) +
129 typesizeD
*(ope
-ops
+1)*(n3dD
*(kpe
-kps
+1)+n2dD
) +
130 typesizeL
*(ope
-ops
+1)*(n3dL
*(kpe
-kps
+1)+n2dL
) ;
132 nbytes_dst
[Q
] += typesizeR
*(ope
-ops
+1)*(n3dR
*(kpe
-kps
+1)+n2dR
) +
133 typesizeI
*(ope
-ops
+1)*(n3dI
*(kpe
-kps
+1)+n2dI
) +
134 typesizeD
*(ope
-ops
+1)*(n3dD
*(kpe
-kps
+1)+n2dD
) +
135 typesizeL
*(ope
-ops
+1)*(n3dL
*(kpe
-kps
+1)+n2dL
) ;
138 for ( P
= 0 ; P
< np
; P
++ ) {
139 buffer_for_proc ( P
, nbytes_src
[P
], RSL_SENDBUF
) ;
140 buffer_for_proc ( P
, nbytes_dst
[P
], RSL_RECVBUF
) ;
145 RSL_LITE_PACK_CYCLE ( int * Fcomm
, char * buf
, int * inout0
, int * typesize0
, int * xy0
, int * pu0
, char * memord
, int * xstag0
,
146 int *me0
, int * np0
, int * np_x0
, int * np_y0
,
148 int * ids0
, int * ide0
, int * jds0
, int * jde0
, int * kds0
, int * kde0
,
149 int * ims0
, int * ime0
, int * jms0
, int * jme0
, int * kms0
, int * kme0
,
150 int * ips0
, int * ipe0
, int * jps0
, int * jpe0
, int * kps0
, int * kpe0
)
152 int me
, np
, np_x
, np_y
, np_dim
;
153 int inout
, typesize
;
154 int ids
, ide
, jds
, jde
, kds
, kde
;
155 int ims
, ime
, jms
, jme
, kms
, kme
;
156 int ips
, ipe
, jps
, jpe
, kps
, kpe
;
157 int xstag
; /* 0 not stag, 1 stag */
158 int xy
; /* y = 0 , x = 1 */
159 int pu
; /* pack = 0 , unpack = 1 */
160 int i
, ii
, j
, jj
, m
, n
;
161 int ps
, pe
, ops
, ope
;
164 register int i2
,i3
,i4
,i_offset
;
168 int Px
, Py
, P
, coords
[2] ;
170 register int *pi
, *qi
;
173 MPI_Comm
*comm
, dummy_comm
;
176 *comm
= MPI_Comm_f2c( *Fcomm
) ;
178 me
= *me0
; np
= *np0
; np_x
= *np_x0
; np_y
= *np_y0
;
180 inout
= *inout0
; typesize
= *typesize0
;
181 ids
= *ids0
-1 ; ide
= *ide0
-1 ; jds
= *jds0
-1 ; jde
= *jde0
-1 ; kds
= *kds0
-1 ; kde
= *kde0
-1 ;
182 ims
= *ims0
-1 ; ime
= *ime0
-1 ; jms
= *jms0
-1 ; jme
= *jme0
-1 ; kms
= *kms0
-1 ; kme
= *kme0
-1 ;
183 ips
= *ips0
-1 ; ipe
= *ipe0
-1 ; jps
= *jps0
-1 ; jpe
= *jpe0
-1 ; kps
= *kps0
-1 ; kpe
= *kpe0
-1 ;
187 /* need to adapt for other memory orders */
188 #define IMAX(A) (((A)>ids)?(A):ids)
189 #define IMIN(A) (((A)<ide)?(A):ide)
190 #define JMAX(A) (((A)>jds)?(A):jds)
191 #define JMIN(A) (((A)<jde)?(A):jde)
193 da_buf
= ( pu
== 0 ) ? RSL_SENDBUF
: RSL_RECVBUF
;
195 if ( xy
== 1 ) { /* xy = 1, cycle in X, otherwise Y */
199 m
= (ide
-ids
+1)/np_dim
;
205 m
= (jde
-jds
+1)/np_dim
;
209 if ( np_x
> 1 && xy
== 1 ) {
211 for ( i
= ips
; i
<= MIN(ipe
,m
*np_dim
-1) ; i
++ ) {
213 ii
= (inout
)?(i
/n
)+(i
%n
)*m
:(i
/m
)+(i
%m
)*n
;
214 TASK_FOR_POINT ( &ii
, &jps
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
215 min_subdomain
, min_subdomain
, &ierr
) ;
216 coords
[1] = Px
; coords
[0] = Py
;
217 MPI_Cart_rank( *comm
, coords
, &P
) ;
218 p
= buffer_for_proc( P
, 0 , da_buf
) ;
219 if ( typesize
== sizeof(int) ) {
220 for ( j
= jps
; j
<= jpe
; j
++ ) {
221 for ( k
= kps
; k
<= kpe
; k
++ ) {
222 pi
= (int *)(p
+x_curs_src
[P
]) ;
223 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
224 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
226 x_curs_src
[P
] += typesize
;
231 for ( j
= jps
; j
<= jpe
; j
++ ) {
232 for ( k
= kps
; k
<= kpe
; k
++ ) {
233 for ( t
= 0 ; t
< typesize
; t
++ ) {
235 *(buf
+ t
+ typesize
*(
236 (i
-ims
) + (ime
-ims
+1)*(
237 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) ;
244 ii
= (inout
)?(i
/m
)+(i
%m
)*n
:(i
/n
)+(i
%n
)*m
;
245 TASK_FOR_POINT ( &ii
, &jps
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
246 min_subdomain
, min_subdomain
, &ierr
) ;
247 coords
[1] = Px
; coords
[0] = Py
;
248 MPI_Cart_rank( *comm
, coords
, &P
) ;
249 p
= buffer_for_proc( P
, 0 , da_buf
) ;
250 if ( typesize
== sizeof(int) ) {
251 for ( j
= jps
; j
<= jpe
; j
++ ) {
252 for ( k
= kps
; k
<= kpe
; k
++ ) {
253 pi
= (int *)(p
+x_curs_dst
[P
]) ;
254 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
255 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
257 x_curs_dst
[P
] += typesize
;
262 for ( j
= jps
; j
<= jpe
; j
++ ) {
263 for ( k
= kps
; k
<= kpe
; k
++ ) {
264 for ( t
= 0 ; t
< typesize
; t
++ ) {
265 *(buf
+ t
+ typesize
*(
266 (i
-ims
) + (ime
-ims
+1)*(
267 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) =
276 } else if ( np_y
> 1 && xy
== 0 ) {
277 for ( j
= jps
; j
<= MIN(jpe
,m
*np_dim
-1) ; j
++ ) {
279 jj
= (inout
)?(j
/n
) + (j
%n
)*m
:(j
/m
) + (j
%m
)*n
;
280 TASK_FOR_POINT ( &ips
, &jj
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
281 min_subdomain
, min_subdomain
, &ierr
) ;
282 coords
[1] = Px
; coords
[0] = Py
;
283 MPI_Cart_rank( *comm
, coords
, &P
) ;
284 p
= buffer_for_proc( P
, 0 , da_buf
) ;
285 if ( typesize
== sizeof(int) ) {
286 for ( i
= ips
; i
<= ipe
; i
++ ) {
287 for ( k
= kps
; k
<= kpe
; k
++ ) {
288 pi
= (int *)(p
+x_curs_src
[P
]) ;
289 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
290 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
292 x_curs_src
[P
] += typesize
;
297 for ( i
= ips
; i
<= ipe
; i
++ ) {
298 for ( k
= kps
; k
<= kpe
; k
++ ) {
299 for ( t
= 0 ; t
< typesize
; t
++ ) {
301 *(buf
+ t
+ typesize
*(
302 (i
-ims
) + (ime
-ims
+1)*(
303 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) ;
310 jj
= (inout
)?(j
/m
) + (j
%m
)*n
:(j
/n
) + (j
%n
)*m
;
311 TASK_FOR_POINT ( &ips
, &jj
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
312 min_subdomain
, min_subdomain
, &ierr
) ;
313 coords
[1] = Px
; coords
[0] = Py
;
314 MPI_Cart_rank( *comm
, coords
, &P
) ;
315 p
= buffer_for_proc( P
, 0 , da_buf
) ;
316 if ( typesize
== sizeof(int) ) {
317 for ( i
= ips
; i
<= ipe
; i
++ ) {
318 for ( k
= kps
; k
<= kpe
; k
++ ) {
319 pi
= (int *)(p
+x_curs_dst
[P
]) ;
320 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
321 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
323 x_curs_dst
[P
] += typesize
;
328 for ( i
= ips
; i
<= ipe
; i
++ ) {
329 for ( k
= kps
; k
<= kpe
; k
++ ) {
330 for ( t
= 0 ; t
< typesize
; t
++ ) {
331 *(buf
+ t
+ typesize
*(
332 (i
-ims
) + (ime
-ims
+1)*(
333 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) =
346 RSL_LITE_CYCLE ( int * Fcomm0
, int *me0
, int * np0
, int * np_x0
, int * np_y0
)
348 int me
, np
, np_x
, np_y
;
349 int yp
, ym
, xp
, xm
, nb
;
352 MPI_Comm comm
, *comm0
, dummy_comm
;
355 comm0
= &dummy_comm
;
356 *comm0
= MPI_Comm_f2c( *Fcomm0
) ;
358 comm
= *comm0
; me
= *me0
; np
= *np0
; np_x
= *np_x0
; np_y
= *np_y0
;
360 for ( P
= 0 ; P
< np
; P
++ ) {
361 nb
= buffer_size_for_proc( P
, RSL_RECVBUF
) ;
362 MPI_Irecv ( buffer_for_proc( P
, 0, RSL_RECVBUF
), nb
, MPI_CHAR
, P
, me
, comm
, &(x_recv
[P
]) ) ;
363 MPI_Isend ( buffer_for_proc( P
, 0, RSL_SENDBUF
), x_curs_src
[P
], MPI_CHAR
, P
, P
, comm
, &(x_send
[P
]) ) ;
365 for ( P
= 0 ; P
< np
; P
++ ) {
366 MPI_Wait( &x_recv
[P
], &stat
) ;
367 MPI_Wait( &x_send
[P
], &stat
) ;
369 for ( i
= 0 ; i
< np
; i
++ ) { x_curs_src
[i
] = 0 ; x_curs_dst
[i
] ; }