6 #define STANDARD_ERROR 2
8 #define STANDARD_OUTPUT 1
15 #define UP_EVEN(A) ((A)+abs((A)%2))
16 #define DOWN_EVEN(A) ((A) - abs((A)%2))
17 #define UP_ODD(A) ((A) + abs(((A)+1)%2))
18 #define DOWN_ODD(A) ((A) - abs(((A)+1)%2))
19 #define MIN(A,B) ((A)<(B)?(A):(B))
20 #define MAX(A,B) ((A)>(B)?(A):(B))
22 static int *y_curs
= NULL
;
23 static int *x_curs
= NULL
;
24 static int *x_peermask
= NULL
;
25 static int *nbytes
= NULL
;
27 static MPI_Request
*x_recv
= NULL
, *x_send
= NULL
;
33 int * n3dR0
, int *n2dR0
, int * typesizeR0
,
34 int * n3dI0
, int *n2dI0
, int * typesizeI0
,
35 int * n3dD0
, int *n2dD0
, int * typesizeD0
,
36 int * n3dL0
, int *n2dL0
, int * typesizeL0
,
37 int * me0
, int * np0
, int * np_x0
, int * np_y0
,
39 int * ids0
, int * ide0
, int * jds0
, int * jde0
, int * kds0
, int * kde0
,
40 int * ips0
, int * ipe0
, int * jps0
, int * jpe0
, int * kps0
, int * kpe0
)
43 int n3dR
, n2dR
, typesizeR
;
44 int n3dI
, n2dI
, typesizeI
;
45 int n3dD
, n2dD
, typesizeD
;
46 int n3dL
, n2dL
, typesizeL
;
48 int me
, np
, np_x
, np_y
;
49 int ids
, ide
, jds
, jde
, kds
, kde
;
50 int ips
, ipe
, jps
, jpe
, kps
, kpe
;
51 int ips_send
, ipe_send
;
52 int npts
, i
, ii
, j
, m
, n
, ps
, pe
, ops
, ope
;
53 int Px
, Py
, P
, coords
[2] ;
54 int ips_swap
, ipe_swap
;
55 MPI_Comm
*comm
, dummy_comm
;
59 *comm
= MPI_Comm_f2c( *Fcomm
) ;
62 n3dR
= *n3dR0
; n2dR
= *n2dR0
; typesizeR
= *typesizeR0
;
63 n3dI
= *n3dI0
; n2dI
= *n2dI0
; typesizeI
= *typesizeI0
;
64 n3dD
= *n3dD0
; n2dD
= *n2dD0
; typesizeD
= *typesizeD0
;
65 n3dL
= *n3dL0
; n2dL
= *n2dL0
; typesizeL
= *typesizeL0
;
66 me
= *me0
; np
= *np0
; np_x
= *np_x0
; np_y
= *np_y0
;
67 ids
= *ids0
-1 ; ide
= *ide0
-1 ; jds
= *jds0
-1 ; jde
= *jde0
-1 ; kds
= *kds0
-1 ; kde
= *kde0
-1 ;
68 ips
= *ips0
-1 ; ipe
= *ipe0
-1 ; jps
= *jps0
-1 ; jpe
= *jpe0
-1 ; kps
= *kps0
-1 ; kpe
= *kpe0
-1 ;
70 if ( nbytes
== NULL
) nbytes
= RSL_MALLOC ( int , np
) ;
71 if ( x_curs
== NULL
) x_curs
= RSL_MALLOC ( int , np
) ;
72 if ( x_peermask
== NULL
) x_peermask
= RSL_MALLOC ( int , np
) ;
73 if ( x_recv
== NULL
) x_recv
= RSL_MALLOC ( MPI_Request
, np
) ;
74 if ( x_send
== NULL
) x_send
= RSL_MALLOC ( MPI_Request
, np
) ;
75 for ( i
= 0 ; i
< np
; i
++ ) { nbytes
[i
] = 0 ; x_curs
[i
] = 0 ; x_peermask
[i
] = 0 ; }
77 if ( xy
== 1 ) { /* xy = 1, swap in X, otherwise Y */
93 for ( i
= UP_ODD( ps
) ; i
<= MIN(pe
,m
) ; i
+=2 ) {
96 TASK_FOR_POINT ( &ii
, &jps
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
97 min_subdomain
, min_subdomain
, &ierr
) ;
98 coords
[1] = Px
; coords
[0] = Py
;
99 MPI_Cart_rank( *comm
, coords
, &P
) ;
101 TASK_FOR_POINT ( &ips
, &ii
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
102 min_subdomain
, min_subdomain
, &ierr
) ;
103 coords
[1] = Px
; coords
[0] = Py
;
104 MPI_Cart_rank( *comm
, coords
, &P
) ;
106 nbytes
[P
] += typesizeR
*(ope
-ops
+1)*(n3dR
*(kpe
-kps
+1)+n2dR
) +
107 typesizeI
*(ope
-ops
+1)*(n3dI
*(kpe
-kps
+1)+n2dI
) +
108 typesizeD
*(ope
-ops
+1)*(n3dD
*(kpe
-kps
+1)+n2dD
) +
109 typesizeL
*(ope
-ops
+1)*(n3dL
*(kpe
-kps
+1)+n2dL
) ;
113 for ( P
= 0 ; P
< np
; P
++ ) {
114 if ( x_peermask
[P
] ) {
115 buffer_for_proc ( P
, nbytes
[P
], RSL_RECVBUF
) ;
116 buffer_for_proc ( P
, nbytes
[P
], RSL_SENDBUF
) ;
122 RSL_LITE_PACK_SWAP ( int * Fcomm
, char * buf
, int * odd0
, int * typesize0
, int * xy0
, int * pu0
, char * memord
, int * xstag0
,
123 int *me0
, int * np0
, int * np_x0
, int * np_y0
,
124 int * min_subdomain
,
125 int * ids0
, int * ide0
, int * jds0
, int * jde0
, int * kds0
, int * kde0
,
126 int * ims0
, int * ime0
, int * jms0
, int * jme0
, int * kms0
, int * kme0
,
127 int * ips0
, int * ipe0
, int * jps0
, int * jpe0
, int * kps0
, int * kpe0
)
130 int me
, np
, np_x
, np_y
;
132 int ids
, ide
, jds
, jde
, kds
, kde
;
133 int ims
, ime
, jms
, jme
, kms
, kme
;
134 int ips
, ipe
, jps
, jpe
, kps
, kpe
;
135 int xstag
; /* 0 not stag, 1 stag */
136 int xy
; /* y = 0 , x = 1 */
137 int pu
; /* pack = 0 , unpack = 1 */
138 int i
, ii
, j
, jj
, m
, n
;
139 int ps
, pe
, ops
, ope
;
142 register int i2
,i3
,i4
,i_offset
;
146 int Px
, Py
, P
, coords
[2] ;
148 register int *pi
, *qi
;
150 MPI_Comm
*comm
, dummy_comm
;
153 *comm
= MPI_Comm_f2c( *Fcomm
) ;
155 me
= *me0
; np
= *np0
; np_x
= *np_x0
; np_y
= *np_y0
;
157 odd
= *odd0
; typesize
= *typesize0
;
158 ids
= *ids0
-1 ; ide
= *ide0
-1 ; jds
= *jds0
-1 ; jde
= *jde0
-1 ; kds
= *kds0
-1 ; kde
= *kde0
-1 ;
159 ims
= *ims0
-1 ; ime
= *ime0
-1 ; jms
= *jms0
-1 ; jme
= *jme0
-1 ; kms
= *kms0
-1 ; kme
= *kme0
-1 ;
160 ips
= *ips0
-1 ; ipe
= *ipe0
-1 ; jps
= *jps0
-1 ; jpe
= *jpe0
-1 ; kps
= *kps0
-1 ; kpe
= *kpe0
-1 ;
164 /* need to adapt for other memory orders */
165 #define RANGE(S1,E1,S2,E2,S3,E3,S4,E4) (((E1)-(S1)+1)*((E2)-(S2)+1)*(((E3)-(S3)+1)/2)*((E4)-(S4)+1))
166 #define IMAX(A) (((A)>ids)?(A):ids)
167 #define IMIN(A) (((A)<ide)?(A):ide)
168 #define JMAX(A) (((A)>jds)?(A):jds)
169 #define JMIN(A) (((A)<jde)?(A):jde)
171 da_buf
= ( pu
== 0 ) ? RSL_SENDBUF
: RSL_RECVBUF
;
174 if ( xy
== 1 ) { /* xy = 1, swap in X, otherwise Y */
175 n
= (ide
-ids
+1)/4*2 ;
178 n
= (jde
-jds
+1)/4*2 ;
182 if ( np_x
> 1 && xy
== 1 ) {
184 for ( i
= UP_ODD(ips
) ; i
<= MIN(ipe
,m
) ; i
+=2 ) {
186 TASK_FOR_POINT ( &ii
, &jps
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
187 min_subdomain
, min_subdomain
, &ierr
) ;
188 coords
[1] = Px
; coords
[0] = Py
;
189 MPI_Cart_rank( *comm
, coords
, &P
) ;
190 p
= buffer_for_proc( P
, 0 , da_buf
) ;
192 if ( typesize
== sizeof(int) ) {
193 for ( j
= JMAX(jps
) ; j
<= JMIN(jpe
) ; j
++ ) {
194 for ( k
= kps
; k
<= kpe
; k
++ ) {
195 pi
= (int *)(p
+x_curs
[P
]) ;
196 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
197 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
199 x_curs
[P
] += typesize
;
204 for ( j
= JMAX(jps
) ; j
<= JMIN(jpe
) ; j
++ ) {
205 for ( k
= kps
; k
<= kpe
; k
++ ) {
206 for ( t
= 0 ; t
< typesize
; t
++ ) {
208 *(buf
+ t
+ typesize
*(
209 (i
-ims
) + (ime
-ims
+1)*(
210 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) ;
217 if ( typesize
== sizeof(int) ) {
218 for ( j
= JMAX(jps
) ; j
<= JMIN(jpe
) ; j
++ ) {
219 for ( k
= kps
; k
<= kpe
; k
++ ) {
220 pi
= (int *)(p
+x_curs
[P
]) ;
221 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
222 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
224 x_curs
[P
] += typesize
;
229 for ( j
= JMAX(jps
) ; j
<= JMIN(jpe
) ; j
++ ) {
230 for ( k
= kps
; k
<= kpe
; k
++ ) {
231 for ( t
= 0 ; t
< typesize
; t
++ ) {
232 *(buf
+ t
+ typesize
*(
233 (i
-ims
) + (ime
-ims
+1)*(
234 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) =
243 } else if ( np_y
> 1 && xy
== 0 ) {
244 for ( j
= UP_ODD(jps
) ; j
<= MIN(jpe
,m
) ; j
+=2 ) {
246 TASK_FOR_POINT ( &ips
, &jj
, &ids
, &ide
, &jds
, &jde
, &np_x
, &np_y
, &Px
, &Py
,
247 min_subdomain
, min_subdomain
, &ierr
) ;
248 coords
[1] = Px
; coords
[0] = Py
;
249 MPI_Cart_rank( *comm
, coords
, &P
) ;
250 p
= buffer_for_proc( P
, 0 , da_buf
) ;
252 if ( typesize
== sizeof(int) ) {
253 for ( i
= IMAX(ips
) ; i
<= IMIN(ipe
) ; i
++ ) {
254 for ( k
= kps
; k
<= kpe
; k
++ ) {
255 pi
= (int *)(p
+x_curs
[P
]) ;
256 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
257 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
259 x_curs
[P
] += typesize
;
264 for ( i
= IMAX(ips
) ; i
<= IMIN(ipe
) ; i
++ ) {
265 for ( k
= kps
; k
<= kpe
; k
++ ) {
266 for ( t
= 0 ; t
< typesize
; t
++ ) {
268 *(buf
+ t
+ typesize
*(
269 (i
-ims
) + (ime
-ims
+1)*(
270 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) ;
277 if ( typesize
== sizeof(int) ) {
278 for ( i
= IMAX(ips
) ; i
<= IMIN(ipe
) ; i
++ ) {
279 for ( k
= kps
; k
<= kpe
; k
++ ) {
280 pi
= (int *)(p
+x_curs
[P
]) ;
281 qi
= (int *)((buf
+ typesize
*( (i
-ims
) + (ime
-ims
+1)*(
282 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))))) ;
284 x_curs
[P
] += typesize
;
289 for ( i
= IMAX(ips
) ; i
<= IMIN(ipe
) ; i
++ ) {
290 for ( k
= kps
; k
<= kpe
; k
++ ) {
291 for ( t
= 0 ; t
< typesize
; t
++ ) {
292 *(buf
+ t
+ typesize
*(
293 (i
-ims
) + (ime
-ims
+1)*(
294 (k
-kms
) + (j
-jms
)*(kme
-kms
+1))) ) =
307 RSL_LITE_SWAP ( int * Fcomm0
, int *me0
, int * np0
, int * np_x0
, int * np_y0
)
310 int me
, np
, np_x
, np_y
;
311 int yp
, ym
, xp
, xm
, nb
;
313 MPI_Comm comm
, *comm0
, dummy_comm
;
316 comm0
= &dummy_comm
;
317 *comm0
= MPI_Comm_f2c( *Fcomm0
) ;
320 comm
= *comm0
; me
= *me0
; np
= *np0
; np_x
= *np_x0
; np_y
= *np_y0
;
322 /* fprintf(stderr,"RSL_LITE_SWAP\n") ; */
324 for ( P
= 0 ; P
< np
; P
++ ) {
325 if ( x_peermask
[P
] ) {
326 nb
= buffer_size_for_proc( P
, RSL_RECVBUF
) ;
327 /* fprintf(stderr,"posting irecv from %d, nb = %d\n",P,nb) ; */
328 MPI_Irecv ( buffer_for_proc( P
, x_curs
[P
], RSL_RECVBUF
), nb
, MPI_CHAR
, P
, me
, comm
, &(x_recv
[P
]) ) ;
329 /* fprintf(stderr,"sending to %d, nb = %d\n",P,x_curs[P]) ; */
330 MPI_Isend ( buffer_for_proc( P
, 0, RSL_SENDBUF
), x_curs
[P
], MPI_CHAR
, P
, P
, comm
, &(x_send
[P
]) ) ;
333 for ( P
= 0 ; P
< np
; P
++ ) {
334 if ( x_peermask
[P
] ) {
335 MPI_Wait( &x_recv
[P
], &stat
) ;
336 MPI_Wait( &x_send
[P
], &stat
) ;
341 fprintf(stderr
,"RSL_LITE_SWAP disabled\n") ;
344 for ( i
= 0 ; i
< np
; i
++ ) { x_curs
[i
] = 0 ; }