Merge remote-tracking branch 'origin/release-v4.6.1'
[WRF.git] / external / RSL_LITE / cycle.c
blob0acfd9e3a8ec0e4b279dee6f5632645ca0a22962
1 #ifndef MS_SUA
2 # include <stdio.h>
3 #endif
4 #include <fcntl.h>
6 #define STANDARD_ERROR 2
8 #define STANDARD_OUTPUT 1
10 #ifndef STUBMPI
11 # include "mpi.h"
12 #endif
13 #include "rsl_lite.h"
15 #define UP_EVEN(A) ((A)+abs((A)%2))
16 #define DOWN_EVEN(A) ((A) - abs((A)%2))
17 #define UP_ODD(A) ((A) + abs(((A)+1)%2))
18 #define DOWN_ODD(A) ((A) - abs(((A)+1)%2))
19 #define MIN(A,B) ((A)<(B)?(A):(B))
20 #define MAX(A,B) ((A)>(B)?(A):(B))
22 static int *y_curs_src = NULL ;
23 static int *x_curs_src = NULL ;
24 static int *y_curs_dst = NULL ;
25 static int *x_curs_dst = NULL ;
26 static int *x_peermask_src = NULL ;
27 static int *x_peermask_dst = NULL ;
28 static int *nbytes_src = NULL ;
29 static int *nbytes_dst = NULL ;
31 #ifndef STUBMPI
32 static MPI_Request *x_recv = NULL , *x_send = NULL ;
33 #endif
35 void RSL_LITE_INIT_CYCLE ( int * Fcomm ,
36 int * xy0 , int * inout0 ,
37 int * n3dR0, int *n2dR0, int * typesizeR0 ,
38 int * n3dI0, int *n2dI0, int * typesizeI0 ,
39 int * n3dD0, int *n2dD0, int * typesizeD0 ,
40 int * n3dL0, int *n2dL0, int * typesizeL0 ,
41 int * me0, int * np0 , int * np_x0 , int * np_y0 ,
42 int * min_subdomain ,
43 int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
44 int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
46 int n3dR, n2dR, typesizeR ;
47 int n3dI, n2dI, typesizeI ;
48 int n3dD, n2dD, typesizeD ;
49 int n3dL, n2dL, typesizeL ;
50 int xy, inout ;
51 int me, np, np_x, np_y, np_dim ;
52 int ids , ide , jds , jde , kds , kde ;
53 int ips , ipe , jps , jpe , kps , kpe ;
54 int ips_send , ipe_send ;
55 int npts, i, ii, j, jj, m, n, ps, pe, ops, ope ;
56 int Px, Py, P, Q, swap, coords[2] ;
57 #ifndef STUBMPI
58 MPI_Comm *comm, dummy_comm ;
59 int ierr ;
61 comm = &dummy_comm ;
62 *comm = MPI_Comm_f2c( *Fcomm ) ;
64 xy = *xy0 ;
65 inout = *inout0 ; /* 1 is in (uncycled to cycled) 0 is out */
66 n3dR = *n3dR0 ; n2dR = *n2dR0 ; typesizeR = *typesizeR0 ;
67 n3dI = *n3dI0 ; n2dI = *n2dI0 ; typesizeI = *typesizeI0 ;
68 n3dD = *n3dD0 ; n2dD = *n2dD0 ; typesizeD = *typesizeD0 ;
69 n3dL = *n3dL0 ; n2dL = *n2dL0 ; typesizeL = *typesizeL0 ;
70 me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
71 ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
72 ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
74 if ( nbytes_src == NULL ) nbytes_src = RSL_MALLOC ( int , np ) ;
75 if ( nbytes_dst == NULL ) nbytes_dst = RSL_MALLOC ( int , np ) ;
76 if ( x_curs_src == NULL ) x_curs_src = RSL_MALLOC ( int , np ) ;
77 if ( x_curs_dst == NULL ) x_curs_dst = RSL_MALLOC ( int , np ) ;
78 if ( x_peermask_src == NULL ) x_peermask_src = RSL_MALLOC ( int , np ) ;
79 if ( x_peermask_dst == NULL ) x_peermask_dst = RSL_MALLOC ( int , np ) ;
80 if ( x_recv == NULL ) x_recv = RSL_MALLOC ( MPI_Request , np ) ;
81 if ( x_send == NULL ) x_send = RSL_MALLOC ( MPI_Request , np ) ;
82 for ( i = 0 ; i < np ; i++ ) { nbytes_src[i] = 0 ; x_curs_src[i] = 0 ; x_peermask_src[i] = 0 ; }
83 for ( i = 0 ; i < np ; i++ ) { nbytes_dst[i] = 0 ; x_curs_dst[i] = 0 ; x_peermask_dst[i] = 0 ; }
85 if ( xy == 1 ) { /* xy = 1, cycle in X, otherwise Y */
86 np_dim = np_x ;
87 ps = ips ;
88 pe = ipe ;
89 ops = jps ;
90 ope = jpe ;
91 m = (ide-ids+1)/np_dim ;
92 n = (m*np_dim)/m ;
93 } else {
94 np_dim = np_y ;
95 ps = jps ;
96 pe = jpe ;
97 ops = ips ;
98 ope = ipe ;
99 m = (jde-jds+1)/np_dim ;
100 n = (m*np_dim)/m ;
103 for ( i = ps ; i <= MIN(pe,m*np_dim) ; i++ ) {
104 ii = (i/n) + (i%n)*m ;
105 jj = (i/m) + (i%m)*n ;
106 if ( xy == 1 ) {
107 TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
108 min_subdomain, min_subdomain, &ierr ) ;
109 coords[1] = Px ; coords[0] = Py ;
110 MPI_Cart_rank( *comm, coords, &P ) ;
111 TASK_FOR_POINT ( &jj , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
112 min_subdomain, min_subdomain, &ierr ) ;
113 coords[1] = Px ; coords[0] = Py ;
114 MPI_Cart_rank( *comm, coords, &Q ) ;
115 } else {
116 TASK_FOR_POINT ( &ips , &ii , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
117 min_subdomain, min_subdomain, &ierr ) ;
118 coords[1] = Px ; coords[0] = Py ;
119 MPI_Cart_rank( *comm, coords, &P ) ;
120 TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
121 min_subdomain, min_subdomain, &ierr ) ;
122 coords[1] = Px ; coords[0] = Py ;
123 MPI_Cart_rank( *comm, coords, &Q ) ;
125 if ( inout == 0 ) { swap = P ; P = Q ; Q = swap ; }
127 nbytes_src[P] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
128 typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
129 typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
130 typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
132 nbytes_dst[Q] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
133 typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
134 typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
135 typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
138 for ( P = 0 ; P < np ; P++ ) {
139 buffer_for_proc ( P , nbytes_src[P], RSL_SENDBUF ) ;
140 buffer_for_proc ( P , nbytes_dst[P], RSL_RECVBUF ) ;
142 #endif
145 void RSL_LITE_PACK_CYCLE ( int * Fcomm, char * buf , int * inout0 , int * typesize0 , int * xy0 , int * pu0 , char * memord , int * xstag0 ,
146 int *me0, int * np0 , int * np_x0 , int * np_y0 ,
147 int * min_subdomain,
148 int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
149 int * ims0 , int * ime0 , int * jms0 , int * jme0 , int * kms0 , int * kme0 ,
150 int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
152 int me, np, np_x, np_y, np_dim ;
153 int inout , typesize ;
154 int ids , ide , jds , jde , kds , kde ;
155 int ims , ime , jms , jme , kms , kme ;
156 int ips , ipe , jps , jpe , kps , kpe ;
157 int xstag ; /* 0 not stag, 1 stag */
158 int xy ; /* y = 0 , x = 1 */
159 int pu ; /* pack = 0 , unpack = 1 */
160 int i, ii, j, jj, m, n ;
161 int ps, pe, ops, ope ;
162 register int k, t ;
163 #ifdef crayx1
164 register int i2,i3,i4,i_offset;
165 #endif
166 char *p ;
167 int da_buf ;
168 int Px, Py, P, coords[2] ;
169 int ierr = 0 ;
170 register int *pi, *qi ;
171 float f ;
172 #ifndef STUBMPI
173 MPI_Comm *comm, dummy_comm ;
175 comm = &dummy_comm ;
176 *comm = MPI_Comm_f2c( *Fcomm ) ;
178 me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
179 xstag = *xstag0 ;
180 inout = *inout0 ; typesize = *typesize0 ;
181 ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
182 ims = *ims0-1 ; ime = *ime0-1 ; jms = *jms0-1 ; jme = *jme0-1 ; kms = *kms0-1 ; kme = *kme0-1 ;
183 ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
184 xy = *xy0 ;
185 pu = *pu0 ;
187 /* need to adapt for other memory orders */
188 #define IMAX(A) (((A)>ids)?(A):ids)
189 #define IMIN(A) (((A)<ide)?(A):ide)
190 #define JMAX(A) (((A)>jds)?(A):jds)
191 #define JMIN(A) (((A)<jde)?(A):jde)
193 da_buf = ( pu == 0 ) ? RSL_SENDBUF : RSL_RECVBUF ;
195 if ( xy == 1 ) { /* xy = 1, cycle in X, otherwise Y */
196 np_dim = np_x ;
197 ps = ips ;
198 pe = ipe ;
199 m = (ide-ids+1)/np_dim ;
200 n = (m*np_dim)/m ;
201 } else {
202 np_dim = np_y ;
203 ps = jps ;
204 pe = jpe ;
205 m = (jde-jds+1)/np_dim ;
206 n = (m*np_dim)/m ;
209 if ( np_x > 1 && xy == 1 ) {
211 for ( i = ips ; i <= MIN(ipe,m*np_dim-1) ; i++ ) {
212 if ( pu == 0 ) {
213 ii = (inout)?(i/n)+(i%n)*m:(i/m)+(i%m)*n ;
214 TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
215 min_subdomain, min_subdomain, &ierr ) ;
216 coords[1] = Px ; coords[0] = Py ;
217 MPI_Cart_rank( *comm, coords, &P ) ;
218 p = buffer_for_proc( P , 0 , da_buf ) ;
219 if ( typesize == sizeof(int) ) {
220 for ( j = jps ; j <= jpe ; j++ ) {
221 for ( k = kps ; k <= kpe ; k++ ) {
222 pi = (int *)(p+x_curs_src[P]) ;
223 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
224 (k-kms) + (j-jms)*(kme-kms+1))))) ;
225 *pi++ = *qi++ ;
226 x_curs_src[P] += typesize ;
230 else {
231 for ( j = jps ; j <= jpe ; j++ ) {
232 for ( k = kps ; k <= kpe ; k++ ) {
233 for ( t = 0 ; t < typesize ; t++ ) {
234 *(p+x_curs_src[P]) =
235 *(buf + t + typesize*(
236 (i-ims) + (ime-ims+1)*(
237 (k-kms) + (j-jms)*(kme-kms+1))) ) ;
238 x_curs_src[P]++ ;
243 } else {
244 ii = (inout)?(i/m)+(i%m)*n:(i/n)+(i%n)*m ;
245 TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
246 min_subdomain, min_subdomain, &ierr ) ;
247 coords[1] = Px ; coords[0] = Py ;
248 MPI_Cart_rank( *comm, coords, &P ) ;
249 p = buffer_for_proc( P , 0 , da_buf ) ;
250 if ( typesize == sizeof(int) ) {
251 for ( j = jps ; j <= jpe ; j++ ) {
252 for ( k = kps ; k <= kpe ; k++ ) {
253 pi = (int *)(p+x_curs_dst[P]) ;
254 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
255 (k-kms) + (j-jms)*(kme-kms+1))))) ;
256 *qi++ = *pi++ ;
257 x_curs_dst[P] += typesize ;
261 else {
262 for ( j = jps ; j <= jpe ; j++ ) {
263 for ( k = kps ; k <= kpe ; k++ ) {
264 for ( t = 0 ; t < typesize ; t++ ) {
265 *(buf + t + typesize*(
266 (i-ims) + (ime-ims+1)*(
267 (k-kms) + (j-jms)*(kme-kms+1))) ) =
268 *(p+x_curs_dst[P]) ;
269 x_curs_dst[P]++ ;
276 } else if ( np_y > 1 && xy == 0 ) {
277 for ( j = jps ; j <= MIN(jpe,m*np_dim-1) ; j++ ) {
278 if ( pu == 0 ) {
279 jj = (inout)?(j/n) + (j%n)*m:(j/m) + (j%m)*n ;
280 TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
281 min_subdomain, min_subdomain, &ierr ) ;
282 coords[1] = Px ; coords[0] = Py ;
283 MPI_Cart_rank( *comm, coords, &P ) ;
284 p = buffer_for_proc( P , 0 , da_buf ) ;
285 if ( typesize == sizeof(int) ) {
286 for ( i = ips ; i <= ipe ; i++ ) {
287 for ( k = kps ; k <= kpe ; k++ ) {
288 pi = (int *)(p+x_curs_src[P]) ;
289 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
290 (k-kms) + (j-jms)*(kme-kms+1))))) ;
291 *pi++ = *qi++ ;
292 x_curs_src[P] += typesize ;
296 else {
297 for ( i = ips ; i <= ipe ; i++ ) {
298 for ( k = kps ; k <= kpe ; k++ ) {
299 for ( t = 0 ; t < typesize ; t++ ) {
300 *(p+x_curs_src[P]) =
301 *(buf + t + typesize*(
302 (i-ims) + (ime-ims+1)*(
303 (k-kms) + (j-jms)*(kme-kms+1))) ) ;
304 x_curs_src[P]++ ;
309 } else {
310 jj = (inout)?(j/m) + (j%m)*n:(j/n) + (j%n)*m ;
311 TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
312 min_subdomain, min_subdomain, &ierr ) ;
313 coords[1] = Px ; coords[0] = Py ;
314 MPI_Cart_rank( *comm, coords, &P ) ;
315 p = buffer_for_proc( P , 0 , da_buf ) ;
316 if ( typesize == sizeof(int) ) {
317 for ( i = ips ; i <= ipe ; i++ ) {
318 for ( k = kps ; k <= kpe ; k++ ) {
319 pi = (int *)(p+x_curs_dst[P]) ;
320 qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
321 (k-kms) + (j-jms)*(kme-kms+1))))) ;
322 *qi++ = *pi++ ;
323 x_curs_dst[P] += typesize ;
327 else {
328 for ( i = ips ; i <= ipe ; i++ ) {
329 for ( k = kps ; k <= kpe ; k++ ) {
330 for ( t = 0 ; t < typesize ; t++ ) {
331 *(buf + t + typesize*(
332 (i-ims) + (ime-ims+1)*(
333 (k-kms) + (j-jms)*(kme-kms+1))) ) =
334 *(p+x_curs_dst[P]) ;
335 x_curs_dst[P]++ ;
343 #endif
346 void RSL_LITE_CYCLE ( int * Fcomm0, int *me0, int * np0 , int * np_x0 , int * np_y0 )
348 int me, np, np_x, np_y ;
349 int yp, ym, xp, xm, nb ;
350 #ifndef STUBMPI
351 MPI_Status stat ;
352 MPI_Comm comm, *comm0, dummy_comm ;
353 int i, P ;
355 comm0 = &dummy_comm ;
356 *comm0 = MPI_Comm_f2c( *Fcomm0 ) ;
358 comm = *comm0 ; me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
360 for ( P = 0 ; P < np ; P++ ) {
361 nb = buffer_size_for_proc( P, RSL_RECVBUF ) ;
362 MPI_Irecv ( buffer_for_proc( P, 0, RSL_RECVBUF ), nb, MPI_CHAR, P, me, comm, &(x_recv[P]) ) ;
363 MPI_Isend ( buffer_for_proc( P, 0, RSL_SENDBUF ), x_curs_src[P], MPI_CHAR, P, P, comm, &(x_send[P]) ) ;
365 for ( P = 0 ; P < np ; P++ ) {
366 MPI_Wait( &x_recv[P], &stat ) ;
367 MPI_Wait( &x_send[P], &stat ) ;
369 for ( i = 0 ; i < np ; i++ ) { x_curs_src[i] = 0 ; x_curs_dst[i] ; }
370 #endif