external/RSL_LITE/cycle.c

   1 #ifndef MS_SUA
   2 # include <stdio.h>
   3 #endif
   4 #include <fcntl.h>
   5
   6 #define STANDARD_ERROR 2
   7
   8 #define STANDARD_OUTPUT 1
   9
  10 #ifndef STUBMPI
  11 #  include "mpi.h"
  12 #endif
  13 #include "rsl_lite.h"
  14
  15 #define  UP_EVEN(A)   ((A)+abs((A)%2))
  16 #define  DOWN_EVEN(A) ((A) - abs((A)%2))
  17 #define  UP_ODD(A)    ((A) + abs(((A)+1)%2))
  18 #define  DOWN_ODD(A)  ((A) - abs(((A)+1)%2))
  19 #define  MIN(A,B)     ((A)<(B)?(A):(B))
  20 #define  MAX(A,B)     ((A)>(B)?(A):(B))
  21
  22 static int *y_curs_src = NULL ;
  23 static int *x_curs_src = NULL ;
  24 static int *y_curs_dst = NULL ;
  25 static int *x_curs_dst = NULL ;
  26 static int *x_peermask_src = NULL ;
  27 static int *x_peermask_dst = NULL ;
  28 static int *nbytes_src = NULL ;
  29 static int *nbytes_dst = NULL ;
  30
  31 #ifndef STUBMPI
  32 static MPI_Request *x_recv = NULL ,  *x_send = NULL ;
  33 #endif
  34
  35 RSL_LITE_INIT_CYCLE (  int * Fcomm ,
  36                 int * xy0 , int * inout0 ,
  37                 int * n3dR0, int *n2dR0, int * typesizeR0 ,
  38                 int * n3dI0, int *n2dI0, int * typesizeI0 ,
  39                 int * n3dD0, int *n2dD0, int * typesizeD0 ,
  40                 int * n3dL0, int *n2dL0, int * typesizeL0 ,
  41                 int * me0, int * np0 , int * np_x0 , int * np_y0 ,
  42                 int * min_subdomain ,
  43                 int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
  44                 int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
  45 {
  46   int n3dR, n2dR, typesizeR ;
  47   int n3dI, n2dI, typesizeI ;
  48   int n3dD, n2dD, typesizeD ;
  49   int n3dL, n2dL, typesizeL ;
  50   int xy, inout ;
  51   int me, np, np_x, np_y, np_dim ;
  52   int ids , ide , jds , jde , kds , kde ;
  53   int ips , ipe , jps , jpe , kps , kpe ;
  54   int ips_send , ipe_send ;
  55   int npts, i, ii, j, jj, m, n, ps, pe, ops, ope ;
  56   int Px, Py, P, Q, swap, coords[2] ;
  57 #ifndef STUBMPI
  58   MPI_Comm *comm, dummy_comm ;
  59   int ierr ;
  60
  61   comm = &dummy_comm ;
  62   *comm = MPI_Comm_f2c( *Fcomm ) ;
  63
  64   xy = *xy0 ;
  65   inout = *inout0 ;     /* 1 is in (uncycled to cycled) 0 is out */
  66   n3dR = *n3dR0 ; n2dR = *n2dR0 ; typesizeR = *typesizeR0 ;
  67   n3dI = *n3dI0 ; n2dI = *n2dI0 ; typesizeI = *typesizeI0 ;
  68   n3dD = *n3dD0 ; n2dD = *n2dD0 ; typesizeD = *typesizeD0 ;
  69   n3dL = *n3dL0 ; n2dL = *n2dL0 ; typesizeL = *typesizeL0 ;
  70   me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
  71   ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
  72   ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
  73
  74   if ( nbytes_src == NULL ) nbytes_src = RSL_MALLOC ( int , np ) ;
  75   if ( nbytes_dst == NULL ) nbytes_dst = RSL_MALLOC ( int , np ) ;
  76   if ( x_curs_src == NULL ) x_curs_src = RSL_MALLOC ( int , np ) ;
  77   if ( x_curs_dst == NULL ) x_curs_dst = RSL_MALLOC ( int , np ) ;
  78   if ( x_peermask_src == NULL ) x_peermask_src = RSL_MALLOC ( int , np ) ;
  79   if ( x_peermask_dst == NULL ) x_peermask_dst = RSL_MALLOC ( int , np ) ;
  80   if ( x_recv == NULL ) x_recv = RSL_MALLOC ( MPI_Request , np ) ;
  81   if ( x_send == NULL ) x_send = RSL_MALLOC ( MPI_Request , np ) ;
  82   for ( i = 0 ; i < np ; i++ ) { nbytes_src[i] = 0 ; x_curs_src[i] = 0 ; x_peermask_src[i] = 0 ; }
  83   for ( i = 0 ; i < np ; i++ ) { nbytes_dst[i] = 0 ; x_curs_dst[i] = 0 ; x_peermask_dst[i] = 0 ; }
  84
  85   if ( xy == 1 ) {   /* xy = 1, cycle in X, otherwise Y */
  86     np_dim = np_x ;
  87     ps = ips ;
  88     pe = ipe ;
  89     ops = jps ;
  90     ope = jpe ;
  91     m = (ide-ids+1)/np_dim ;
  92     n = (m*np_dim)/m ;
  93   } else {
  94     np_dim = np_y ;
  95     ps = jps ;
  96     pe = jpe ;
  97     ops = ips ;
  98     ope = ipe ;
  99     m = (jde-jds+1)/np_dim ;
 100     n = (m*np_dim)/m ;
 101   }
 102
 103   for ( i = ps ; i <= MIN(pe,m*np_dim) ; i++ ) {
 104     ii = (i/n) + (i%n)*m ;
 105     jj = (i/m) + (i%m)*n ;
 106     if ( xy == 1 ) {
 107       TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 108                        min_subdomain, min_subdomain, &ierr ) ;
 109       coords[1] = Px ; coords[0] = Py ;
 110       MPI_Cart_rank( *comm, coords, &P ) ;
 111       TASK_FOR_POINT ( &jj , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 112                        min_subdomain, min_subdomain, &ierr ) ;
 113       coords[1] = Px ; coords[0] = Py ;
 114       MPI_Cart_rank( *comm, coords, &Q ) ;
 115     } else {
 116       TASK_FOR_POINT ( &ips , &ii , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 117                        min_subdomain, min_subdomain, &ierr ) ;
 118       coords[1] = Px ; coords[0] = Py ;
 119       MPI_Cart_rank( *comm, coords, &P ) ;
 120       TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 121                        min_subdomain, min_subdomain, &ierr ) ;
 122       coords[1] = Px ; coords[0] = Py ;
 123       MPI_Cart_rank( *comm, coords, &Q ) ;
 124     }
 125     if ( inout == 0 ) { swap = P ; P = Q ; Q = swap ; }
 126
 127     nbytes_src[P] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
 128                      typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
 129                      typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
 130                      typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
 131
 132     nbytes_dst[Q] += typesizeR*(ope-ops+1)*(n3dR*(kpe-kps+1)+n2dR) +
 133                      typesizeI*(ope-ops+1)*(n3dI*(kpe-kps+1)+n2dI) +
 134                      typesizeD*(ope-ops+1)*(n3dD*(kpe-kps+1)+n2dD) +
 135                      typesizeL*(ope-ops+1)*(n3dL*(kpe-kps+1)+n2dL) ;
 136   }
 137
 138   for ( P = 0 ; P < np ; P++ ) {
 139        buffer_for_proc ( P , nbytes_src[P], RSL_SENDBUF ) ;
 140        buffer_for_proc ( P , nbytes_dst[P], RSL_RECVBUF ) ;
 141   }
 142 #endif
 143 }
 144
 145 RSL_LITE_PACK_CYCLE ( int * Fcomm, char * buf , int * inout0 , int * typesize0 , int * xy0 , int * pu0 , char * memord , int * xstag0 ,
 146            int *me0, int * np0 , int * np_x0 , int * np_y0 ,
 147            int * min_subdomain,
 148            int * ids0 , int * ide0 , int * jds0 , int * jde0 , int * kds0 , int * kde0 ,
 149            int * ims0 , int * ime0 , int * jms0 , int * jme0 , int * kms0 , int * kme0 ,
 150            int * ips0 , int * ipe0 , int * jps0 , int * jpe0 , int * kps0 , int * kpe0 )
 151 {
 152   int me, np, np_x, np_y, np_dim ;
 153   int inout , typesize ;
 154   int ids , ide , jds , jde , kds , kde ;
 155   int ims , ime , jms , jme , kms , kme ;
 156   int ips , ipe , jps , jpe , kps , kpe ;
 157   int xstag ;  /* 0 not stag, 1 stag */
 158   int xy ;   /* y = 0 , x = 1 */
 159   int pu ;   /* pack = 0 , unpack = 1 */
 160   int i, ii, j, jj, m, n  ;
 161   int ps, pe, ops, ope ;
 162   register int k, t ;
 163 #ifdef crayx1
 164   register int i2,i3,i4,i_offset;
 165 #endif
 166   char *p ;
 167   int da_buf ;
 168   int Px, Py, P, coords[2] ;
 169   int ierr = 0 ;
 170   register int *pi, *qi ;
 171   float f ;
 172 #ifndef STUBMPI
 173   MPI_Comm *comm, dummy_comm ;
 174
 175   comm = &dummy_comm ;
 176   *comm = MPI_Comm_f2c( *Fcomm ) ;
 177
 178   me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
 179   xstag = *xstag0 ;
 180   inout = *inout0 ; typesize = *typesize0 ;
 181   ids = *ids0-1 ; ide = *ide0-1 ; jds = *jds0-1 ; jde = *jde0-1 ; kds = *kds0-1 ; kde = *kde0-1 ;
 182   ims = *ims0-1 ; ime = *ime0-1 ; jms = *jms0-1 ; jme = *jme0-1 ; kms = *kms0-1 ; kme = *kme0-1 ;
 183   ips = *ips0-1 ; ipe = *ipe0-1 ; jps = *jps0-1 ; jpe = *jpe0-1 ; kps = *kps0-1 ; kpe = *kpe0-1 ;
 184   xy = *xy0 ;
 185   pu = *pu0 ;
 186
 187 /* need to adapt for other memory orders */
 188 #define IMAX(A) (((A)>ids)?(A):ids)
 189 #define IMIN(A) (((A)<ide)?(A):ide)
 190 #define JMAX(A) (((A)>jds)?(A):jds)
 191 #define JMIN(A) (((A)<jde)?(A):jde)
 192
 193   da_buf = ( pu == 0 ) ? RSL_SENDBUF : RSL_RECVBUF ;
 194
 195   if ( xy == 1 ) {   /* xy = 1, cycle in X, otherwise Y */
 196     np_dim = np_x ;
 197     ps = ips ;
 198     pe = ipe ;
 199     m = (ide-ids+1)/np_dim ;
 200     n = (m*np_dim)/m ;
 201   } else {
 202     np_dim = np_y ;
 203     ps = jps ;
 204     pe = jpe ;
 205     m = (jde-jds+1)/np_dim ;
 206     n = (m*np_dim)/m ;
 207   }
 208
 209   if ( np_x > 1 && xy == 1 ) {
 210
 211     for ( i = ips ; i <= MIN(ipe,m*np_dim-1) ; i++ ) {
 212       if ( pu == 0 ) {
 213         ii = (inout)?(i/n)+(i%n)*m:(i/m)+(i%m)*n  ;
 214         TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 215                        min_subdomain, min_subdomain, &ierr ) ;
 216         coords[1] = Px ; coords[0] = Py ;
 217         MPI_Cart_rank( *comm, coords, &P ) ;
 218         p = buffer_for_proc( P , 0 , da_buf ) ;
 219         if ( typesize == sizeof(int) ) {
 220           for ( j = jps ; j <= jpe ; j++ ) {
 221             for ( k = kps ; k <= kpe ; k++ ) {
 222               pi = (int *)(p+x_curs_src[P]) ;
 223               qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
 224                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
 225               *pi++ = *qi++ ;
 226               x_curs_src[P] += typesize ;
 227             }
 228           }
 229         }
 230         else {
 231           for ( j = jps ; j <= jpe ; j++ ) {
 232             for ( k = kps ; k <= kpe ; k++ ) {
 233               for ( t = 0 ; t < typesize ; t++ ) {
 234                 *(p+x_curs_src[P]) =
 235                                *(buf + t + typesize*(
 236                                       (i-ims) + (ime-ims+1)*(
 237                                       (k-kms) + (j-jms)*(kme-kms+1))) ) ;
 238                 x_curs_src[P]++ ;
 239               }
 240             }
 241           }
 242         }
 243       } else {
 244         ii = (inout)?(i/m)+(i%m)*n:(i/n)+(i%n)*m  ;
 245         TASK_FOR_POINT ( &ii , &jps , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 246                        min_subdomain, min_subdomain, &ierr ) ;
 247         coords[1] = Px ; coords[0] = Py ;
 248         MPI_Cart_rank( *comm, coords, &P ) ;
 249         p = buffer_for_proc( P , 0 , da_buf ) ;
 250         if ( typesize == sizeof(int) ) {
 251           for ( j = jps ; j <= jpe ; j++ ) {
 252             for ( k = kps ; k <= kpe ; k++ ) {
 253               pi = (int *)(p+x_curs_dst[P]) ;
 254               qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
 255                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
 256               *qi++ = *pi++ ;
 257               x_curs_dst[P] += typesize ;
 258             }
 259           }
 260         }
 261         else {
 262           for ( j = jps ; j <= jpe ; j++ ) {
 263             for ( k = kps ; k <= kpe ; k++ ) {
 264               for ( t = 0 ; t < typesize ; t++ ) {
 265                                *(buf + t + typesize*(
 266                                       (i-ims) + (ime-ims+1)*(
 267                                       (k-kms) + (j-jms)*(kme-kms+1))) ) =
 268                 *(p+x_curs_dst[P]) ;
 269                 x_curs_dst[P]++ ;
 270               }
 271             }
 272           }
 273         }
 274       }
 275     }
 276   } else if ( np_y > 1 && xy == 0 ) {
 277     for ( j = jps ; j <= MIN(jpe,m*np_dim-1) ; j++ ) {
 278       if ( pu == 0 ) {
 279         jj = (inout)?(j/n) + (j%n)*m:(j/m) + (j%m)*n ;
 280         TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 281                        min_subdomain, min_subdomain, &ierr ) ;
 282         coords[1] = Px ; coords[0] = Py ;
 283         MPI_Cart_rank( *comm, coords, &P ) ;
 284         p = buffer_for_proc( P , 0 , da_buf ) ;
 285         if ( typesize == sizeof(int) ) {
 286           for ( i = ips ; i <= ipe ; i++ ) {
 287             for ( k = kps ; k <= kpe ; k++ ) {
 288               pi = (int *)(p+x_curs_src[P]) ;
 289               qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
 290                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
 291               *pi++ = *qi++ ;
 292               x_curs_src[P] += typesize ;
 293             }
 294           }
 295         }
 296         else {
 297           for ( i = ips ; i <= ipe ; i++ ) {
 298             for ( k = kps ; k <= kpe ; k++ ) {
 299               for ( t = 0 ; t < typesize ; t++ ) {
 300                 *(p+x_curs_src[P]) =
 301                                *(buf + t + typesize*(
 302                                       (i-ims) + (ime-ims+1)*(
 303                                       (k-kms) + (j-jms)*(kme-kms+1))) ) ;
 304                 x_curs_src[P]++ ;
 305               }
 306             }
 307           }
 308         }
 309       } else {
 310         jj = (inout)?(j/m) + (j%m)*n:(j/n) + (j%n)*m ;
 311         TASK_FOR_POINT ( &ips , &jj , &ids, &ide , &jds, &jde , &np_x , &np_y , &Px, &Py,
 312                        min_subdomain, min_subdomain, &ierr ) ;
 313         coords[1] = Px ; coords[0] = Py ;
 314         MPI_Cart_rank( *comm, coords, &P ) ;
 315         p = buffer_for_proc( P , 0 , da_buf ) ;
 316         if ( typesize == sizeof(int) ) {
 317           for ( i = ips ; i <= ipe ; i++ ) {
 318             for ( k = kps ; k <= kpe ; k++ ) {
 319               pi = (int *)(p+x_curs_dst[P]) ;
 320               qi = (int *)((buf + typesize*( (i-ims) + (ime-ims+1)*(
 321                                              (k-kms) + (j-jms)*(kme-kms+1))))) ;
 322               *qi++ = *pi++ ;
 323               x_curs_dst[P] += typesize ;
 324             }
 325           }
 326         }
 327         else {
 328           for ( i = ips ; i <= ipe ; i++ ) {
 329             for ( k = kps ; k <= kpe ; k++ ) {
 330               for ( t = 0 ; t < typesize ; t++ ) {
 331                                *(buf + t + typesize*(
 332                                       (i-ims) + (ime-ims+1)*(
 333                                       (k-kms) + (j-jms)*(kme-kms+1))) ) =
 334                 *(p+x_curs_dst[P]) ;
 335                 x_curs_dst[P]++ ;
 336               }
 337             }
 338           }
 339         }
 340       }
 341     }
 342   }
 343 #endif
 344 }
 345
 346 RSL_LITE_CYCLE ( int * Fcomm0, int *me0, int * np0 , int * np_x0 , int * np_y0 )
 347 {
 348   int me, np, np_x, np_y ;
 349   int yp, ym, xp, xm, nb ;
 350 #ifndef STUBMPI
 351   MPI_Status stat ;
 352   MPI_Comm comm, *comm0, dummy_comm ;
 353   int i, P ;
 354
 355   comm0 = &dummy_comm ;
 356   *comm0 = MPI_Comm_f2c( *Fcomm0 ) ;
 357
 358   comm = *comm0 ; me = *me0 ; np = *np0 ; np_x = *np_x0 ; np_y = *np_y0 ;
 359
 360   for ( P = 0 ; P < np ; P++ ) {
 361       nb = buffer_size_for_proc( P, RSL_RECVBUF ) ;
 362       MPI_Irecv ( buffer_for_proc( P, 0, RSL_RECVBUF ), nb, MPI_CHAR, P, me, comm, &(x_recv[P]) ) ;
 363       MPI_Isend ( buffer_for_proc( P, 0, RSL_SENDBUF ), x_curs_src[P], MPI_CHAR, P, P, comm, &(x_send[P]) ) ;
 364   }
 365   for ( P = 0 ; P < np ; P++ ) {
 366       MPI_Wait( &x_recv[P], &stat ) ;
 367       MPI_Wait( &x_send[P], &stat ) ;
 368   }
 369   for ( i = 0 ; i < np ; i++ ) {  x_curs_src[i] = 0 ; x_curs_dst[i] ; }
 370 #endif
 371 }
 372