src/gmxlib/thread_mpi/reduce.c

   1 /*
   2 This source code file is part of thread_mpi.
   3 Written by Sander Pronk, Erik Lindahl, and possibly others.
   4
   5 Copyright (c) 2009, Sander Pronk, Erik Lindahl.
   6 All rights reserved.
   7
   8 Redistribution and use in source and binary forms, with or without
   9 modification, are permitted provided that the following conditions are met:
  10 1) Redistributions of source code must retain the above copyright
  11    notice, this list of conditions and the following disclaimer.
  12 2) Redistributions in binary form must reproduce the above copyright
  13    notice, this list of conditions and the following disclaimer in the
  14    documentation and/or other materials provided with the distribution.
  15 3) Neither the name of the copyright holders nor the
  16    names of its contributors may be used to endorse or promote products
  17    derived from this software without specific prior written permission.
  18
  19 THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
  20 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  21 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22 DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
  23 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  24 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  25 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  26 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 If you want to redistribute modifications, please consider that
  31 scientific software is very special. Version control is crucial -
  32 bugs must be traceable. We will be happy to consider code for
  33 inclusion in the official distribution, but derived work should not
  34 be called official thread_mpi. Details are found in the README & COPYING
  35 files.
  36 */
  37
  38 #ifdef HAVE_TMPI_CONFIG_H
  39 #include "tmpi_config.h"
  40 #endif
  41
  42 #ifdef HAVE_CONFIG_H
  43 #include "config.h"
  44 #endif
  45
  46
  47 #ifdef HAVE_UNISTD_H
  48 #include <unistd.h>
  49 #endif
  50
  51 #include <errno.h>
  52 #include <stdlib.h>
  53 #include <stdio.h>
  54 #include <stdarg.h>
  55 #include <string.h>
  56
  57 #include "impl.h"
  58 #include "collective.h"
  59
  60
  61
  62 /* run a single binary reduce operation on src_a and src_b, producing dest.
  63       dest and src_a may be identical */
  64 static int tMPI_Reduce_run_op(void *dest, void *src_a, void *src_b,
  65                               tMPI_Datatype datatype, int count, tMPI_Op op,
  66                               tMPI_Comm comm);
  67
  68
  69 static int tMPI_Reduce_run_op(void *dest, void *src_a, void *src_b,
  70                               tMPI_Datatype datatype, int count, tMPI_Op op,
  71                               tMPI_Comm comm)
  72 {
  73     tMPI_Op_fn fn=datatype->op_functions[op];
  74
  75     if (src_a==src_b)
  76     {
  77         return tMPI_Error(comm, TMPI_ERR_XFER_BUF_OVERLAP);
  78     }
  79     fn(dest, src_a, src_b, count);
  80     return TMPI_SUCCESS;
  81 }
  82
  83 int tMPI_Reduce_fast(void* sendbuf, void* recvbuf, int count,
  84                      tMPI_Datatype datatype, tMPI_Op op, int root,
  85                      tMPI_Comm comm)
  86 {
  87     struct tmpi_thread *cur=tMPI_Get_current();
  88     int myrank=tMPI_Comm_seek_rank(comm, cur);
  89
  90     /* this function uses a binary tree-like reduction algorithm: */
  91     int N=tMPI_Comm_N(comm);
  92     int myrank_rtr=(N+myrank-root)%N; /* my rank relative to root */
  93     int Nred=N; /* number of neighbours that still communicate
  94                    (decreases exponentially) */
  95     int nbr_dist=1; /* distance between communicating neighbours
  96                        (increases exponentially) */
  97     int stepping=2; /* distance between non-communicating neighbours
  98                        (increases exponentially) */
  99     int iteration=0;
 100
 101     if (count==0)
 102         return TMPI_SUCCESS;
 103     if (!comm)
 104     {
 105         return tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_COMM);
 106     }
 107     if (!recvbuf)
 108     {
 109         return tMPI_Error(comm, TMPI_ERR_BUF);
 110     }
 111     if ( (!datatype->op_functions) || (!datatype->op_functions[op]) )
 112     {
 113         return tMPI_Error(comm, TMPI_ERR_OP_FN);
 114     }
 115
 116     if (sendbuf==TMPI_IN_PLACE)/* i.e. sendbuf == tMPI_IN_PLACE */
 117     {
 118         sendbuf=recvbuf;
 119     }
 120     /* we set our send and recv buffer s*/
 121     tMPI_Atomic_ptr_set(&(comm->reduce_sendbuf[myrank]),sendbuf);
 122     tMPI_Atomic_ptr_set(&(comm->reduce_recvbuf[myrank]),recvbuf);
 123
 124     while (Nred>1)
 125     {
 126         /* calculate neighbour rank (here we use the real rank) */
 127         int nbr=(myrank_rtr%stepping==0) ?
 128                   (N+myrank+nbr_dist)%N :
 129                   (N+myrank-nbr_dist)%N;
 130
 131 #ifdef TMPI_DEBUG
 132         printf("%d: iteration %d: myrank_rtr=%d, stepping=%d\n",
 133                myrank, iteration, myrank_rtr, stepping);
 134         fflush(stdout);
 135 #endif
 136         /* check if I'm the reducing thread in this iteration's pair: */
 137         if (myrank_rtr%stepping == 0)
 138         {
 139             void *a,*b;
 140             int ret;
 141
 142             /* now wait for my neighbor's data to become ready.
 143                First check if I actually have a neighbor. */
 144             if (myrank_rtr+nbr_dist<N)
 145             {
 146 #ifdef TMPI_DEBUG
 147                 printf("%d: waiting to reduce with %d, iteration=%d\n",
 148                        myrank, nbr, iteration);
 149                 fflush(stdout);
 150 #endif
 151
 152 #if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
 153                 tMPI_Profile_wait_start(cur);
 154 #endif
 155                 tMPI_Event_wait( &(comm->csync[myrank].events[nbr]) );
 156                 tMPI_Event_process( &(comm->csync[myrank].events[nbr]), 1);
 157 #if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
 158                 tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);
 159 #endif
 160
 161 #ifdef TMPI_DEBUG
 162                 printf("%d: reducing with %d, iteration=%d\n",
 163                        myrank, nbr, iteration);
 164                 fflush(stdout);
 165 #endif
 166                 /* we reduce with our neighbour*/
 167                 if (iteration==0)
 168                 {
 169                     /* for the first iteration, the inputs are in the
 170                        sendbuf*/
 171                     a=sendbuf;
 172                     b=(void*)tMPI_Atomic_ptr_get(&(comm->reduce_sendbuf[nbr]));
 173                 }
 174                 else
 175                 {
 176                     /* after the first operation, they're already in
 177                        the recvbuf */
 178                     a=recvbuf;
 179                     b=(void*)tMPI_Atomic_ptr_get(&(comm->reduce_recvbuf[nbr]));
 180                 }
 181                 /* here we check for overlapping buffers */
 182                 if (a==b)
 183                 {
 184                     return tMPI_Error(comm, TMPI_ERR_XFER_BUF_OVERLAP);
 185                 }
 186
 187
 188                 if ((ret=tMPI_Reduce_run_op(recvbuf, a, b, datatype,
 189                                             count, op, comm)) != TMPI_SUCCESS)
 190                     return ret;
 191
 192                 /* signal to my neighbour that I'm ready. */
 193                 tMPI_Event_signal( &(comm->csync[nbr].events[myrank]) );
 194             }
 195             else
 196             {
 197 #ifdef TMPI_DEBUG
 198             printf("%d: not waiting copying buffer\n", myrank);
 199             fflush(stdout);
 200 #endif
 201                 /* we still need to put things in the right buffer for the next
 202                    iteration. We need to check for overlapping buffers
 203                    here because MPI_IN_PLACE might cause recvbuf to be the
 204                    same as sendbuf. */
 205                 if (iteration==0 && (recvbuf!=sendbuf))
 206                     memcpy(recvbuf, sendbuf, datatype->size*count);
 207             }
 208
 209         }
 210         else
 211         {
 212             /* the other thread is doing the reducing; we can just
 213                wait and break when ready */
 214            /* Awake our neighbour */
 215             tMPI_Event_signal( &(comm->csync[nbr].events[myrank]) );
 216
 217
 218 #ifdef TMPI_DEBUG
 219             printf("%d: signalled %d, now waiting: iteration=%d\n",
 220                    nbr, myrank,  iteration);
 221             fflush(stdout);
 222 #endif
 223
 224             /* And wait for an incoming event from out neighbour */
 225 #if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
 226             tMPI_Profile_wait_start(cur);
 227 #endif
 228             tMPI_Event_wait( &(comm->csync[myrank].events[nbr]) );
 229             tMPI_Event_process( &(comm->csync[myrank].events[nbr]), 1);
 230 #if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
 231             tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);
 232 #endif
 233             /* now we can break because our data is reduced, and
 234                our neighbour goes on reducing it further. */
 235             break;
 236         }
 237
 238 #ifdef TMPI_DEBUG
 239         printf("%d: iteration over, iteration=%d\n", myrank,  iteration);
 240         fflush(stdout);
 241 #endif
 242
 243         Nred = Nred/2 + Nred%2;
 244         nbr_dist*=2;
 245         stepping*=2;
 246         iteration++;
 247     }
 248
 249     return TMPI_SUCCESS;
 250 }
 251
 252 int tMPI_Reduce(void* sendbuf, void* recvbuf, int count,
 253                tMPI_Datatype datatype, tMPI_Op op, int root, tMPI_Comm comm)
 254 {
 255     struct tmpi_thread *cur=tMPI_Get_current();
 256     int myrank=tMPI_Comm_seek_rank(comm, cur);
 257     int ret;
 258
 259 #ifdef TMPI_PROFILE
 260     tMPI_Profile_count_start(cur);
 261 #endif
 262 #ifdef TMPI_TRACE
 263     tMPI_Trace_print("tMPI_Reduce(%p, %p, %d, %p, %p, %d, %p)",
 264                        sendbuf, recvbuf, count, datatype, op, root, comm);
 265 #endif
 266
 267     if (myrank==root)
 268     {
 269         if (sendbuf==TMPI_IN_PLACE) /* i.e. sendbuf == TMPI_IN_PLACE */
 270         {
 271             sendbuf=recvbuf;
 272         }
 273     }
 274     else
 275     {
 276 #ifdef TMPI_WARN_MALLOC
 277         fprintf(stderr, "Warning: malloc during tMPI_Reduce\n");
 278 #endif
 279         recvbuf=(void*)tMPI_Malloc(datatype->size*count);
 280     }
 281     ret=tMPI_Reduce_fast(sendbuf, recvbuf, count, datatype, op, root, comm);
 282     if (myrank!=root)
 283     {
 284         free(recvbuf);
 285     }
 286 #ifdef TMPI_PROFILE
 287     tMPI_Profile_count_stop(cur, TMPIFN_Reduce);
 288 #endif
 289     return ret;
 290 }
 291
 292 int tMPI_Allreduce(void* sendbuf, void* recvbuf, int count,
 293                    tMPI_Datatype datatype, tMPI_Op op, tMPI_Comm comm)
 294 {
 295     void *rootbuf=NULL; /* root process' receive buffer */
 296     struct tmpi_thread *cur=tMPI_Get_current();
 297     int myrank=tMPI_Comm_seek_rank(comm, cur);
 298     int ret;
 299
 300 #ifdef TMPI_PROFILE
 301     tMPI_Profile_count_start(cur);
 302 #endif
 303 #ifdef TMPI_TRACE
 304     tMPI_Trace_print("tMPI_Allreduce(%p, %p, %d, %p, %p, %p)",
 305                      sendbuf, recvbuf, count, datatype, op, comm);
 306 #endif
 307     if (count==0)
 308         return TMPI_SUCCESS;
 309     if (!recvbuf)
 310     {
 311         return tMPI_Error(comm, TMPI_ERR_BUF);
 312     }
 313     if (sendbuf==TMPI_IN_PLACE) /* i.e. sendbuf == TMPI_IN_PLACE */
 314     {
 315         sendbuf=recvbuf;
 316     }
 317
 318     ret=tMPI_Reduce_fast(sendbuf, recvbuf, count, datatype, op, 0, comm);
 319 #if defined(TMPI_PROFILE)
 320     tMPI_Profile_wait_start(cur);
 321 #endif
 322     tMPI_Barrier_wait( &(comm->barrier));
 323 #if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
 324     tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);
 325 #endif
 326     /* distribute rootbuf */
 327     rootbuf=(void*)tMPI_Atomic_ptr_get(&(comm->reduce_recvbuf[0]));
 328
 329     /* and now we just copy things back. We know that the root thread
 330        arrives last, so there's no point in using tMPI_Scatter with
 331        copy buffers, etc. */
 332     if (myrank != 0)
 333     {
 334         if (rootbuf==recvbuf)
 335         {
 336             return tMPI_Error(comm, TMPI_ERR_XFER_BUF_OVERLAP);
 337         }
 338         memcpy(recvbuf, rootbuf, datatype->size*count );
 339     }
 340
 341 #if defined(TMPI_PROFILE) && defined(TMPI_CYCLE_COUNT)
 342     tMPI_Profile_wait_start(cur);
 343 #endif
 344     tMPI_Barrier_wait( &(comm->barrier));
 345 #if defined(TMPI_PROFILE)
 346     tMPI_Profile_wait_stop(cur, TMPIWAIT_Reduce);
 347     tMPI_Profile_count_stop(cur, TMPIFN_Allreduce);
 348 #endif
 349     return ret;
 350 }
 351