qpms/translations_python.c

   1 #include <math.h>
   2 #include "qpms_types.h"
   3 #include "qpms_specfunc.h"
   4 #include "gaunt.h"
   5 #include "translations.h"
   6 #include "indexing.h" // TODO replace size_t and int with own index types here
   7 #include <stdbool.h>
   8 #include <gsl/gsl_sf_legendre.h>
   9 #include <gsl/gsl_sf_bessel.h>
  10 #include "tiny_inlines.h"
  11 #include "assert_cython_workaround.h"
  12 #include "kahansum.h"
  13 #include <gsl/gsl_sf_coupling.h>
  14 #include "qpms_error.h"
  15 #include "normalisation.h"
  16
  17 //#ifdef QPMS_COMPILE_PYTHON_EXTENSIONS
  18 #include <string.h>
  19
  20 #ifdef QPMS_USE_OMP
  21 #include <omp.h>
  22 #endif
  23
  24 int qpms_cython_trans_calculator_get_AB_arrays_loop(
  25     const qpms_trans_calculator *c, const qpms_bessel_t J, const int resnd,
  26     const int daxis, const int saxis,
  27     char *A_data, const npy_intp *A_shape, const npy_intp *A_strides,
  28     char *B_data, const npy_intp *B_shape, const npy_intp *B_strides,
  29     const char *r_data, const npy_intp *r_shape, const npy_intp *r_strides,
  30     const char *theta_data, const npy_intp *theta_shape, const npy_intp *theta_strides,
  31     const char *phi_data, const npy_intp *phi_shape, const npy_intp *phi_strides,
  32     const char *r_ge_d_data, const npy_intp *r_ge_d_shape, const npy_intp *r_ge_d_strides){
  33   assert(daxis != saxis);
  34   assert(resnd >= 2);
  35   int longest_axis = 0;
  36   int longestshape = 1;
  37   const npy_intp *resultshape = A_shape, *resultstrides = A_strides;
  38   // TODO put some restrict's everywhere?
  39   for (int ax = 0; ax < resnd; ++ax){
  40     assert(A_shape[ax] == B_shape[ax]);
  41     assert(A_strides[ax] == B_strides[ax]);
  42     if (daxis == ax || saxis == ax) continue;
  43     if (A_shape[ax] > longestshape) {
  44       longest_axis = ax;
  45       longestshape = 1;
  46     }
  47   }
  48   const npy_intp longlen = resultshape[longest_axis];
  49
  50   npy_intp innerloop_shape[resnd];
  51   for (int ax = 0; ax < resnd; ++ax) {
  52     innerloop_shape[ax] = resultshape[ax];
  53   }
  54   /* longest axis will be iterated in the outer (parallelized) loop.
  55    * Therefore, longest axis, together with saxis and daxis,
  56    * will not be iterated in the inner loop:
  57    */
  58   innerloop_shape[longest_axis] = 1;
  59   innerloop_shape[daxis] = 1;
  60   innerloop_shape[saxis] = 1;
  61
  62   // these are the 'strides' passed to the qpms_trans_calculator_get_AB_arrays_ext
  63   // function, which expects 'const double *' strides, not 'char *' ones.
  64   const npy_intp dstride = resultstrides[daxis] / sizeof(complex double);
  65   const npy_intp sstride = resultstrides[saxis] / sizeof(complex double);
  66
  67   int errval = 0;
  68   // TODO here start parallelisation
  69   //#pragma omp parallel
  70   {
  71     npy_intp local_indices[resnd];
  72     memset(local_indices, 0, sizeof(local_indices));
  73     int errval_local = 0;
  74     size_t longi;
  75     //#pragma omp for
  76     for(longi = 0; longi < longlen; ++longi) {
  77       // this might be done also in the inverse order, but this is more
  78       // 'c-contiguous' way of incrementing the indices
  79       int ax = resnd - 1;
  80       while(ax >= 0) {
  81         /* calculate the correct index/pointer for each array used.
  82          * This can be further optimized from O(resnd * total size of
  83          * the result array) to O(total size of the result array), but
  84          * fick that now
  85          */
  86         const char *r_p = r_data + r_strides[longest_axis] * longi;
  87         const char *theta_p = theta_data + theta_strides[longest_axis] * longi;
  88         const char *phi_p = phi_data + phi_strides[longest_axis] * longi;
  89         const char *r_ge_d_p = r_ge_d_data + r_ge_d_strides[longest_axis] * longi;
  90         char *A_p = A_data + A_strides[longest_axis] * longi;
  91         char *B_p = B_data + B_strides[longest_axis] * longi;
  92         for(int i = 0; i < resnd; ++i) {
  93           // following two lines are probably not needed, as innerloop_shape is there 1 anyway
  94           // so if i == daxis, saxis, or longest_axis, local_indices[i] is zero.
  95           if (i == longest_axis) continue;
  96           if (daxis == i || saxis == i) continue;
  97           r_p += r_strides[i] * local_indices[i];
  98           theta_p += theta_strides[i] * local_indices[i];
  99           phi_p += phi_strides[i] * local_indices[i];
 100           A_p += A_strides[i] * local_indices[i];
 101           B_p += B_strides[i] * local_indices[i];
 102         }
 103
 104         // perform the actual task here
 105         errval_local |= qpms_trans_calculator_get_AB_arrays_ext(c, (complex double *)A_p,
 106             (complex double *)B_p,
 107             dstride, sstride,
 108             // FIXME change all the _ext function types to npy_... so that
 109             // these casts are not needed
 110             *((double *) r_p), *((double *) theta_p), *((double *)phi_p),
 111             (int)(*((npy_bool *) r_ge_d_p)), J);
 112         QPMS_ENSURE_SUCCESS(errval_local);
 113
 114         // increment the last index 'digit' (ax is now resnd-1; we don't have do-while loop in python)
 115         ++local_indices[ax];
 116         while(local_indices[ax] == innerloop_shape[ax] && ax >= 0) {
 117           // overflow to the next digit but stop when reached below the last one
 118           local_indices[ax] = 0;
 119           //local_indices[--ax]++; // dekrementace indexu pod nulu a následná inkrementace poruší paměť FIXME
 120           ax--;
 121           if (ax >= 0) local_indices[ax]++;
 122         }
 123         if (ax >= 0) // did not overflow, get back to the lowest index
 124           ax = resnd - 1;
 125       }
 126     }
 127     errval |= errval_local;
 128   }
 129   // FIXME when parallelizing
 130   // TODO Here end parallelisation
 131   return errval;
 132 }
 133
 134
 135 //#endif // QPMS_COMPILE_PYTHON_EXTENSIONS
 136