libclc/generic/lib/integer/mul_hi.cl

   1 #include <clc/clc.h>
   2
   3 //For all types EXCEPT long, which is implemented separately
   4 #define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
   5     _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \
   6         return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \
   7     } \
   8
   9 //FOIL-based long mul_hi
  10 //
  11 // Summary: Treat mul_hi(long x, long y) as:
  12 // (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
  13 // and b and d are the low-order parts of x and y.
  14 // Thinking back to algebra, we use FOIL to do the work.
  15
  16 _CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){
  17     long f, o, i;
  18     ulong l;
  19
  20     //Move the high/low halves of x/y into the lower 32-bits of variables so
  21     //that we can multiply them without worrying about overflow.
  22     long x_hi = x >> 32;
  23     long x_lo = x & UINT_MAX;
  24     long y_hi = y >> 32;
  25     long y_lo = y & UINT_MAX;
  26
  27     //Multiply all of the components according to FOIL method
  28     f = x_hi * y_hi;
  29     o = x_hi * y_lo;
  30     i = x_lo * y_hi;
  31     l = x_lo * y_lo;
  32
  33     //Now add the components back together in the following steps:
  34     //F: doesn't need to be modified
  35     //O/I: Need to be added together.
  36     //L: Shift right by 32-bits, then add into the sum of O and I
  37     //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
  38     //
  39     //We use hadd to give us a bit of extra precision for the intermediate sums
  40     //but as a result, we shift by 31 bits instead of 32
  41     return (long)(f + (hadd(o, (i + (long)((ulong)l>>32))) >> 31));
  42 }
  43
  44 _CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y){
  45     ulong f, o, i;
  46     ulong l;
  47
  48     //Move the high/low halves of x/y into the lower 32-bits of variables so
  49     //that we can multiply them without worrying about overflow.
  50     ulong x_hi = x >> 32;
  51     ulong x_lo = x & UINT_MAX;
  52     ulong y_hi = y >> 32;
  53     ulong y_lo = y & UINT_MAX;
  54
  55     //Multiply all of the components according to FOIL method
  56     f = x_hi * y_hi;
  57     o = x_hi * y_lo;
  58     i = x_lo * y_hi;
  59     l = x_lo * y_lo;
  60
  61     //Now add the components back together, taking care to respect the fact that:
  62     //F: doesn't need to be modified
  63     //O/I: Need to be added together.
  64     //L: Shift right by 32-bits, then add into the sum of O and I
  65     //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
  66     //
  67     //We use hadd to give us a bit of extra precision for the intermediate sums
  68     //but as a result, we shift by 31 bits instead of 32
  69     return (f + (hadd(o, (i + (l>>32))) >> 31));
  70 }
  71
  72 #define __CLC_MUL_HI_VEC(GENTYPE) \
  73     _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \
  74         return (GENTYPE##2){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)}; \
  75     } \
  76     _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \
  77         return (GENTYPE##3){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)}; \
  78     } \
  79     _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \
  80         return (GENTYPE##4){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
  81     } \
  82     _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \
  83         return (GENTYPE##8){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
  84     } \
  85     _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \
  86         return (GENTYPE##16){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
  87     } \
  88
  89 #define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
  90     __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
  91     __CLC_MUL_HI_VEC(TYPE)
  92
  93 #define __CLC_MUL_HI_TYPES() \
  94     __CLC_MUL_HI_DEC_IMPL(short, char, 8) \
  95     __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \
  96     __CLC_MUL_HI_DEC_IMPL(int, short, 16) \
  97     __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
  98     __CLC_MUL_HI_DEC_IMPL(long, int, 32) \
  99     __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
 100     __CLC_MUL_HI_VEC(long) \
 101     __CLC_MUL_HI_VEC(ulong)
 102
 103 __CLC_MUL_HI_TYPES()
 104
 105 #undef __CLC_MUL_HI_TYPES
 106 #undef __CLC_MUL_HI_DEC_IMPL
 107 #undef __CLC_MUL_HI_IMPL
 108 #undef __CLC_MUL_HI_VEC
 109 #undef __CLC_B32