libc/src/math/generic/log_range_reduction.h

   1 //===-- Extra range reduction steps for accurate pass of logarithms -------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H
  10 #define LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H
  11
  12 #include "common_constants.h"
  13 #include "src/__support/FPUtil/dyadic_float.h"
  14 #include "src/__support/UInt128.h"
  15
  16 namespace __llvm_libc {
  17
// Struct to store -log*(r) for 4 range reduction steps.
// Each member is a table of 128-bit dyadic floats, one table per reduction
// step.  log_range_reduction() reads step_2, step_3 and step_4 using the
// indices it computes for steps 2-4; step_1 is not read in this header
// (presumably it is indexed by the caller's first reduction step).
struct LogRR {
  fputil::DyadicFloat<128> step_1[128]; // 128 entries for step 1.
  fputil::DyadicFloat<128> step_2[193]; // 193 entries, indexed by idx2.
  fputil::DyadicFloat<128> step_3[161]; // 161 entries, indexed by idx3.
  fputil::DyadicFloat<128> step_4[130]; // 130 entries, indexed by idx4.
};
  25
  26 // Perform logarithm range reduction steps 2-4.
  27 // Inputs from the first step of range reduction:
  28 //   m_x : the reduced argument after the first step of range reduction
  29 //         satisfying  -2^-8 <= m_x < 2^-7  and  ulp(m_x) >= 2^-60.
  30 //   idx1: index of the -log(r1) table from the first step.
  31 // Outputs of the extra range reduction steps:
  32 //   sum: adding -log(r1) - log(r2) - log(r3) - log(r4) to the resulted sum.
  33 //   return value: the reduced argument v satisfying:
  34 //                 -0x1.0002143p-29 <= v < 0x1p-29,  and  ulp(v) >= 2^(-125).
  35 LIBC_INLINE fputil::DyadicFloat<128>
  36 log_range_reduction(double m_x, const LogRR &log_table,
  37                     fputil::DyadicFloat<128> &sum) {
  38   using Float128 = typename fputil::DyadicFloat<128>;
  39   using MType = typename Float128::MantissaType;
  40
  41   int64_t v = static_cast<int64_t>(m_x * 0x1.0p60); // ulp = 2^-60
  42
  43   // Range reduction - Step 2
  44   // Output range: vv2 in [-0x1.3ffcp-15, 0x1.3e3dp-15].
  45   // idx2 = trunc(2^14 * (v + 2^-8 + 2^-15))
  46   size_t idx2 = static_cast<size_t>((v + 0x10'2000'0000'0000) >> 46);
  47   sum = fputil::quick_add(sum, log_table.step_2[idx2]);
  48
  49   int64_t s2 = static_cast<int64_t>(S2[idx2]); // |s| <= 2^-7, ulp = 2^-16
  50   int64_t sv2 = s2 * v;             // |s*v| < 2^-14, ulp = 2^(-60-16) = 2^-76
  51   int64_t spv2 = (s2 << 44) + v;    // |s + v| < 2^-14, ulp = 2^-60
  52   int64_t vv2 = (spv2 << 16) + sv2; // |vv2| < 2^-14, ulp = 2^-76
  53
  54   // Range reduction - Step 3
  55   // Output range: vv3 in [-0x1.01928p-22 , 0x1p-22]
  56   // idx3 = trunc(2^21 * (v + 80*2^-21 + 2^-22))
  57   size_t idx3 = static_cast<size_t>((vv2 + 0x2840'0000'0000'0000) >> 55);
  58   sum = fputil::quick_add(sum, log_table.step_3[idx3]);
  59
  60   int64_t s3 = static_cast<int64_t>(S3[idx3]); // |s| < 2^-13, ulp = 2^-21
  61   int64_t spv3 = (s3 << 55) + vv2;             // |s + v| < 2^-21, ulp = 2^-76
  62   // |s*v| < 2^-27, ulp = 2^(-76-21) = 2^-97
  63   Int128 sv3 = static_cast<Int128>(s3) * static_cast<Int128>(vv2);
  64   // |vv3| < 2^-21, ulp = 2^-97
  65   Int128 vv3 = (static_cast<Int128>(spv3) << 21) + sv3;
  66
  67   // Range reduction - Step 4
  68   // Output range: vv4 in [-0x1.0002143p-29 , 0x1p-29]
  69   // idx4 = trunc(2^21 * (v + 65*2^-28 + 2^-29))
  70   size_t idx4 = static_cast<size_t>((static_cast<int>(vv3 >> 68) + 131) >> 1);
  71
  72   sum = fputil::quick_add(sum, log_table.step_4[idx4]);
  73
  74   Int128 s4 = static_cast<Int128>(S4[idx4]); // |s| < 2^-21, ulp = 2^-28
  75   // |s + v| < 2^-28, ulp = 2^-97
  76   Int128 spv4 = (s4 << 69) + vv3;
  77   // |s*v| < 2^-42, ulp = 2^(-97-28) = 2^-125
  78   Int128 sv4 = s4 * vv3;
  79   // |vv4| < 2^-28, ulp = 2^-125
  80   Int128 vv4 = (spv4 << 28) + sv4;
  81
  82   return (vv4 < 0) ? Float128(true, -125,
  83                               MType({static_cast<uint64_t>(-vv4),
  84                                      static_cast<uint64_t>((-vv4) >> 64)}))
  85                    : Float128(false, -125,
  86                               MType({static_cast<uint64_t>(vv4),
  87                                      static_cast<uint64_t>(vv4 >> 64)}));
  88 }
  89
  90 } // namespace __llvm_libc
  91
  92 #endif // LLVM_LIBC_SRC_MATH_GENERIC_LOG_RANGE_REDUCTION_H