//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

// Copyright 2018 Ulf Adams
// Copyright (c) Microsoft Corporation. All rights reserved.
// Boost Software License - Version 1.0 - August 17th, 2003

// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:

// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
#ifndef _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H
#define _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H
// Avoid formatting to keep the changes with the original code minimal.
// clang-format off

#include <__assert>
#include <__config>
#include <cstdint>

#include "include/ryu/ryu.h"
_LIBCPP_BEGIN_NAMESPACE_STD

#if defined(_M_X64) && defined(_MSC_VER)
#define _LIBCPP_INTRINSIC128 1
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  return _umul128(__a, __b, __productHi);
}
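
// Illustrative note: _umul128 returns the low 64 bits of the full 128-bit
// product and writes the high 64 bits to *__productHi. For instance,
// multiplying 0xFFFFFFFFFFFFFFFF by 2 yields *__productHi == 1 and a return
// value of 0xFFFFFFFFFFFFFFFE.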

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // For the __shiftright128 intrinsic, the shift value is always
  // modulo 64.
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64.
  // (The shift value is in the range [49, 58].)
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  _LIBCPP_ASSERT_UNCATEGORIZED(__dist < 64, "");
  return __shiftright128(__lo, __hi, static_cast<unsigned char>(__dist));
}
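
// Illustrative note: with __dist < 64, the call above returns bits
// [__dist, __dist + 64) of the 128-bit value (__hi << 64) | __lo, i.e. the
// 64-bit window that starts __dist bits above the least significant bit.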

// ^^^ intrinsics available ^^^ / vvv __int128 available vvv
#elif defined(__SIZEOF_INT128__) && ( \
    (defined(__clang__) && !defined(_MSC_VER)) || \
    (defined(__GNUC__) && !defined(__clang__) && !defined(__CUDACC__)))
#define _LIBCPP_INTRINSIC128 1
// We have __uint128 support in clang or gcc
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  auto __temp = __a * (unsigned __int128)__b;
  *__productHi = __temp >> 64;
  return static_cast<uint64_t>(__temp);
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // In the current implementation of the double-precision version
  // of Ryu, the shift value is always < 64.
  // (The shift value is in the range [49, 58].)
  // Check this here in case a future change requires larger shift
  // values. In this case this function needs to be adjusted.
  _LIBCPP_ASSERT_UNCATEGORIZED(__dist < 64, "");
  auto __temp = __lo | ((unsigned __int128)__hi << 64);
  // For x64 128-bit shifts using the `shrd` instruction and two 64-bit
  // registers, the shift value is modulo 64. Thus the `& 63` is free.
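  // Illustrative note: the Ryu shift values lie in [49, 58], so
  // (__dist & 63) == __dist and the mask below does not change the result;
  // it only mirrors the hardware's modulo-64 shift-count semantics.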
  return static_cast<uint64_t>(__temp >> (__dist & 63));
}
#else // ^^^ __int128 available ^^^ / vvv intrinsics unavailable vvv

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_ALWAYS_INLINE uint64_t __ryu_umul128(const uint64_t __a, const uint64_t __b, uint64_t* const __productHi) {
  // TRANSITION, VSO-634761
  // The casts here help MSVC to avoid calls to the __allmul library function.
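  // Sketch of the technique used below (schoolbook multiplication on 32-bit
  // halves): writing __a == __aHi * 2^32 + __aLo and __b == __bHi * 2^32 + __bLo,
  //   __a * __b == __b11 * 2^64 + (__b01 + __b10) * 2^32 + __b00
  // where each partial product fits in 64 bits. The __mid1/__mid2 sums below
  // fold the carries from the middle terms into the high half.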
  const uint32_t __aLo = static_cast<uint32_t>(__a);
  const uint32_t __aHi = static_cast<uint32_t>(__a >> 32);
  const uint32_t __bLo = static_cast<uint32_t>(__b);
  const uint32_t __bHi = static_cast<uint32_t>(__b >> 32);

  const uint64_t __b00 = static_cast<uint64_t>(__aLo) * __bLo;
  const uint64_t __b01 = static_cast<uint64_t>(__aLo) * __bHi;
  const uint64_t __b10 = static_cast<uint64_t>(__aHi) * __bLo;
  const uint64_t __b11 = static_cast<uint64_t>(__aHi) * __bHi;

  const uint32_t __b00Lo = static_cast<uint32_t>(__b00);
  const uint32_t __b00Hi = static_cast<uint32_t>(__b00 >> 32);

  const uint64_t __mid1 = __b10 + __b00Hi;
  const uint32_t __mid1Lo = static_cast<uint32_t>(__mid1);
  const uint32_t __mid1Hi = static_cast<uint32_t>(__mid1 >> 32);

  const uint64_t __mid2 = __b01 + __mid1Lo;
  const uint32_t __mid2Lo = static_cast<uint32_t>(__mid2);
  const uint32_t __mid2Hi = static_cast<uint32_t>(__mid2 >> 32);

  const uint64_t __pHi = __b11 + __mid1Hi + __mid2Hi;
  const uint64_t __pLo = (static_cast<uint64_t>(__mid2Lo) << 32) | __b00Lo;

  *__productHi = __pHi;
  return __pLo;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __ryu_shiftright128(const uint64_t __lo, const uint64_t __hi, const uint32_t __dist) {
  // We don't need to handle the case __dist >= 64 here (see above).
  _LIBCPP_ASSERT_UNCATEGORIZED(__dist < 64, "");
#ifdef _LIBCPP_64_BIT
  _LIBCPP_ASSERT_UNCATEGORIZED(__dist > 0, "");
  return (__hi << (64 - __dist)) | (__lo >> __dist);
#else // ^^^ 64-bit ^^^ / vvv 32-bit vvv
  // Avoid a 64-bit shift by taking advantage of the range of shift values.
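  // Illustrative note: since 32 <= __dist < 64, the low 32 bits of __lo are
  // shifted out entirely, so __lo >> __dist equals the high 32 bits of __lo
  // shifted right by (__dist - 32), which needs only 32-bit shifts.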
  _LIBCPP_ASSERT_UNCATEGORIZED(__dist >= 32, "");
  return (__hi << (64 - __dist)) | (static_cast<uint32_t>(__lo >> 32) >> (__dist - 32));
#endif // ^^^ 32-bit ^^^
}

#endif // ^^^ intrinsics unavailable ^^^

#ifndef _LIBCPP_64_BIT

// Returns the high 64 bits of the 128-bit product of __a and __b.
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __umulh(const uint64_t __a, const uint64_t __b) {
  // Reuse the __ryu_umul128 implementation.
  // Optimizers will likely eliminate the instructions used to compute the
  // low part of the product.
  uint64_t __hi;
  (void) __ryu_umul128(__a, __b, &__hi);
  return __hi;
}

// On 32-bit platforms, compilers typically generate calls to library
// functions for 64-bit divisions, even if the divisor is a constant.

// TRANSITION, LLVM-37932

// The functions here perform division-by-constant using multiplications
// in the same way as 64-bit compilers would do.

// The multipliers and shift values are the ones generated by clang x64
// for expressions like x/5, x/10, etc.
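
// Worked example of the multiply-shift scheme used below: for division by 10,
// the multiplier 0xCCCCCCCCCCCCCCCD equals (2^66 + 1) / 5, so
//   __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 3 == floor(__x * (2^66 + 1) / (5 * 2^67)),
// which equals __x / 10 for every uint64_t __x (the error term __x / (5 * 2^67)
// stays below 1/40). For instance, __div10(12345) == 1234.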

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
  return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 2;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
  return __umulh(__x, 0xCCCCCCCCCCCCCCCDu) >> 3;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
  return __umulh(__x >> 2, 0x28F5C28F5C28F5C3u) >> 2;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
  return __umulh(__x, 0xABCC77118461CEFDu) >> 26;
}
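
// Note on __div1e9 below: 10^9 == 2^9 * 5^9, so shifting __x right by 9 first
// removes the power-of-two factor; the remaining multiply-shift divides the
// (at most 55-bit) result by 5^9 == 1953125, with 0x44B82FA09B5A53 == ceil(2^75 / 5^9).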
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
  return __umulh(__x >> 9, 0x44B82FA09B5A53u) >> 11;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
  // Avoid 64-bit math as much as possible.
  // Returning static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x)) would
  // perform 32x64-bit multiplication and 64-bit subtraction.
  // __x and 1000000000 * __div1e9(__x) are guaranteed to differ by
  // less than 10^9, so their highest 32 bits must be identical,
  // so we can truncate both sides to uint32_t before subtracting.
  // We can also simplify static_cast<uint32_t>(1000000000 * __div1e9(__x)).
  // We can truncate before multiplying instead of after, as multiplying
  // the highest 32 bits of __div1e9(__x) can't affect the lowest 32 bits.
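  // Worked example (arithmetic modulo 2^32 on uint32_t): for
  // __x == 1234567890123456789, __div1e9(__x) == 1234567890 and the
  // expression below yields 123456789, matching __x % 1000000000.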
  return static_cast<uint32_t>(__x) - 1000000000 * static_cast<uint32_t>(__div1e9(__x));
}

#else // ^^^ 32-bit ^^^ / vvv 64-bit vvv

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div5(const uint64_t __x) {
  return __x / 5;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div10(const uint64_t __x) {
  return __x / 10;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div100(const uint64_t __x) {
  return __x / 100;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e8(const uint64_t __x) {
  return __x / 100000000;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint64_t __div1e9(const uint64_t __x) {
  return __x / 1000000000;
}

[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __mod1e9(const uint64_t __x) {
  return static_cast<uint32_t>(__x - 1000000000 * __div1e9(__x));
}

#endif // ^^^ 64-bit ^^^
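
// Returns the number of factors of 5 in __value (which must be non-zero);
// e.g. __pow5Factor(4000) == 3 because 4000 == 2^5 * 5^3.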
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline uint32_t __pow5Factor(uint64_t __value) {
  uint32_t __count = 0;
  for (;;) {
    _LIBCPP_ASSERT_UNCATEGORIZED(__value != 0, "");
    const uint64_t __q = __div5(__value);
    const uint32_t __r = static_cast<uint32_t>(__value) - 5 * static_cast<uint32_t>(__q);
    if (__r != 0) {
      break;
    }
    __value = __q;
    ++__count;
  }
  return __count;
}

// Returns true if __value is divisible by 5^__p.
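// For example, __multipleOfPowerOf5(750, 2) is true (750 == 2 * 3 * 5^3),
// while __multipleOfPowerOf5(750, 4) is false.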
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf5(const uint64_t __value, const uint32_t __p) {
  // I tried a case distinction on __p, but there was no performance difference.
  return __pow5Factor(__value) >= __p;
}

// Returns true if __value is divisible by 2^__p.
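// The check below masks off the low __p bits: __value is divisible by 2^__p
// exactly when those bits are all zero. For example,
// __multipleOfPowerOf2(96, 5) is true (96 == 2^5 * 3) and
// __multipleOfPowerOf2(96, 6) is false.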
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline bool __multipleOfPowerOf2(const uint64_t __value, const uint32_t __p) {
  _LIBCPP_ASSERT_UNCATEGORIZED(__value != 0, "");
  _LIBCPP_ASSERT_UNCATEGORIZED(__p < 64, "");
  // __builtin_ctzll doesn't appear to be faster here.
  return (__value & ((1ull << __p) - 1)) == 0;
}

_LIBCPP_END_NAMESPACE_STD

// clang-format on

#endif // _LIBCPP_SRC_INCLUDE_RYU_DS2_INTRINSICS_H