[AMDGPU][True16][MC] update VOP1 dasm test with latest script (#120281)
[llvm-project.git] / compiler-rt / lib / builtins / udivmodti4.c
blob55def37c9e1fee9bc3e9be93fb5889bbe0481188
1 //===-- udivmodti4.c - Implement __udivmodti4 -----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements __udivmodti4 for the compiler_rt library.
11 //===----------------------------------------------------------------------===//
13 #include "int_lib.h"
15 #ifdef CRT_HAS_128BIT
17 // Returns the 128 bit division result by 64 bit. Result must fit in 64 bits.
18 // Remainder stored in r.
19 // Taken and adjusted from libdivide libdivide_128_div_64_to_64 division
20 // fallback. For a correctness proof see the reference for this algorithm
21 // in Knuth, Volume 2, section 4.3.1, Algorithm D.
22 UNUSED
23 static inline du_int udiv128by64to64default(du_int u1, du_int u0, du_int v,
24 du_int *r) {
25 const unsigned n_udword_bits = sizeof(du_int) * CHAR_BIT;
26 const du_int b = (1ULL << (n_udword_bits / 2)); // Number base (32 bits)
27 du_int un1, un0; // Norm. dividend LSD's
28 du_int vn1, vn0; // Norm. divisor digits
29 du_int q1, q0; // Quotient digits
30 du_int un64, un21, un10; // Dividend digit pairs
31 du_int rhat; // A remainder
32 si_int s; // Shift amount for normalization
34 s = __builtin_clzll(v);
35 if (s > 0) {
36 // Normalize the divisor.
37 v = v << s;
38 un64 = (u1 << s) | (u0 >> (n_udword_bits - s));
39 un10 = u0 << s; // Shift dividend left
40 } else {
41 // Avoid undefined behavior of (u0 >> 64).
42 un64 = u1;
43 un10 = u0;
46 // Break divisor up into two 32-bit digits.
47 vn1 = v >> (n_udword_bits / 2);
48 vn0 = v & 0xFFFFFFFF;
50 // Break right half of dividend into two digits.
51 un1 = un10 >> (n_udword_bits / 2);
52 un0 = un10 & 0xFFFFFFFF;
54 // Compute the first quotient digit, q1.
55 q1 = un64 / vn1;
56 rhat = un64 - q1 * vn1;
58 // q1 has at most error 2. No more than 2 iterations.
59 while (q1 >= b || q1 * vn0 > b * rhat + un1) {
60 q1 = q1 - 1;
61 rhat = rhat + vn1;
62 if (rhat >= b)
63 break;
66 un21 = un64 * b + un1 - q1 * v;
68 // Compute the second quotient digit.
69 q0 = un21 / vn1;
70 rhat = un21 - q0 * vn1;
72 // q0 has at most error 2. No more than 2 iterations.
73 while (q0 >= b || q0 * vn0 > b * rhat + un0) {
74 q0 = q0 - 1;
75 rhat = rhat + vn1;
76 if (rhat >= b)
77 break;
80 *r = (un21 * b + un0 - q0 * v) >> s;
81 return q1 * b + q0;
84 static inline du_int udiv128by64to64(du_int u1, du_int u0, du_int v,
85 du_int *r) {
86 #if defined(__x86_64__)
87 du_int result;
88 __asm__("divq %[v]"
89 : "=a"(result), "=d"(*r)
90 : [ v ] "r"(v), "a"(u0), "d"(u1));
91 return result;
92 #else
93 return udiv128by64to64default(u1, u0, v, r);
94 #endif
97 // Effects: if rem != 0, *rem = a % b
98 // Returns: a / b
100 COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem) {
101 const unsigned n_utword_bits = sizeof(tu_int) * CHAR_BIT;
102 utwords dividend;
103 dividend.all = a;
104 utwords divisor;
105 divisor.all = b;
106 utwords quotient;
107 utwords remainder;
108 if (divisor.all > dividend.all) {
109 if (rem)
110 *rem = dividend.all;
111 return 0;
113 // When the divisor fits in 64 bits, we can use an optimized path.
114 if (divisor.s.high == 0) {
115 remainder.s.high = 0;
116 if (dividend.s.high < divisor.s.low) {
117 // The result fits in 64 bits.
118 quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
119 divisor.s.low, &remainder.s.low);
120 quotient.s.high = 0;
121 } else {
122 // First, divide with the high part to get the remainder in dividend.s.high.
123 // After that dividend.s.high < divisor.s.low.
124 quotient.s.high = dividend.s.high / divisor.s.low;
125 dividend.s.high = dividend.s.high % divisor.s.low;
126 quotient.s.low = udiv128by64to64(dividend.s.high, dividend.s.low,
127 divisor.s.low, &remainder.s.low);
129 if (rem)
130 *rem = remainder.all;
131 return quotient.all;
133 // 0 <= shift <= 63.
134 si_int shift =
135 __builtin_clzll(divisor.s.high) - __builtin_clzll(dividend.s.high);
136 divisor.all <<= shift;
137 quotient.s.high = 0;
138 quotient.s.low = 0;
139 for (; shift >= 0; --shift) {
140 quotient.s.low <<= 1;
141 // Branch free version of.
142 // if (dividend.all >= divisor.all)
143 // {
144 // dividend.all -= divisor.all;
145 // carry = 1;
146 // }
147 const ti_int s =
148 (ti_int)(divisor.all - dividend.all - 1) >> (n_utword_bits - 1);
149 quotient.s.low |= s & 1;
150 dividend.all -= divisor.all & s;
151 divisor.all >>= 1;
153 if (rem)
154 *rem = dividend.all;
155 return quotient.all;
158 #endif // CRT_HAS_128BIT