1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s
3 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s
4 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s
5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s
6 ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %}
7 ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %}
8 ; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
10 target triple = "nvptx64-nvidia-cuda"
; Constant bf16 array in addrspace(1). The assertion below expects the
; initializer to be printed as raw bytes, low byte of each element first
; (0xR0201 -> 1, 2). It uses the common CHECK prefix so that every FileCheck
; RUN configuration verifies it; a prefix that no RUN line enables would leave
; the line silently unchecked.
12 ; CHECK: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
13 @"bfloat_array" = addrspace(1) constant [4 x bfloat]
14 [bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]
; Scalar bfloat fadd of the two parameters.
; Expected lowering, per the assertions below:
;   sm_70: each operand is widened by shifting its 16 bits into the top half
;          of a 32-bit register, the add is done as add.rn.f32, and the result
;          is narrowed with integer ops (bfe bit 16, add 32767, and for NaN
;          inputs an OR with 4194304 selected via setp.nan/selp) before the
;          upper 16 bits are stored.
;   sm_80: cvt.f32.bf16 on both operands, add.rn.f32, then cvt.rn.bf16.f32.
;   sm_80 with -denormal-fp-math-f32=preserve-sign: same shape, with .ftz on
;          the widening converts and on the add.
;   sm_90: a single native add.rn.bf16.
16 define bfloat @test_fadd(bfloat %0, bfloat %1) {
17 ; SM70-LABEL: test_fadd(
19 ; SM70-NEXT: .reg .pred %p<2>;
20 ; SM70-NEXT: .reg .b16 %rs<3>;
21 ; SM70-NEXT: .reg .b32 %r<11>;
22 ; SM70-NEXT: .reg .f32 %f<4>;
24 ; SM70-NEXT: // %bb.0:
25 ; SM70-NEXT: ld.param.u16 %r1, [test_fadd_param_1];
26 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
27 ; SM70-NEXT: mov.b32 %f1, %r2;
28 ; SM70-NEXT: ld.param.u16 %r3, [test_fadd_param_0];
29 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
30 ; SM70-NEXT: mov.b32 %f2, %r4;
31 ; SM70-NEXT: add.rn.f32 %f3, %f2, %f1;
32 ; SM70-NEXT: mov.b32 %r5, %f3;
33 ; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1;
34 ; SM70-NEXT: add.s32 %r7, %r6, %r5;
35 ; SM70-NEXT: add.s32 %r8, %r7, 32767;
36 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
37 ; SM70-NEXT: or.b32 %r9, %r5, 4194304;
38 ; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
39 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
40 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
43 ; SM80-LABEL: test_fadd(
45 ; SM80-NEXT: .reg .b16 %rs<4>;
46 ; SM80-NEXT: .reg .f32 %f<4>;
48 ; SM80-NEXT: // %bb.0:
49 ; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
50 ; SM80-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
51 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
52 ; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
53 ; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
54 ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
55 ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
58 ; SM80-FTZ-LABEL: test_fadd(
60 ; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
61 ; SM80-FTZ-NEXT: .reg .f32 %f<4>;
63 ; SM80-FTZ-NEXT: // %bb.0:
64 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
65 ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
66 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
67 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1;
68 ; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
69 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
70 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
73 ; SM90-LABEL: test_fadd(
75 ; SM90-NEXT: .reg .b16 %rs<4>;
77 ; SM90-NEXT: // %bb.0:
78 ; SM90-NEXT: ld.param.b16 %rs1, [test_fadd_param_0];
79 ; SM90-NEXT: ld.param.b16 %rs2, [test_fadd_param_1];
80 ; SM90-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
81 ; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
83 %3 = fadd bfloat %0, %1
; Scalar bfloat fsub (param 0 minus param 1).
; Expected lowering, per the assertions below:
;   sm_70: operands widened via shl-by-16 into f32, sub.rn.f32, then the same
;          integer narrowing sequence used for the other sm_70 bf16 results.
;   sm_80: cvt.f32.bf16 / sub.rn.f32 / cvt.rn.bf16.f32 (with .ftz on the
;          widening converts and the subtract in the preserve-sign run).
;   sm_90: a single native sub.rn.bf16.
87 define bfloat @test_fsub(bfloat %0, bfloat %1) {
88 ; SM70-LABEL: test_fsub(
90 ; SM70-NEXT: .reg .pred %p<2>;
91 ; SM70-NEXT: .reg .b16 %rs<3>;
92 ; SM70-NEXT: .reg .b32 %r<11>;
93 ; SM70-NEXT: .reg .f32 %f<4>;
95 ; SM70-NEXT: // %bb.0:
96 ; SM70-NEXT: ld.param.u16 %r1, [test_fsub_param_1];
97 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
98 ; SM70-NEXT: mov.b32 %f1, %r2;
99 ; SM70-NEXT: ld.param.u16 %r3, [test_fsub_param_0];
100 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
101 ; SM70-NEXT: mov.b32 %f2, %r4;
102 ; SM70-NEXT: sub.rn.f32 %f3, %f2, %f1;
103 ; SM70-NEXT: mov.b32 %r5, %f3;
104 ; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1;
105 ; SM70-NEXT: add.s32 %r7, %r6, %r5;
106 ; SM70-NEXT: add.s32 %r8, %r7, 32767;
107 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
108 ; SM70-NEXT: or.b32 %r9, %r5, 4194304;
109 ; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
110 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
111 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
114 ; SM80-LABEL: test_fsub(
116 ; SM80-NEXT: .reg .b16 %rs<4>;
117 ; SM80-NEXT: .reg .f32 %f<4>;
119 ; SM80-NEXT: // %bb.0:
120 ; SM80-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
121 ; SM80-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
122 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
123 ; SM80-NEXT: cvt.f32.bf16 %f2, %rs1;
124 ; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
125 ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
126 ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
129 ; SM80-FTZ-LABEL: test_fsub(
131 ; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
132 ; SM80-FTZ-NEXT: .reg .f32 %f<4>;
134 ; SM80-FTZ-NEXT: // %bb.0:
135 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
136 ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
137 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
138 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs1;
139 ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
140 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f3;
141 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
142 ; SM80-FTZ-NEXT: ret;
144 ; SM90-LABEL: test_fsub(
146 ; SM90-NEXT: .reg .b16 %rs<4>;
148 ; SM90-NEXT: // %bb.0:
149 ; SM90-NEXT: ld.param.b16 %rs1, [test_fsub_param_0];
150 ; SM90-NEXT: ld.param.b16 %rs2, [test_fsub_param_1];
151 ; SM90-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2;
152 ; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
154 %3 = fsub bfloat %0, %1
; Vector <2 x bfloat> fadd.
; Expected lowering, per the assertions below:
;   sm_70: each 32-bit parameter is split into two 16-bit lanes; every lane is
;          widened to f32 (cvt.u32.u16 + shl 16), added with add.rn.f32,
;          narrowed with the integer sequence, and the two results are packed
;          back into one 32-bit register.
;   sm_80: lanes are split, converted with cvt.f32.bf16, added in f32,
;          converted back with cvt.rn.bf16.f32 and repacked (.ftz variants in
;          the preserve-sign run).
;   sm_90: one packed add.rn.bf16x2 on the 32-bit registers, no unpacking.
158 define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
159 ; SM70-LABEL: test_faddx2(
161 ; SM70-NEXT: .reg .pred %p<3>;
162 ; SM70-NEXT: .reg .b16 %rs<13>;
163 ; SM70-NEXT: .reg .b32 %r<24>;
164 ; SM70-NEXT: .reg .f32 %f<7>;
166 ; SM70-NEXT: // %bb.0:
167 ; SM70-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
168 ; SM70-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
169 ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
170 ; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
171 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
172 ; SM70-NEXT: mov.b32 %f1, %r4;
173 ; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1;
174 ; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
175 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
176 ; SM70-NEXT: mov.b32 %f2, %r6;
177 ; SM70-NEXT: add.rn.f32 %f3, %f2, %f1;
178 ; SM70-NEXT: mov.b32 %r7, %f3;
179 ; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
180 ; SM70-NEXT: add.s32 %r9, %r8, %r7;
181 ; SM70-NEXT: add.s32 %r10, %r9, 32767;
182 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
183 ; SM70-NEXT: or.b32 %r11, %r7, 4194304;
184 ; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
185 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
186 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
187 ; SM70-NEXT: shl.b32 %r14, %r13, 16;
188 ; SM70-NEXT: mov.b32 %f4, %r14;
189 ; SM70-NEXT: cvt.u32.u16 %r15, %rs4;
190 ; SM70-NEXT: shl.b32 %r16, %r15, 16;
191 ; SM70-NEXT: mov.b32 %f5, %r16;
192 ; SM70-NEXT: add.rn.f32 %f6, %f5, %f4;
193 ; SM70-NEXT: mov.b32 %r17, %f6;
194 ; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
195 ; SM70-NEXT: add.s32 %r19, %r18, %r17;
196 ; SM70-NEXT: add.s32 %r20, %r19, 32767;
197 ; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
198 ; SM70-NEXT: or.b32 %r21, %r17, 4194304;
199 ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
200 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
201 ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
202 ; SM70-NEXT: st.param.b32 [func_retval0], %r23;
205 ; SM80-LABEL: test_faddx2(
207 ; SM80-NEXT: .reg .b16 %rs<7>;
208 ; SM80-NEXT: .reg .b32 %r<4>;
209 ; SM80-NEXT: .reg .f32 %f<7>;
211 ; SM80-NEXT: // %bb.0:
212 ; SM80-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
213 ; SM80-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
214 ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
215 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
216 ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
217 ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
218 ; SM80-NEXT: add.rn.f32 %f3, %f2, %f1;
219 ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
220 ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
221 ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
222 ; SM80-NEXT: add.rn.f32 %f6, %f5, %f4;
223 ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
224 ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
225 ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
228 ; SM80-FTZ-LABEL: test_faddx2(
230 ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
231 ; SM80-FTZ-NEXT: .reg .b32 %r<4>;
232 ; SM80-FTZ-NEXT: .reg .f32 %f<7>;
234 ; SM80-FTZ-NEXT: // %bb.0:
235 ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0];
236 ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1];
237 ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
238 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
239 ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
240 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
241 ; SM80-FTZ-NEXT: add.rn.ftz.f32 %f3, %f2, %f1;
242 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
243 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
244 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
245 ; SM80-FTZ-NEXT: add.rn.ftz.f32 %f6, %f5, %f4;
246 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
247 ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
248 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
249 ; SM80-FTZ-NEXT: ret;
251 ; SM90-LABEL: test_faddx2(
253 ; SM90-NEXT: .reg .b32 %r<4>;
255 ; SM90-NEXT: // %bb.0:
256 ; SM90-NEXT: ld.param.b32 %r1, [test_faddx2_param_1];
257 ; SM90-NEXT: ld.param.b32 %r2, [test_faddx2_param_0];
258 ; SM90-NEXT: add.rn.bf16x2 %r3, %r2, %r1;
259 ; SM90-NEXT: st.param.b32 [func_retval0], %r3;
261 %r = fadd <2 x bfloat> %a, %b
; Vector <2 x bfloat> fsub (%a minus %b).
; Expected lowering, per the assertions below:
;   sm_70: per-lane widening to f32 via shift, sub.rn.f32, integer narrowing,
;          then repacking of the two 16-bit results.
;   sm_80: per-lane cvt.f32.bf16 / sub.rn.f32 / cvt.rn.bf16.f32 and repack
;          (.ftz variants in the preserve-sign run).
;   sm_90: one packed sub.rn.bf16x2.
265 define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
266 ; SM70-LABEL: test_fsubx2(
268 ; SM70-NEXT: .reg .pred %p<3>;
269 ; SM70-NEXT: .reg .b16 %rs<13>;
270 ; SM70-NEXT: .reg .b32 %r<24>;
271 ; SM70-NEXT: .reg .f32 %f<7>;
273 ; SM70-NEXT: // %bb.0:
274 ; SM70-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
275 ; SM70-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
276 ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
277 ; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
278 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
279 ; SM70-NEXT: mov.b32 %f1, %r4;
280 ; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1;
281 ; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
282 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
283 ; SM70-NEXT: mov.b32 %f2, %r6;
284 ; SM70-NEXT: sub.rn.f32 %f3, %f2, %f1;
285 ; SM70-NEXT: mov.b32 %r7, %f3;
286 ; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
287 ; SM70-NEXT: add.s32 %r9, %r8, %r7;
288 ; SM70-NEXT: add.s32 %r10, %r9, 32767;
289 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
290 ; SM70-NEXT: or.b32 %r11, %r7, 4194304;
291 ; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
292 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
293 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
294 ; SM70-NEXT: shl.b32 %r14, %r13, 16;
295 ; SM70-NEXT: mov.b32 %f4, %r14;
296 ; SM70-NEXT: cvt.u32.u16 %r15, %rs4;
297 ; SM70-NEXT: shl.b32 %r16, %r15, 16;
298 ; SM70-NEXT: mov.b32 %f5, %r16;
299 ; SM70-NEXT: sub.rn.f32 %f6, %f5, %f4;
300 ; SM70-NEXT: mov.b32 %r17, %f6;
301 ; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
302 ; SM70-NEXT: add.s32 %r19, %r18, %r17;
303 ; SM70-NEXT: add.s32 %r20, %r19, 32767;
304 ; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
305 ; SM70-NEXT: or.b32 %r21, %r17, 4194304;
306 ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
307 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
308 ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
309 ; SM70-NEXT: st.param.b32 [func_retval0], %r23;
312 ; SM80-LABEL: test_fsubx2(
314 ; SM80-NEXT: .reg .b16 %rs<7>;
315 ; SM80-NEXT: .reg .b32 %r<4>;
316 ; SM80-NEXT: .reg .f32 %f<7>;
318 ; SM80-NEXT: // %bb.0:
319 ; SM80-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
320 ; SM80-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
321 ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
322 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
323 ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
324 ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
325 ; SM80-NEXT: sub.rn.f32 %f3, %f2, %f1;
326 ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
327 ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
328 ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
329 ; SM80-NEXT: sub.rn.f32 %f6, %f5, %f4;
330 ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
331 ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
332 ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
335 ; SM80-FTZ-LABEL: test_fsubx2(
337 ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
338 ; SM80-FTZ-NEXT: .reg .b32 %r<4>;
339 ; SM80-FTZ-NEXT: .reg .f32 %f<7>;
341 ; SM80-FTZ-NEXT: // %bb.0:
342 ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0];
343 ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1];
344 ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
345 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
346 ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
347 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
348 ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f3, %f2, %f1;
349 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
350 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
351 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
352 ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %f6, %f5, %f4;
353 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
354 ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
355 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
356 ; SM80-FTZ-NEXT: ret;
358 ; SM90-LABEL: test_fsubx2(
360 ; SM90-NEXT: .reg .b32 %r<4>;
362 ; SM90-NEXT: // %bb.0:
363 ; SM90-NEXT: ld.param.b32 %r1, [test_fsubx2_param_1];
364 ; SM90-NEXT: ld.param.b32 %r2, [test_fsubx2_param_0];
365 ; SM90-NEXT: sub.rn.bf16x2 %r3, %r2, %r1;
366 ; SM90-NEXT: st.param.b32 [func_retval0], %r3;
368 %r = fsub <2 x bfloat> %a, %b
; Vector <2 x bfloat> fmul.
; Expected lowering, per the assertions below:
;   sm_70: per-lane widening to f32 via shift, mul.rn.f32, integer narrowing,
;          then repacking of the two 16-bit results.
;   sm_80: per-lane cvt.f32.bf16 / mul.rn.f32 / cvt.rn.bf16.f32 and repack
;          (.ftz variants in the preserve-sign run).
;   sm_90: one packed mul.rn.bf16x2.
372 define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
373 ; SM70-LABEL: test_fmulx2(
375 ; SM70-NEXT: .reg .pred %p<3>;
376 ; SM70-NEXT: .reg .b16 %rs<13>;
377 ; SM70-NEXT: .reg .b32 %r<24>;
378 ; SM70-NEXT: .reg .f32 %f<7>;
380 ; SM70-NEXT: // %bb.0:
381 ; SM70-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
382 ; SM70-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
383 ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
384 ; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
385 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
386 ; SM70-NEXT: mov.b32 %f1, %r4;
387 ; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1;
388 ; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
389 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
390 ; SM70-NEXT: mov.b32 %f2, %r6;
391 ; SM70-NEXT: mul.rn.f32 %f3, %f2, %f1;
392 ; SM70-NEXT: mov.b32 %r7, %f3;
393 ; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
394 ; SM70-NEXT: add.s32 %r9, %r8, %r7;
395 ; SM70-NEXT: add.s32 %r10, %r9, 32767;
396 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
397 ; SM70-NEXT: or.b32 %r11, %r7, 4194304;
398 ; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
399 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
400 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
401 ; SM70-NEXT: shl.b32 %r14, %r13, 16;
402 ; SM70-NEXT: mov.b32 %f4, %r14;
403 ; SM70-NEXT: cvt.u32.u16 %r15, %rs4;
404 ; SM70-NEXT: shl.b32 %r16, %r15, 16;
405 ; SM70-NEXT: mov.b32 %f5, %r16;
406 ; SM70-NEXT: mul.rn.f32 %f6, %f5, %f4;
407 ; SM70-NEXT: mov.b32 %r17, %f6;
408 ; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
409 ; SM70-NEXT: add.s32 %r19, %r18, %r17;
410 ; SM70-NEXT: add.s32 %r20, %r19, 32767;
411 ; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
412 ; SM70-NEXT: or.b32 %r21, %r17, 4194304;
413 ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
414 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
415 ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
416 ; SM70-NEXT: st.param.b32 [func_retval0], %r23;
419 ; SM80-LABEL: test_fmulx2(
421 ; SM80-NEXT: .reg .b16 %rs<7>;
422 ; SM80-NEXT: .reg .b32 %r<4>;
423 ; SM80-NEXT: .reg .f32 %f<7>;
425 ; SM80-NEXT: // %bb.0:
426 ; SM80-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
427 ; SM80-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
428 ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
429 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
430 ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
431 ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
432 ; SM80-NEXT: mul.rn.f32 %f3, %f2, %f1;
433 ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
434 ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
435 ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
436 ; SM80-NEXT: mul.rn.f32 %f6, %f5, %f4;
437 ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
438 ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
439 ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
442 ; SM80-FTZ-LABEL: test_fmulx2(
444 ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
445 ; SM80-FTZ-NEXT: .reg .b32 %r<4>;
446 ; SM80-FTZ-NEXT: .reg .f32 %f<7>;
448 ; SM80-FTZ-NEXT: // %bb.0:
449 ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0];
450 ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1];
451 ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
452 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
453 ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
454 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
455 ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f3, %f2, %f1;
456 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
457 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
458 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
459 ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %f6, %f5, %f4;
460 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
461 ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
462 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
463 ; SM80-FTZ-NEXT: ret;
465 ; SM90-LABEL: test_fmulx2(
467 ; SM90-NEXT: .reg .b32 %r<4>;
469 ; SM90-NEXT: // %bb.0:
470 ; SM90-NEXT: ld.param.b32 %r1, [test_fmulx2_param_1];
471 ; SM90-NEXT: ld.param.b32 %r2, [test_fmulx2_param_0];
472 ; SM90-NEXT: mul.rn.bf16x2 %r3, %r2, %r1;
473 ; SM90-NEXT: st.param.b32 [func_retval0], %r3;
475 %r = fmul <2 x bfloat> %a, %b
; Vector <2 x bfloat> fdiv (%a divided by %b).
; Unlike the add/sub/mul vector tests, the assertions below expect every
; configuration, sm_90 included, to divide in f32:
;   sm_70: per-lane widening via shift, div.rn.f32, integer narrowing, repack.
;   sm_80 and sm_90: per-lane cvt.f32.bf16 / div.rn.f32 / cvt.rn.bf16.f32 and
;          repack (.ftz variants in the sm_80 preserve-sign run).
479 define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
480 ; SM70-LABEL: test_fdiv(
482 ; SM70-NEXT: .reg .pred %p<3>;
483 ; SM70-NEXT: .reg .b16 %rs<13>;
484 ; SM70-NEXT: .reg .b32 %r<24>;
485 ; SM70-NEXT: .reg .f32 %f<7>;
487 ; SM70-NEXT: // %bb.0:
488 ; SM70-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
489 ; SM70-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
490 ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
491 ; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
492 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
493 ; SM70-NEXT: mov.b32 %f1, %r4;
494 ; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1;
495 ; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
496 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
497 ; SM70-NEXT: mov.b32 %f2, %r6;
498 ; SM70-NEXT: div.rn.f32 %f3, %f2, %f1;
499 ; SM70-NEXT: mov.b32 %r7, %f3;
500 ; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
501 ; SM70-NEXT: add.s32 %r9, %r8, %r7;
502 ; SM70-NEXT: add.s32 %r10, %r9, 32767;
503 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
504 ; SM70-NEXT: or.b32 %r11, %r7, 4194304;
505 ; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
506 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
507 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
508 ; SM70-NEXT: shl.b32 %r14, %r13, 16;
509 ; SM70-NEXT: mov.b32 %f4, %r14;
510 ; SM70-NEXT: cvt.u32.u16 %r15, %rs4;
511 ; SM70-NEXT: shl.b32 %r16, %r15, 16;
512 ; SM70-NEXT: mov.b32 %f5, %r16;
513 ; SM70-NEXT: div.rn.f32 %f6, %f5, %f4;
514 ; SM70-NEXT: mov.b32 %r17, %f6;
515 ; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
516 ; SM70-NEXT: add.s32 %r19, %r18, %r17;
517 ; SM70-NEXT: add.s32 %r20, %r19, 32767;
518 ; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
519 ; SM70-NEXT: or.b32 %r21, %r17, 4194304;
520 ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
521 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
522 ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
523 ; SM70-NEXT: st.param.b32 [func_retval0], %r23;
526 ; SM80-LABEL: test_fdiv(
528 ; SM80-NEXT: .reg .b16 %rs<7>;
529 ; SM80-NEXT: .reg .b32 %r<4>;
530 ; SM80-NEXT: .reg .f32 %f<7>;
532 ; SM80-NEXT: // %bb.0:
533 ; SM80-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
534 ; SM80-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
535 ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
536 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs2;
537 ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r1;
538 ; SM80-NEXT: cvt.f32.bf16 %f2, %rs4;
539 ; SM80-NEXT: div.rn.f32 %f3, %f2, %f1;
540 ; SM80-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
541 ; SM80-NEXT: cvt.f32.bf16 %f4, %rs1;
542 ; SM80-NEXT: cvt.f32.bf16 %f5, %rs3;
543 ; SM80-NEXT: div.rn.f32 %f6, %f5, %f4;
544 ; SM80-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
545 ; SM80-NEXT: mov.b32 %r3, {%rs6, %rs5};
546 ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
549 ; SM80-FTZ-LABEL: test_fdiv(
551 ; SM80-FTZ-NEXT: .reg .b16 %rs<7>;
552 ; SM80-FTZ-NEXT: .reg .b32 %r<4>;
553 ; SM80-FTZ-NEXT: .reg .f32 %f<7>;
555 ; SM80-FTZ-NEXT: // %bb.0:
556 ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
557 ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
558 ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r2;
559 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs2;
560 ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r1;
561 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs4;
562 ; SM80-FTZ-NEXT: div.rn.ftz.f32 %f3, %f2, %f1;
563 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
564 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs1;
565 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs3;
566 ; SM80-FTZ-NEXT: div.rn.ftz.f32 %f6, %f5, %f4;
567 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
568 ; SM80-FTZ-NEXT: mov.b32 %r3, {%rs6, %rs5};
569 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
570 ; SM80-FTZ-NEXT: ret;
572 ; SM90-LABEL: test_fdiv(
574 ; SM90-NEXT: .reg .b16 %rs<7>;
575 ; SM90-NEXT: .reg .b32 %r<4>;
576 ; SM90-NEXT: .reg .f32 %f<7>;
578 ; SM90-NEXT: // %bb.0:
579 ; SM90-NEXT: ld.param.b32 %r1, [test_fdiv_param_0];
580 ; SM90-NEXT: ld.param.b32 %r2, [test_fdiv_param_1];
581 ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r2;
582 ; SM90-NEXT: cvt.f32.bf16 %f1, %rs2;
583 ; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r1;
584 ; SM90-NEXT: cvt.f32.bf16 %f2, %rs4;
585 ; SM90-NEXT: div.rn.f32 %f3, %f2, %f1;
586 ; SM90-NEXT: cvt.rn.bf16.f32 %rs5, %f3;
587 ; SM90-NEXT: cvt.f32.bf16 %f4, %rs1;
588 ; SM90-NEXT: cvt.f32.bf16 %f5, %rs3;
589 ; SM90-NEXT: div.rn.f32 %f6, %f5, %f4;
590 ; SM90-NEXT: cvt.rn.bf16.f32 %rs6, %f6;
591 ; SM90-NEXT: mov.b32 %r3, {%rs6, %rs5};
592 ; SM90-NEXT: st.param.b32 [func_retval0], %r3;
594 %r = fdiv <2 x bfloat> %a, %b
; Extracts element 0 of a <2 x bfloat> parameter.
; The assertions are shared by all configurations: the element is expected to
; be read with a single 16-bit load from the start of the parameter, with no
; 32-bit load followed by an unpack.
598 define bfloat @test_extract_0(<2 x bfloat> %a) #0 {
599 ; CHECK-LABEL: test_extract_0(
601 ; CHECK-NEXT: .reg .b16 %rs<2>;
603 ; CHECK-NEXT: // %bb.0:
604 ; CHECK-NEXT: ld.param.b16 %rs1, [test_extract_0_param_0];
605 ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
607 %e = extractelement <2 x bfloat> %a, i32 0
; Extracts element 1 of a <2 x bfloat> parameter.
; The assertions are shared by all configurations: the element is expected to
; be read with a single 16-bit load at byte offset 2 of the parameter.
611 define bfloat @test_extract_1(<2 x bfloat> %a) #0 {
612 ; CHECK-LABEL: test_extract_1(
614 ; CHECK-NEXT: .reg .b16 %rs<2>;
616 ; CHECK-NEXT: // %bb.0:
617 ; CHECK-NEXT: ld.param.b16 %rs1, [test_extract_1_param_0+2];
618 ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
620 %e = extractelement <2 x bfloat> %a, i32 1
; Extends a bfloat parameter to float.
; Expected lowering, per the assertions below:
;   sm_70: zero-extending 16-bit load, shl by 16, and a bit-move into an f32
;          register (no convert instruction).
;   sm_80 and sm_90: cvt.f32.bf16.
;   sm_80 with -denormal-fp-math-f32=preserve-sign: cvt.ftz.f32.bf16.
624 define float @test_fpext_float(bfloat %a) #0 {
625 ; SM70-LABEL: test_fpext_float(
627 ; SM70-NEXT: .reg .b32 %r<3>;
628 ; SM70-NEXT: .reg .f32 %f<2>;
630 ; SM70-NEXT: // %bb.0:
631 ; SM70-NEXT: ld.param.u16 %r1, [test_fpext_float_param_0];
632 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
633 ; SM70-NEXT: mov.b32 %f1, %r2;
634 ; SM70-NEXT: st.param.f32 [func_retval0], %f1;
637 ; SM80-LABEL: test_fpext_float(
639 ; SM80-NEXT: .reg .b16 %rs<2>;
640 ; SM80-NEXT: .reg .f32 %f<2>;
642 ; SM80-NEXT: // %bb.0:
643 ; SM80-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
644 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
645 ; SM80-NEXT: st.param.f32 [func_retval0], %f1;
648 ; SM80-FTZ-LABEL: test_fpext_float(
650 ; SM80-FTZ-NEXT: .reg .b16 %rs<2>;
651 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
653 ; SM80-FTZ-NEXT: // %bb.0:
654 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
655 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
656 ; SM80-FTZ-NEXT: st.param.f32 [func_retval0], %f1;
657 ; SM80-FTZ-NEXT: ret;
659 ; SM90-LABEL: test_fpext_float(
661 ; SM90-NEXT: .reg .b16 %rs<2>;
662 ; SM90-NEXT: .reg .f32 %f<2>;
664 ; SM90-NEXT: // %bb.0:
665 ; SM90-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
666 ; SM90-NEXT: cvt.f32.bf16 %f1, %rs1;
667 ; SM90-NEXT: st.param.f32 [func_retval0], %f1;
669 %r = fpext bfloat %a to float
; Truncates a float parameter to bfloat.
; Expected lowering, per the assertions below:
;   sm_70: integer sequence on the f32 bit pattern: bit 16 is extracted and
;          added to the value together with 32767 (0x7FFF); when the input is
;          NaN (setp.nan) the pattern with bit 22 (4194304) ORed in is selected
;          instead; the upper 16 bits of the selected word are the result.
;   sm_80 and sm_90: a single cvt.rn.bf16.f32.
;   sm_80 with -denormal-fp-math-f32=preserve-sign: the same cvt.rn.bf16.f32;
;          the assertions show no .ftz modifier on this narrowing convert.
673 define bfloat @test_fptrunc_float(float %a) #0 {
674 ; SM70-LABEL: test_fptrunc_float(
676 ; SM70-NEXT: .reg .pred %p<2>;
677 ; SM70-NEXT: .reg .b16 %rs<3>;
678 ; SM70-NEXT: .reg .b32 %r<7>;
679 ; SM70-NEXT: .reg .f32 %f<2>;
681 ; SM70-NEXT: // %bb.0:
682 ; SM70-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
683 ; SM70-NEXT: mov.b32 %r1, %f1;
684 ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
685 ; SM70-NEXT: add.s32 %r3, %r2, %r1;
686 ; SM70-NEXT: add.s32 %r4, %r3, 32767;
687 ; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
688 ; SM70-NEXT: or.b32 %r5, %r1, 4194304;
689 ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
690 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
691 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
694 ; SM80-LABEL: test_fptrunc_float(
696 ; SM80-NEXT: .reg .b16 %rs<2>;
697 ; SM80-NEXT: .reg .f32 %f<2>;
699 ; SM80-NEXT: // %bb.0:
700 ; SM80-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
701 ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
702 ; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
705 ; SM80-FTZ-LABEL: test_fptrunc_float(
707 ; SM80-FTZ-NEXT: .reg .b16 %rs<2>;
708 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
710 ; SM80-FTZ-NEXT: // %bb.0:
711 ; SM80-FTZ-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
712 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
713 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
714 ; SM80-FTZ-NEXT: ret;
716 ; SM90-LABEL: test_fptrunc_float(
718 ; SM90-NEXT: .reg .b16 %rs<2>;
719 ; SM90-NEXT: .reg .f32 %f<2>;
721 ; SM90-NEXT: // %bb.0:
722 ; SM90-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
723 ; SM90-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
724 ; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
726 %r = fptrunc float %a to bfloat
; Scalar bfloat fadd with the constant 1.0.
; Expected lowering, per the assertions below:
;   sm_70 and sm_80: the add is done in f32 and the constant appears as the
;          f32 immediate 0f3F800000 directly on the add instruction.
;   sm_90: the constant is first moved into a 16-bit register as 0x3F80 and
;          then used as a register operand of the native add.rn.bf16.
730 define bfloat @test_fadd_imm_1(bfloat %a) #0 {
731 ; SM70-LABEL: test_fadd_imm_1(
733 ; SM70-NEXT: .reg .pred %p<2>;
734 ; SM70-NEXT: .reg .b16 %rs<3>;
735 ; SM70-NEXT: .reg .b32 %r<9>;
736 ; SM70-NEXT: .reg .f32 %f<3>;
738 ; SM70-NEXT: // %bb.0:
739 ; SM70-NEXT: ld.param.u16 %r1, [test_fadd_imm_1_param_0];
740 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
741 ; SM70-NEXT: mov.b32 %f1, %r2;
742 ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
743 ; SM70-NEXT: mov.b32 %r3, %f2;
744 ; SM70-NEXT: bfe.u32 %r4, %r3, 16, 1;
745 ; SM70-NEXT: add.s32 %r5, %r4, %r3;
746 ; SM70-NEXT: add.s32 %r6, %r5, 32767;
747 ; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2;
748 ; SM70-NEXT: or.b32 %r7, %r3, 4194304;
749 ; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
750 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
751 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
754 ; SM80-LABEL: test_fadd_imm_1(
756 ; SM80-NEXT: .reg .b16 %rs<3>;
757 ; SM80-NEXT: .reg .f32 %f<3>;
759 ; SM80-NEXT: // %bb.0:
760 ; SM80-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
761 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
762 ; SM80-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
763 ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
764 ; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
767 ; SM80-FTZ-LABEL: test_fadd_imm_1(
769 ; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
770 ; SM80-FTZ-NEXT: .reg .f32 %f<3>;
772 ; SM80-FTZ-NEXT: // %bb.0:
773 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
774 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
775 ; SM80-FTZ-NEXT: add.rn.ftz.f32 %f2, %f1, 0f3F800000;
776 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
777 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
778 ; SM80-FTZ-NEXT: ret;
780 ; SM90-LABEL: test_fadd_imm_1(
782 ; SM90-NEXT: .reg .b16 %rs<4>;
784 ; SM90-NEXT: // %bb.0:
785 ; SM90-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0];
786 ; SM90-NEXT: mov.b16 %rs2, 0x3F80;
787 ; SM90-NEXT: add.rn.bf16 %rs3, %rs1, %rs2;
788 ; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
790 %r = fadd bfloat %a, 1.0
; Selects between two bfloat values based on an ordered less-than comparison
; of two doubles. The assertions are shared by all configurations: the compare
; is expected as setp.lt.f64 and the select as a 16-bit selp.b16 on the raw
; bfloat bits, with no bf16 conversion involved.
794 define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %d) #0 {
795 ; CHECK-LABEL: test_select_cc_bf16_f64(
797 ; CHECK-NEXT: .reg .pred %p<2>;
798 ; CHECK-NEXT: .reg .b16 %rs<4>;
799 ; CHECK-NEXT: .reg .f64 %fd<3>;
801 ; CHECK-NEXT: // %bb.0:
802 ; CHECK-NEXT: ld.param.f64 %fd1, [test_select_cc_bf16_f64_param_0];
803 ; CHECK-NEXT: ld.param.f64 %fd2, [test_select_cc_bf16_f64_param_1];
804 ; CHECK-NEXT: setp.lt.f64 %p1, %fd1, %fd2;
805 ; CHECK-NEXT: ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2];
806 ; CHECK-NEXT: ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3];
807 ; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
808 ; CHECK-NEXT: st.param.b16 [func_retval0], %rs3;
810 %cc = fcmp olt double %a, %b
811 %r = select i1 %cc, bfloat %c, bfloat %d
; Loads <8 x bfloat> (align 16) from an addrspace(3) pointer and extends every
; element to float.
; Expected lowering, per the assertions below:
;   all configurations: one ld.shared.v4.b32 for the whole vector, four
;          mov.b32 unpacks into eight 16-bit lanes, and the eight f32 results
;          stored as two st.param.v4.f32 (offsets 0 and 16).
;   sm_70: each lane is widened with cvt.u32.u16 + shl 16 + bit-move.
;   sm_80 and sm_90: each lane is widened with cvt.f32.bf16
;          (cvt.ftz.f32.bf16 in the sm_80 preserve-sign run).
815 define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
816 ; SM70-LABEL: test_extload_bf16x8(
818 ; SM70-NEXT: .reg .b16 %rs<17>;
819 ; SM70-NEXT: .reg .b32 %r<21>;
820 ; SM70-NEXT: .reg .f32 %f<9>;
821 ; SM70-NEXT: .reg .b64 %rd<2>;
823 ; SM70-NEXT: // %bb.0:
824 ; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
825 ; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
826 ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
827 ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
828 ; SM70-NEXT: mov.b32 {%rs5, %rs6}, %r3;
829 ; SM70-NEXT: mov.b32 {%rs7, %rs8}, %r4;
830 ; SM70-NEXT: cvt.u32.u16 %r5, %rs8;
831 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
832 ; SM70-NEXT: mov.b32 %f1, %r6;
833 ; SM70-NEXT: cvt.u32.u16 %r7, %rs7;
834 ; SM70-NEXT: shl.b32 %r8, %r7, 16;
835 ; SM70-NEXT: mov.b32 %f2, %r8;
836 ; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
837 ; SM70-NEXT: shl.b32 %r10, %r9, 16;
838 ; SM70-NEXT: mov.b32 %f3, %r10;
839 ; SM70-NEXT: cvt.u32.u16 %r11, %rs5;
840 ; SM70-NEXT: shl.b32 %r12, %r11, 16;
841 ; SM70-NEXT: mov.b32 %f4, %r12;
842 ; SM70-NEXT: cvt.u32.u16 %r13, %rs4;
843 ; SM70-NEXT: shl.b32 %r14, %r13, 16;
844 ; SM70-NEXT: mov.b32 %f5, %r14;
845 ; SM70-NEXT: cvt.u32.u16 %r15, %rs3;
846 ; SM70-NEXT: shl.b32 %r16, %r15, 16;
847 ; SM70-NEXT: mov.b32 %f6, %r16;
848 ; SM70-NEXT: cvt.u32.u16 %r17, %rs2;
849 ; SM70-NEXT: shl.b32 %r18, %r17, 16;
850 ; SM70-NEXT: mov.b32 %f7, %r18;
851 ; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
852 ; SM70-NEXT: shl.b32 %r20, %r19, 16;
853 ; SM70-NEXT: mov.b32 %f8, %r20;
854 ; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
855 ; SM70-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
858 ; SM80-LABEL: test_extload_bf16x8(
860 ; SM80-NEXT: .reg .b16 %rs<9>;
861 ; SM80-NEXT: .reg .b32 %r<5>;
862 ; SM80-NEXT: .reg .f32 %f<9>;
863 ; SM80-NEXT: .reg .b64 %rd<2>;
865 ; SM80-NEXT: // %bb.0:
866 ; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
867 ; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
868 ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
869 ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
870 ; SM80-NEXT: mov.b32 {%rs5, %rs6}, %r3;
871 ; SM80-NEXT: mov.b32 {%rs7, %rs8}, %r4;
872 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs8;
873 ; SM80-NEXT: cvt.f32.bf16 %f2, %rs7;
874 ; SM80-NEXT: cvt.f32.bf16 %f3, %rs6;
875 ; SM80-NEXT: cvt.f32.bf16 %f4, %rs5;
876 ; SM80-NEXT: cvt.f32.bf16 %f5, %rs4;
877 ; SM80-NEXT: cvt.f32.bf16 %f6, %rs3;
878 ; SM80-NEXT: cvt.f32.bf16 %f7, %rs2;
879 ; SM80-NEXT: cvt.f32.bf16 %f8, %rs1;
880 ; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
881 ; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
884 ; SM80-FTZ-LABEL: test_extload_bf16x8(
886 ; SM80-FTZ-NEXT: .reg .b16 %rs<9>;
887 ; SM80-FTZ-NEXT: .reg .b32 %r<5>;
888 ; SM80-FTZ-NEXT: .reg .f32 %f<9>;
889 ; SM80-FTZ-NEXT: .reg .b64 %rd<2>;
891 ; SM80-FTZ-NEXT: // %bb.0:
892 ; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
893 ; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
894 ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
895 ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
896 ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r3;
897 ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r4;
898 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs8;
899 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f2, %rs7;
900 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f3, %rs6;
901 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f4, %rs5;
902 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f5, %rs4;
903 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3;
904 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2;
905 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1;
906 ; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
907 ; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
908 ; SM80-FTZ-NEXT: ret;
910 ; SM90-LABEL: test_extload_bf16x8(
912 ; SM90-NEXT: .reg .b16 %rs<9>;
913 ; SM90-NEXT: .reg .b32 %r<5>;
914 ; SM90-NEXT: .reg .f32 %f<9>;
915 ; SM90-NEXT: .reg .b64 %rd<2>;
917 ; SM90-NEXT: // %bb.0:
918 ; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
919 ; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
920 ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
921 ; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
922 ; SM90-NEXT: mov.b32 {%rs5, %rs6}, %r3;
923 ; SM90-NEXT: mov.b32 {%rs7, %rs8}, %r4;
924 ; SM90-NEXT: cvt.f32.bf16 %f1, %rs8;
925 ; SM90-NEXT: cvt.f32.bf16 %f2, %rs7;
926 ; SM90-NEXT: cvt.f32.bf16 %f3, %rs6;
927 ; SM90-NEXT: cvt.f32.bf16 %f4, %rs5;
928 ; SM90-NEXT: cvt.f32.bf16 %f5, %rs4;
929 ; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
930 ; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
931 ; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
932 ; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
933 ; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
935 %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
936 %res = fpext <8 x bfloat> %load to <8 x float>
; fptosi bfloat -> i16. Every configuration widens the 16-bit result with
; cvt.u32.u16 and stores it through a 32-bit return slot.
; Expected conversion sequence per RUN configuration:
;   sm_70                 - bf16 is widened to f32 with shl-by-16, then cvt.rzi.s16.f32.
;   sm_80                 - cvt.f32.bf16 followed by cvt.rzi.s16.f32.
;   sm_80 + preserve-sign - same two conversions, each carrying the .ftz modifier.
;   sm_90                 - a single cvt.rzi.s16.bf16, no f32 temporary.
940 define i16 @test_fptosi_i16(bfloat %a) {
941 ; SM70-LABEL: test_fptosi_i16(
943 ; SM70-NEXT: .reg .b16 %rs<2>;
944 ; SM70-NEXT: .reg .b32 %r<4>;
945 ; SM70-NEXT: .reg .f32 %f<2>;
947 ; SM70-NEXT: // %bb.0:
948 ; SM70-NEXT: ld.param.u16 %r1, [test_fptosi_i16_param_0];
949 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
950 ; SM70-NEXT: mov.b32 %f1, %r2;
951 ; SM70-NEXT: cvt.rzi.s16.f32 %rs1, %f1;
952 ; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
953 ; SM70-NEXT: st.param.b32 [func_retval0], %r3;
956 ; SM80-LABEL: test_fptosi_i16(
958 ; SM80-NEXT: .reg .b16 %rs<3>;
959 ; SM80-NEXT: .reg .b32 %r<2>;
960 ; SM80-NEXT: .reg .f32 %f<2>;
962 ; SM80-NEXT: // %bb.0:
963 ; SM80-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
964 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
965 ; SM80-NEXT: cvt.rzi.s16.f32 %rs2, %f1;
966 ; SM80-NEXT: cvt.u32.u16 %r1, %rs2;
967 ; SM80-NEXT: st.param.b32 [func_retval0], %r1;
970 ; SM80-FTZ-LABEL: test_fptosi_i16(
972 ; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
973 ; SM80-FTZ-NEXT: .reg .b32 %r<2>;
974 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
976 ; SM80-FTZ-NEXT: // %bb.0:
977 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
978 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
979 ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %f1;
980 ; SM80-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
981 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
982 ; SM80-FTZ-NEXT: ret;
984 ; SM90-LABEL: test_fptosi_i16(
986 ; SM90-NEXT: .reg .b16 %rs<3>;
987 ; SM90-NEXT: .reg .b32 %r<2>;
989 ; SM90-NEXT: // %bb.0:
990 ; SM90-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0];
991 ; SM90-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1;
992 ; SM90-NEXT: cvt.u32.u16 %r1, %rs2;
993 ; SM90-NEXT: st.param.b32 [func_retval0], %r1;
995 %r = fptosi bfloat %a to i16
; fptoui bfloat -> i16. Unsigned counterpart of the fptosi test: the
; conversions target u16 instead of s16, and the result is again widened with
; cvt.u32.u16 before the 32-bit return store.
; Expected conversion sequence per RUN configuration:
;   sm_70                 - shl-by-16 widening to f32, then cvt.rzi.u16.f32.
;   sm_80                 - cvt.f32.bf16 followed by cvt.rzi.u16.f32.
;   sm_80 + preserve-sign - same two conversions with the .ftz modifier.
;   sm_90                 - a single cvt.rzi.u16.bf16.
999 define i16 @test_fptoui_i16(bfloat %a) {
1000 ; SM70-LABEL: test_fptoui_i16(
1002 ; SM70-NEXT: .reg .b16 %rs<2>;
1003 ; SM70-NEXT: .reg .b32 %r<4>;
1004 ; SM70-NEXT: .reg .f32 %f<2>;
1006 ; SM70-NEXT: // %bb.0:
1007 ; SM70-NEXT: ld.param.u16 %r1, [test_fptoui_i16_param_0];
1008 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
1009 ; SM70-NEXT: mov.b32 %f1, %r2;
1010 ; SM70-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
1011 ; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
1012 ; SM70-NEXT: st.param.b32 [func_retval0], %r3;
1015 ; SM80-LABEL: test_fptoui_i16(
1017 ; SM80-NEXT: .reg .b16 %rs<3>;
1018 ; SM80-NEXT: .reg .b32 %r<2>;
1019 ; SM80-NEXT: .reg .f32 %f<2>;
1021 ; SM80-NEXT: // %bb.0:
1022 ; SM80-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
1023 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
1024 ; SM80-NEXT: cvt.rzi.u16.f32 %rs2, %f1;
1025 ; SM80-NEXT: cvt.u32.u16 %r1, %rs2;
1026 ; SM80-NEXT: st.param.b32 [func_retval0], %r1;
1029 ; SM80-FTZ-LABEL: test_fptoui_i16(
1031 ; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
1032 ; SM80-FTZ-NEXT: .reg .b32 %r<2>;
1033 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
1035 ; SM80-FTZ-NEXT: // %bb.0:
1036 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
1037 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
1038 ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %f1;
1039 ; SM80-FTZ-NEXT: cvt.u32.u16 %r1, %rs2;
1040 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1;
1041 ; SM80-FTZ-NEXT: ret;
1043 ; SM90-LABEL: test_fptoui_i16(
1045 ; SM90-NEXT: .reg .b16 %rs<3>;
1046 ; SM90-NEXT: .reg .b32 %r<2>;
1048 ; SM90-NEXT: // %bb.0:
1049 ; SM90-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0];
1050 ; SM90-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1;
1051 ; SM90-NEXT: cvt.u32.u16 %r1, %rs2;
1052 ; SM90-NEXT: st.param.b32 [func_retval0], %r1;
1054 %r = fptoui bfloat %a to i16
; sitofp i16 -> bfloat.
; Expected lowering per RUN configuration:
;   sm_70  - cvt.rn.f32.s16, then the f32 is narrowed to bf16 with integer ops:
;            bit 16 (bfe) plus 32767 is added to the f32 bit pattern; if the value
;            is NaN the pattern OR'ed with 4194304 (0x400000) is selected instead;
;            the upper 16 bits of the selected word are the result.
;   sm_80  - cvt.rn.f32.s16 followed by cvt.rn.bf16.f32; the preserve-sign run
;            expects the identical two instructions (no .ftz modifier appears).
;   sm_90  - a single cvt.rn.bf16.s16.
1058 define bfloat @test_sitofp_i16(i16 %a) {
1059 ; SM70-LABEL: test_sitofp_i16(
1061 ; SM70-NEXT: .reg .pred %p<2>;
1062 ; SM70-NEXT: .reg .b16 %rs<4>;
1063 ; SM70-NEXT: .reg .b32 %r<7>;
1064 ; SM70-NEXT: .reg .f32 %f<2>;
1066 ; SM70-NEXT: // %bb.0:
1067 ; SM70-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
1068 ; SM70-NEXT: cvt.rn.f32.s16 %f1, %rs1;
1069 ; SM70-NEXT: mov.b32 %r1, %f1;
1070 ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
1071 ; SM70-NEXT: add.s32 %r3, %r2, %r1;
1072 ; SM70-NEXT: add.s32 %r4, %r3, 32767;
1073 ; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
1074 ; SM70-NEXT: or.b32 %r5, %r1, 4194304;
1075 ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1076 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
1077 ; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
1080 ; SM80-LABEL: test_sitofp_i16(
1082 ; SM80-NEXT: .reg .b16 %rs<3>;
1083 ; SM80-NEXT: .reg .f32 %f<2>;
1085 ; SM80-NEXT: // %bb.0:
1086 ; SM80-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
1087 ; SM80-NEXT: cvt.rn.f32.s16 %f1, %rs1;
1088 ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
1089 ; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
1092 ; SM80-FTZ-LABEL: test_sitofp_i16(
1094 ; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
1095 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
1097 ; SM80-FTZ-NEXT: // %bb.0:
1098 ; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
1099 ; SM80-FTZ-NEXT: cvt.rn.f32.s16 %f1, %rs1;
1100 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
1101 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
1102 ; SM80-FTZ-NEXT: ret;
1104 ; SM90-LABEL: test_sitofp_i16(
1106 ; SM90-NEXT: .reg .b16 %rs<3>;
1108 ; SM90-NEXT: // %bb.0:
1109 ; SM90-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
1110 ; SM90-NEXT: cvt.rn.bf16.s16 %rs2, %rs1;
1111 ; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
1113 %r = sitofp i16 %a to bfloat
; uitofp i8 -> bfloat. The argument is loaded with ld.param.u8 into a 16-bit
; register, so every configuration converts from u16.
; Expected lowering per RUN configuration:
;   sm_70  - cvt.rn.f32.u16, then integer-op narrowing of the f32 to bf16
;            (bit 16 plus 32767 added; NaN path ORs in 4194304; upper half kept).
;   sm_80  - cvt.rn.f32.u16 followed by cvt.rn.bf16.f32; identical in the
;            preserve-sign run.
;   sm_90  - a single cvt.rn.bf16.u16.
1117 define bfloat @test_uitofp_i8(i8 %a) {
1118 ; SM70-LABEL: test_uitofp_i8(
1120 ; SM70-NEXT: .reg .pred %p<2>;
1121 ; SM70-NEXT: .reg .b16 %rs<4>;
1122 ; SM70-NEXT: .reg .b32 %r<7>;
1123 ; SM70-NEXT: .reg .f32 %f<2>;
1125 ; SM70-NEXT: // %bb.0:
1126 ; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
1127 ; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1;
1128 ; SM70-NEXT: mov.b32 %r1, %f1;
1129 ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
1130 ; SM70-NEXT: add.s32 %r3, %r2, %r1;
1131 ; SM70-NEXT: add.s32 %r4, %r3, 32767;
1132 ; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
1133 ; SM70-NEXT: or.b32 %r5, %r1, 4194304;
1134 ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1135 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
1136 ; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
1139 ; SM80-LABEL: test_uitofp_i8(
1141 ; SM80-NEXT: .reg .b16 %rs<3>;
1142 ; SM80-NEXT: .reg .f32 %f<2>;
1144 ; SM80-NEXT: // %bb.0:
1145 ; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
1146 ; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1;
1147 ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
1148 ; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
1151 ; SM80-FTZ-LABEL: test_uitofp_i8(
1153 ; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
1154 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
1156 ; SM80-FTZ-NEXT: // %bb.0:
1157 ; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
1158 ; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1;
1159 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
1160 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
1161 ; SM80-FTZ-NEXT: ret;
1163 ; SM90-LABEL: test_uitofp_i8(
1165 ; SM90-NEXT: .reg .b16 %rs<3>;
1167 ; SM90-NEXT: // %bb.0:
1168 ; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
1169 ; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
1170 ; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
1172 %r = uitofp i8 %a to bfloat
; uitofp i1 -> bfloat. In every configuration the byte-sized parameter is
; masked with 1, compared against 1, and materialized as a 32-bit 0/1 value
; via selp.u32 before the integer-to-float conversion.
; Expected conversion of that 0/1 value per RUN configuration:
;   sm_70  - cvt.rn.f32.u32, then integer-op narrowing of the f32 to bf16
;            (bit 16 plus 32767 added; NaN path ORs in 4194304; upper half kept).
;   sm_80  - cvt.rn.f32.u32 followed by cvt.rn.bf16.f32; identical in the
;            preserve-sign run.
;   sm_90  - a single cvt.rn.bf16.u32.
1176 define bfloat @test_uitofp_i1(i1 %a) {
1177 ; SM70-LABEL: test_uitofp_i1(
1179 ; SM70-NEXT: .reg .pred %p<3>;
1180 ; SM70-NEXT: .reg .b16 %rs<5>;
1181 ; SM70-NEXT: .reg .b32 %r<8>;
1182 ; SM70-NEXT: .reg .f32 %f<2>;
1184 ; SM70-NEXT: // %bb.0:
1185 ; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
1186 ; SM70-NEXT: and.b16 %rs2, %rs1, 1;
1187 ; SM70-NEXT: setp.eq.b16 %p1, %rs2, 1;
1188 ; SM70-NEXT: selp.u32 %r1, 1, 0, %p1;
1189 ; SM70-NEXT: cvt.rn.f32.u32 %f1, %r1;
1190 ; SM70-NEXT: mov.b32 %r2, %f1;
1191 ; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1;
1192 ; SM70-NEXT: add.s32 %r4, %r3, %r2;
1193 ; SM70-NEXT: add.s32 %r5, %r4, 32767;
1194 ; SM70-NEXT: setp.nan.f32 %p2, %f1, %f1;
1195 ; SM70-NEXT: or.b32 %r6, %r2, 4194304;
1196 ; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p2;
1197 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs3}, %r7; }
1198 ; SM70-NEXT: st.param.b16 [func_retval0], %rs3;
1201 ; SM80-LABEL: test_uitofp_i1(
1203 ; SM80-NEXT: .reg .pred %p<2>;
1204 ; SM80-NEXT: .reg .b16 %rs<4>;
1205 ; SM80-NEXT: .reg .b32 %r<2>;
1206 ; SM80-NEXT: .reg .f32 %f<2>;
1208 ; SM80-NEXT: // %bb.0:
1209 ; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
1210 ; SM80-NEXT: and.b16 %rs2, %rs1, 1;
1211 ; SM80-NEXT: setp.eq.b16 %p1, %rs2, 1;
1212 ; SM80-NEXT: selp.u32 %r1, 1, 0, %p1;
1213 ; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1;
1214 ; SM80-NEXT: cvt.rn.bf16.f32 %rs3, %f1;
1215 ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
1218 ; SM80-FTZ-LABEL: test_uitofp_i1(
1220 ; SM80-FTZ-NEXT: .reg .pred %p<2>;
1221 ; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
1222 ; SM80-FTZ-NEXT: .reg .b32 %r<2>;
1223 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
1225 ; SM80-FTZ-NEXT: // %bb.0:
1226 ; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
1227 ; SM80-FTZ-NEXT: and.b16 %rs2, %rs1, 1;
1228 ; SM80-FTZ-NEXT: setp.eq.b16 %p1, %rs2, 1;
1229 ; SM80-FTZ-NEXT: selp.u32 %r1, 1, 0, %p1;
1230 ; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1;
1231 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %f1;
1232 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
1233 ; SM80-FTZ-NEXT: ret;
1235 ; SM90-LABEL: test_uitofp_i1(
1237 ; SM90-NEXT: .reg .pred %p<2>;
1238 ; SM90-NEXT: .reg .b16 %rs<4>;
1239 ; SM90-NEXT: .reg .b32 %r<2>;
1241 ; SM90-NEXT: // %bb.0:
1242 ; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
1243 ; SM90-NEXT: and.b16 %rs2, %rs1, 1;
1244 ; SM90-NEXT: setp.eq.b16 %p1, %rs2, 1;
1245 ; SM90-NEXT: selp.u32 %r1, 1, 0, %p1;
1246 ; SM90-NEXT: cvt.rn.bf16.u32 %rs3, %r1;
1247 ; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
1249 %r = uitofp i1 %a to bfloat
; uitofp i16 -> bfloat.
; Expected lowering per RUN configuration:
;   sm_70  - cvt.rn.f32.u16, then integer-op narrowing of the f32 to bf16
;            (bit 16 plus 32767 added; NaN path ORs in 4194304; upper half kept).
;   sm_80  - cvt.rn.f32.u16 followed by cvt.rn.bf16.f32; identical in the
;            preserve-sign run.
;   sm_90  - a single cvt.rn.bf16.u16.
1253 define bfloat @test_uitofp_i16(i16 %a) {
1254 ; SM70-LABEL: test_uitofp_i16(
1256 ; SM70-NEXT: .reg .pred %p<2>;
1257 ; SM70-NEXT: .reg .b16 %rs<4>;
1258 ; SM70-NEXT: .reg .b32 %r<7>;
1259 ; SM70-NEXT: .reg .f32 %f<2>;
1261 ; SM70-NEXT: // %bb.0:
1262 ; SM70-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
1263 ; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1;
1264 ; SM70-NEXT: mov.b32 %r1, %f1;
1265 ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
1266 ; SM70-NEXT: add.s32 %r3, %r2, %r1;
1267 ; SM70-NEXT: add.s32 %r4, %r3, 32767;
1268 ; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
1269 ; SM70-NEXT: or.b32 %r5, %r1, 4194304;
1270 ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1271 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs2}, %r6; }
1272 ; SM70-NEXT: st.param.b16 [func_retval0], %rs2;
1275 ; SM80-LABEL: test_uitofp_i16(
1277 ; SM80-NEXT: .reg .b16 %rs<3>;
1278 ; SM80-NEXT: .reg .f32 %f<2>;
1280 ; SM80-NEXT: // %bb.0:
1281 ; SM80-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
1282 ; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1;
1283 ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
1284 ; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
1287 ; SM80-FTZ-LABEL: test_uitofp_i16(
1289 ; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
1290 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
1292 ; SM80-FTZ-NEXT: // %bb.0:
1293 ; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
1294 ; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1;
1295 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
1296 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
1297 ; SM80-FTZ-NEXT: ret;
1299 ; SM90-LABEL: test_uitofp_i16(
1301 ; SM90-NEXT: .reg .b16 %rs<3>;
1303 ; SM90-NEXT: // %bb.0:
1304 ; SM90-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
1305 ; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
1306 ; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
1308 %r = uitofp i16 %a to bfloat
; uitofp i32 -> bfloat.
; Expected lowering per RUN configuration:
;   sm_70  - cvt.rn.f32.u32, then integer-op narrowing of the f32 to bf16
;            (bit 16 plus 32767 added; NaN path ORs in 4194304; upper half kept).
;   sm_80  - cvt.rn.f32.u32 followed by cvt.rn.bf16.f32; identical in the
;            preserve-sign run.
;   sm_90  - a single cvt.rn.bf16.u32.
1312 define bfloat @test_uitofp_i32(i32 %a) {
1313 ; SM70-LABEL: test_uitofp_i32(
1315 ; SM70-NEXT: .reg .pred %p<2>;
1316 ; SM70-NEXT: .reg .b16 %rs<3>;
1317 ; SM70-NEXT: .reg .b32 %r<8>;
1318 ; SM70-NEXT: .reg .f32 %f<2>;
1320 ; SM70-NEXT: // %bb.0:
1321 ; SM70-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
1322 ; SM70-NEXT: cvt.rn.f32.u32 %f1, %r1;
1323 ; SM70-NEXT: mov.b32 %r2, %f1;
1324 ; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1;
1325 ; SM70-NEXT: add.s32 %r4, %r3, %r2;
1326 ; SM70-NEXT: add.s32 %r5, %r4, 32767;
1327 ; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
1328 ; SM70-NEXT: or.b32 %r6, %r2, 4194304;
1329 ; SM70-NEXT: selp.b32 %r7, %r6, %r5, %p1;
1330 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r7; }
1331 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
1334 ; SM80-LABEL: test_uitofp_i32(
1336 ; SM80-NEXT: .reg .b16 %rs<2>;
1337 ; SM80-NEXT: .reg .b32 %r<2>;
1338 ; SM80-NEXT: .reg .f32 %f<2>;
1340 ; SM80-NEXT: // %bb.0:
1341 ; SM80-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
1342 ; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1;
1343 ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
1344 ; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
1347 ; SM80-FTZ-LABEL: test_uitofp_i32(
1349 ; SM80-FTZ-NEXT: .reg .b16 %rs<2>;
1350 ; SM80-FTZ-NEXT: .reg .b32 %r<2>;
1351 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
1353 ; SM80-FTZ-NEXT: // %bb.0:
1354 ; SM80-FTZ-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
1355 ; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1;
1356 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
1357 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
1358 ; SM80-FTZ-NEXT: ret;
1360 ; SM90-LABEL: test_uitofp_i32(
1362 ; SM90-NEXT: .reg .b16 %rs<2>;
1363 ; SM90-NEXT: .reg .b32 %r<2>;
1365 ; SM90-NEXT: // %bb.0:
1366 ; SM90-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
1367 ; SM90-NEXT: cvt.rn.bf16.u32 %rs1, %r1;
1368 ; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
1370 %r = uitofp i32 %a to bfloat
; uitofp i64 -> bfloat.
; Expected lowering per RUN configuration:
;   sm_70  - cvt.rn.f32.u64, then integer-op narrowing of the f32 to bf16
;            (bit 16 plus 32767 added; NaN path ORs in 4194304; upper half kept).
;   sm_80  - cvt.rn.f32.u64 followed by cvt.rn.bf16.f32; identical in the
;            preserve-sign run.
;   sm_90  - a single cvt.rn.bf16.u64.
1374 define bfloat @test_uitofp_i64(i64 %a) {
1375 ; SM70-LABEL: test_uitofp_i64(
1377 ; SM70-NEXT: .reg .pred %p<2>;
1378 ; SM70-NEXT: .reg .b16 %rs<3>;
1379 ; SM70-NEXT: .reg .b32 %r<7>;
1380 ; SM70-NEXT: .reg .f32 %f<2>;
1381 ; SM70-NEXT: .reg .b64 %rd<2>;
1383 ; SM70-NEXT: // %bb.0:
1384 ; SM70-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
1385 ; SM70-NEXT: cvt.rn.f32.u64 %f1, %rd1;
1386 ; SM70-NEXT: mov.b32 %r1, %f1;
1387 ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
1388 ; SM70-NEXT: add.s32 %r3, %r2, %r1;
1389 ; SM70-NEXT: add.s32 %r4, %r3, 32767;
1390 ; SM70-NEXT: setp.nan.f32 %p1, %f1, %f1;
1391 ; SM70-NEXT: or.b32 %r5, %r1, 4194304;
1392 ; SM70-NEXT: selp.b32 %r6, %r5, %r4, %p1;
1393 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r6; }
1394 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
1397 ; SM80-LABEL: test_uitofp_i64(
1399 ; SM80-NEXT: .reg .b16 %rs<2>;
1400 ; SM80-NEXT: .reg .f32 %f<2>;
1401 ; SM80-NEXT: .reg .b64 %rd<2>;
1403 ; SM80-NEXT: // %bb.0:
1404 ; SM80-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
1405 ; SM80-NEXT: cvt.rn.f32.u64 %f1, %rd1;
1406 ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
1407 ; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
1410 ; SM80-FTZ-LABEL: test_uitofp_i64(
1412 ; SM80-FTZ-NEXT: .reg .b16 %rs<2>;
1413 ; SM80-FTZ-NEXT: .reg .f32 %f<2>;
1414 ; SM80-FTZ-NEXT: .reg .b64 %rd<2>;
1416 ; SM80-FTZ-NEXT: // %bb.0:
1417 ; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
1418 ; SM80-FTZ-NEXT: cvt.rn.f32.u64 %f1, %rd1;
1419 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
1420 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
1421 ; SM80-FTZ-NEXT: ret;
1423 ; SM90-LABEL: test_uitofp_i64(
1425 ; SM90-NEXT: .reg .b16 %rs<2>;
1426 ; SM90-NEXT: .reg .b64 %rd<2>;
1428 ; SM90-NEXT: // %bb.0:
1429 ; SM90-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
1430 ; SM90-NEXT: cvt.rn.bf16.u64 %rs1, %rd1;
1431 ; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
1433 %r = uitofp i64 %a to bfloat
; llvm.roundeven.bf16 on a scalar bfloat.
; Expected lowering per RUN configuration:
;   sm_70                 - shl-by-16 widening to f32, cvt.rni.f32.f32, then
;                           integer-op narrowing back to bf16 (bit 16 plus 32767
;                           added; NaN path ORs in 4194304; upper half kept).
;   sm_80                 - cvt.f32.bf16, cvt.rni.f32.f32, cvt.rn.bf16.f32.
;   sm_80 + preserve-sign - same shape, with .ftz on the widening conversion and
;                           on the rni rounding; the final narrowing has no .ftz.
;   sm_90                 - a single cvt.rni.bf16.bf16.
1437 define bfloat @test_roundeven(bfloat %a) {
1438 ; SM70-LABEL: test_roundeven(
1440 ; SM70-NEXT: .reg .pred %p<2>;
1441 ; SM70-NEXT: .reg .b16 %rs<3>;
1442 ; SM70-NEXT: .reg .b32 %r<9>;
1443 ; SM70-NEXT: .reg .f32 %f<3>;
1445 ; SM70-NEXT: // %bb.0:
1446 ; SM70-NEXT: ld.param.u16 %r1, [test_roundeven_param_0];
1447 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
1448 ; SM70-NEXT: mov.b32 %f1, %r2;
1449 ; SM70-NEXT: cvt.rni.f32.f32 %f2, %f1;
1450 ; SM70-NEXT: mov.b32 %r3, %f2;
1451 ; SM70-NEXT: bfe.u32 %r4, %r3, 16, 1;
1452 ; SM70-NEXT: add.s32 %r5, %r4, %r3;
1453 ; SM70-NEXT: add.s32 %r6, %r5, 32767;
1454 ; SM70-NEXT: setp.nan.f32 %p1, %f2, %f2;
1455 ; SM70-NEXT: or.b32 %r7, %r3, 4194304;
1456 ; SM70-NEXT: selp.b32 %r8, %r7, %r6, %p1;
1457 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r8; }
1458 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
1461 ; SM80-LABEL: test_roundeven(
1463 ; SM80-NEXT: .reg .b16 %rs<3>;
1464 ; SM80-NEXT: .reg .f32 %f<3>;
1466 ; SM80-NEXT: // %bb.0:
1467 ; SM80-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
1468 ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
1469 ; SM80-NEXT: cvt.rni.f32.f32 %f2, %f1;
1470 ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
1471 ; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
1474 ; SM80-FTZ-LABEL: test_roundeven(
1476 ; SM80-FTZ-NEXT: .reg .b16 %rs<3>;
1477 ; SM80-FTZ-NEXT: .reg .f32 %f<3>;
1479 ; SM80-FTZ-NEXT: // %bb.0:
1480 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
1481 ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
1482 ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1;
1483 ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f2;
1484 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
1485 ; SM80-FTZ-NEXT: ret;
1487 ; SM90-LABEL: test_roundeven(
1489 ; SM90-NEXT: .reg .b16 %rs<3>;
1491 ; SM90-NEXT: // %bb.0:
1492 ; SM90-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0];
1493 ; SM90-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1;
1494 ; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
1496 %r = call bfloat @llvm.roundeven.bf16(bfloat %a)
; llvm.maximum.bf16 on scalar bfloat operands.
; Expected lowering per RUN configuration:
;   sm_70         - no max instruction; a compare/select expansion on the
;                   f32-widened operands: setp.gt picks one operand, setp.nan
;                   substitutes the constant 0x7FC0 when the operands are
;                   unordered, and the trailing setp.eq-against-zero selects
;                   choose between the raw operands and that result
;                   (presumably the signed-zero handling - confirm against
;                   the backend expansion).
;   sm_80, sm_90  - a single max.NaN.bf16; the preserve-sign run expects the
;                   same instruction.
1500 define bfloat @test_maximum(bfloat %a, bfloat %b) {
1501 ; SM70-LABEL: test_maximum(
1503 ; SM70-NEXT: .reg .pred %p<6>;
1504 ; SM70-NEXT: .reg .b16 %rs<11>;
1505 ; SM70-NEXT: .reg .b32 %r<7>;
1506 ; SM70-NEXT: .reg .f32 %f<4>;
1508 ; SM70-NEXT: // %bb.0:
1509 ; SM70-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
1510 ; SM70-NEXT: ld.param.b16 %rs3, [test_maximum_param_1];
1511 ; SM70-NEXT: cvt.u32.u16 %r1, %rs3;
1512 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
1513 ; SM70-NEXT: mov.b32 %f1, %r2;
1514 ; SM70-NEXT: cvt.u32.u16 %r3, %rs1;
1515 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
1516 ; SM70-NEXT: mov.b32 %f2, %r4;
1517 ; SM70-NEXT: setp.gt.f32 %p1, %f2, %f1;
1518 ; SM70-NEXT: selp.b16 %rs5, %rs1, %rs3, %p1;
1519 ; SM70-NEXT: setp.nan.f32 %p2, %f2, %f1;
1520 ; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2;
1521 ; SM70-NEXT: setp.eq.s16 %p3, %rs1, 0;
1522 ; SM70-NEXT: selp.b16 %rs7, %rs1, %rs6, %p3;
1523 ; SM70-NEXT: setp.eq.s16 %p4, %rs3, 0;
1524 ; SM70-NEXT: selp.b16 %rs8, %rs3, %rs7, %p4;
1525 ; SM70-NEXT: cvt.u32.u16 %r5, %rs6;
1526 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
1527 ; SM70-NEXT: mov.b32 %f3, %r6;
1528 ; SM70-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
1529 ; SM70-NEXT: selp.b16 %rs10, %rs8, %rs6, %p5;
1530 ; SM70-NEXT: st.param.b16 [func_retval0], %rs10;
1533 ; SM80-LABEL: test_maximum(
1535 ; SM80-NEXT: .reg .b16 %rs<4>;
1537 ; SM80-NEXT: // %bb.0:
1538 ; SM80-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
1539 ; SM80-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
1540 ; SM80-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
1541 ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
1544 ; SM80-FTZ-LABEL: test_maximum(
1546 ; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
1548 ; SM80-FTZ-NEXT: // %bb.0:
1549 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
1550 ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
1551 ; SM80-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
1552 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
1553 ; SM80-FTZ-NEXT: ret;
1555 ; SM90-LABEL: test_maximum(
1557 ; SM90-NEXT: .reg .b16 %rs<4>;
1559 ; SM90-NEXT: // %bb.0:
1560 ; SM90-NEXT: ld.param.b16 %rs1, [test_maximum_param_0];
1561 ; SM90-NEXT: ld.param.b16 %rs2, [test_maximum_param_1];
1562 ; SM90-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2;
1563 ; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
1565 %r = call bfloat @llvm.maximum.bf16(bfloat %a, bfloat %b)
; llvm.maxnum.bf16 on scalar bfloat operands.
; Expected lowering per RUN configuration:
;   sm_70         - both operands are widened to f32 with shl-by-16, combined
;                   with max.f32, and the f32 result is narrowed back to bf16
;                   with integer ops (bit 16 plus 32767 added; NaN path ORs in
;                   4194304; upper half kept).
;   sm_80, sm_90  - a single max.bf16 (no .NaN modifier); the preserve-sign
;                   run expects the same instruction.
1569 define bfloat @test_maxnum(bfloat %a, bfloat %b) {
1570 ; SM70-LABEL: test_maxnum(
1572 ; SM70-NEXT: .reg .pred %p<2>;
1573 ; SM70-NEXT: .reg .b16 %rs<3>;
1574 ; SM70-NEXT: .reg .b32 %r<11>;
1575 ; SM70-NEXT: .reg .f32 %f<4>;
1577 ; SM70-NEXT: // %bb.0:
1578 ; SM70-NEXT: ld.param.u16 %r1, [test_maxnum_param_1];
1579 ; SM70-NEXT: shl.b32 %r2, %r1, 16;
1580 ; SM70-NEXT: mov.b32 %f1, %r2;
1581 ; SM70-NEXT: ld.param.u16 %r3, [test_maxnum_param_0];
1582 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
1583 ; SM70-NEXT: mov.b32 %f2, %r4;
1584 ; SM70-NEXT: max.f32 %f3, %f2, %f1;
1585 ; SM70-NEXT: mov.b32 %r5, %f3;
1586 ; SM70-NEXT: bfe.u32 %r6, %r5, 16, 1;
1587 ; SM70-NEXT: add.s32 %r7, %r6, %r5;
1588 ; SM70-NEXT: add.s32 %r8, %r7, 32767;
1589 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
1590 ; SM70-NEXT: or.b32 %r9, %r5, 4194304;
1591 ; SM70-NEXT: selp.b32 %r10, %r9, %r8, %p1;
1592 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r10; }
1593 ; SM70-NEXT: st.param.b16 [func_retval0], %rs1;
1596 ; SM80-LABEL: test_maxnum(
1598 ; SM80-NEXT: .reg .b16 %rs<4>;
1600 ; SM80-NEXT: // %bb.0:
1601 ; SM80-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
1602 ; SM80-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
1603 ; SM80-NEXT: max.bf16 %rs3, %rs1, %rs2;
1604 ; SM80-NEXT: st.param.b16 [func_retval0], %rs3;
1607 ; SM80-FTZ-LABEL: test_maxnum(
1609 ; SM80-FTZ-NEXT: .reg .b16 %rs<4>;
1611 ; SM80-FTZ-NEXT: // %bb.0:
1612 ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
1613 ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
1614 ; SM80-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2;
1615 ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3;
1616 ; SM80-FTZ-NEXT: ret;
1618 ; SM90-LABEL: test_maxnum(
1620 ; SM90-NEXT: .reg .b16 %rs<4>;
1622 ; SM90-NEXT: // %bb.0:
1623 ; SM90-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0];
1624 ; SM90-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1];
1625 ; SM90-NEXT: max.bf16 %rs3, %rs1, %rs2;
1626 ; SM90-NEXT: st.param.b16 [func_retval0], %rs3;
1628 %r = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b)
; llvm.maximum on <2 x bfloat> operands. The call uses the .v2bf16 overload
; suffix that matches its vector operand type, in line with the .v2bf16 name
; used by the maxnum vector test.
; Expected lowering per RUN configuration:
;   sm_70         - the packed operands are split into 16-bit lanes and each
;                   lane gets the scalar compare/select expansion (setp.gt,
;                   setp.nan with the 0x7FC0 constant, setp.eq-against-zero
;                   selects); the two lane results are repacked with mov.b32.
;   sm_80, sm_90  - a single max.NaN.bf16x2 on the packed 32-bit registers;
;                   the preserve-sign run expects the same instruction.
1632 define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
1633 ; SM70-LABEL: test_maximum_v2(
1635 ; SM70-NEXT: .reg .pred %p<11>;
1636 ; SM70-NEXT: .reg .b16 %rs<21>;
1637 ; SM70-NEXT: .reg .b32 %r<16>;
1638 ; SM70-NEXT: .reg .f32 %f<7>;
1640 ; SM70-NEXT: // %bb.0:
1641 ; SM70-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0];
1642 ; SM70-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1];
1643 ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
1644 ; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
1645 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
1646 ; SM70-NEXT: mov.b32 %f1, %r4;
1647 ; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1;
1648 ; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
1649 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
1650 ; SM70-NEXT: mov.b32 %f2, %r6;
1651 ; SM70-NEXT: setp.gt.f32 %p1, %f2, %f1;
1652 ; SM70-NEXT: selp.b16 %rs7, %rs5, %rs2, %p1;
1653 ; SM70-NEXT: setp.nan.f32 %p2, %f2, %f1;
1654 ; SM70-NEXT: selp.b16 %rs8, 0x7FC0, %rs7, %p2;
1655 ; SM70-NEXT: setp.eq.s16 %p3, %rs5, 0;
1656 ; SM70-NEXT: selp.b16 %rs9, %rs5, %rs8, %p3;
1657 ; SM70-NEXT: setp.eq.s16 %p4, %rs2, 0;
1658 ; SM70-NEXT: selp.b16 %rs10, %rs2, %rs9, %p4;
1659 ; SM70-NEXT: cvt.u32.u16 %r7, %rs8;
1660 ; SM70-NEXT: shl.b32 %r8, %r7, 16;
1661 ; SM70-NEXT: mov.b32 %f3, %r8;
1662 ; SM70-NEXT: setp.eq.f32 %p5, %f3, 0f00000000;
1663 ; SM70-NEXT: selp.b16 %rs12, %rs10, %rs8, %p5;
1664 ; SM70-NEXT: cvt.u32.u16 %r9, %rs1;
1665 ; SM70-NEXT: shl.b32 %r10, %r9, 16;
1666 ; SM70-NEXT: mov.b32 %f4, %r10;
1667 ; SM70-NEXT: cvt.u32.u16 %r11, %rs4;
1668 ; SM70-NEXT: shl.b32 %r12, %r11, 16;
1669 ; SM70-NEXT: mov.b32 %f5, %r12;
1670 ; SM70-NEXT: setp.gt.f32 %p6, %f5, %f4;
1671 ; SM70-NEXT: selp.b16 %rs15, %rs4, %rs1, %p6;
1672 ; SM70-NEXT: setp.nan.f32 %p7, %f5, %f4;
1673 ; SM70-NEXT: selp.b16 %rs16, 0x7FC0, %rs15, %p7;
1674 ; SM70-NEXT: setp.eq.s16 %p8, %rs4, 0;
1675 ; SM70-NEXT: selp.b16 %rs17, %rs4, %rs16, %p8;
1676 ; SM70-NEXT: setp.eq.s16 %p9, %rs1, 0;
1677 ; SM70-NEXT: selp.b16 %rs18, %rs1, %rs17, %p9;
1678 ; SM70-NEXT: cvt.u32.u16 %r13, %rs16;
1679 ; SM70-NEXT: shl.b32 %r14, %r13, 16;
1680 ; SM70-NEXT: mov.b32 %f6, %r14;
1681 ; SM70-NEXT: setp.eq.f32 %p10, %f6, 0f00000000;
1682 ; SM70-NEXT: selp.b16 %rs20, %rs18, %rs16, %p10;
1683 ; SM70-NEXT: mov.b32 %r15, {%rs20, %rs12};
1684 ; SM70-NEXT: st.param.b32 [func_retval0], %r15;
1687 ; SM80-LABEL: test_maximum_v2(
1689 ; SM80-NEXT: .reg .b32 %r<4>;
1691 ; SM80-NEXT: // %bb.0:
1692 ; SM80-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1];
1693 ; SM80-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0];
1694 ; SM80-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
1695 ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
1698 ; SM80-FTZ-LABEL: test_maximum_v2(
1700 ; SM80-FTZ-NEXT: .reg .b32 %r<4>;
1702 ; SM80-FTZ-NEXT: // %bb.0:
1703 ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1];
1704 ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0];
1705 ; SM80-FTZ-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
1706 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
1707 ; SM80-FTZ-NEXT: ret;
1709 ; SM90-LABEL: test_maximum_v2(
1711 ; SM90-NEXT: .reg .b32 %r<4>;
1713 ; SM90-NEXT: // %bb.0:
1714 ; SM90-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_1];
1715 ; SM90-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_0];
1716 ; SM90-NEXT: max.NaN.bf16x2 %r3, %r2, %r1;
1717 ; SM90-NEXT: st.param.b32 [func_retval0], %r3;
1719 %r = call <2 x bfloat> @llvm.maximum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)
1723 define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) {
1724 ; SM70-LABEL: test_maxnum_v2(
1726 ; SM70-NEXT: .reg .pred %p<3>;
1727 ; SM70-NEXT: .reg .b16 %rs<13>;
1728 ; SM70-NEXT: .reg .b32 %r<24>;
1729 ; SM70-NEXT: .reg .f32 %f<7>;
1731 ; SM70-NEXT: // %bb.0:
1732 ; SM70-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0];
1733 ; SM70-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1];
1734 ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r2;
1735 ; SM70-NEXT: cvt.u32.u16 %r3, %rs2;
1736 ; SM70-NEXT: shl.b32 %r4, %r3, 16;
1737 ; SM70-NEXT: mov.b32 %f1, %r4;
1738 ; SM70-NEXT: mov.b32 {%rs4, %rs5}, %r1;
1739 ; SM70-NEXT: cvt.u32.u16 %r5, %rs5;
1740 ; SM70-NEXT: shl.b32 %r6, %r5, 16;
1741 ; SM70-NEXT: mov.b32 %f2, %r6;
1742 ; SM70-NEXT: max.f32 %f3, %f2, %f1;
1743 ; SM70-NEXT: mov.b32 %r7, %f3;
1744 ; SM70-NEXT: bfe.u32 %r8, %r7, 16, 1;
1745 ; SM70-NEXT: add.s32 %r9, %r8, %r7;
1746 ; SM70-NEXT: add.s32 %r10, %r9, 32767;
1747 ; SM70-NEXT: setp.nan.f32 %p1, %f3, %f3;
1748 ; SM70-NEXT: or.b32 %r11, %r7, 4194304;
1749 ; SM70-NEXT: selp.b32 %r12, %r11, %r10, %p1;
1750 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs7}, %r12; }
1751 ; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
1752 ; SM70-NEXT: shl.b32 %r14, %r13, 16;
1753 ; SM70-NEXT: mov.b32 %f4, %r14;
1754 ; SM70-NEXT: cvt.u32.u16 %r15, %rs4;
1755 ; SM70-NEXT: shl.b32 %r16, %r15, 16;
1756 ; SM70-NEXT: mov.b32 %f5, %r16;
1757 ; SM70-NEXT: max.f32 %f6, %f5, %f4;
1758 ; SM70-NEXT: mov.b32 %r17, %f6;
1759 ; SM70-NEXT: bfe.u32 %r18, %r17, 16, 1;
1760 ; SM70-NEXT: add.s32 %r19, %r18, %r17;
1761 ; SM70-NEXT: add.s32 %r20, %r19, 32767;
1762 ; SM70-NEXT: setp.nan.f32 %p2, %f6, %f6;
1763 ; SM70-NEXT: or.b32 %r21, %r17, 4194304;
1764 ; SM70-NEXT: selp.b32 %r22, %r21, %r20, %p2;
1765 ; SM70-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs11}, %r22; }
1766 ; SM70-NEXT: mov.b32 %r23, {%rs11, %rs7};
1767 ; SM70-NEXT: st.param.b32 [func_retval0], %r23;
1770 ; SM80-LABEL: test_maxnum_v2(
1772 ; SM80-NEXT: .reg .b32 %r<4>;
1774 ; SM80-NEXT: // %bb.0:
1775 ; SM80-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1];
1776 ; SM80-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0];
1777 ; SM80-NEXT: max.bf16x2 %r3, %r2, %r1;
1778 ; SM80-NEXT: st.param.b32 [func_retval0], %r3;
1781 ; SM80-FTZ-LABEL: test_maxnum_v2(
1783 ; SM80-FTZ-NEXT: .reg .b32 %r<4>;
1785 ; SM80-FTZ-NEXT: // %bb.0:
1786 ; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1];
1787 ; SM80-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0];
1788 ; SM80-FTZ-NEXT: max.bf16x2 %r3, %r2, %r1;
1789 ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3;
1790 ; SM80-FTZ-NEXT: ret;
1792 ; SM90-LABEL: test_maxnum_v2(
1794 ; SM90-NEXT: .reg .b32 %r<4>;
1796 ; SM90-NEXT: // %bb.0:
1797 ; SM90-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_1];
1798 ; SM90-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_0];
1799 ; SM90-NEXT: max.bf16x2 %r3, %r2, %r1;
1800 ; SM90-NEXT: st.param.b32 [func_retval0], %r3;
1802 %r = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b)