; RUN: llc -amdgpu-scalar-ir-passes=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s

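; Test that a select with one undef operand is folded away to the
; defined operand (undef may take any value, including that operand),
; and that undef vector values survive codegen without being expanded
; into zero vectors.
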
; GCN-LABEL: {{^}}select_undef_lhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_lhs(float %val, i1 %cond) {
  %sel = select i1 %cond, float undef, float %val
  ret float %sel
}

; GCN-LABEL: {{^}}select_undef_rhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_rhs(float %val, i1 %cond) {
  %sel = select i1 %cond, float %val, float undef
  ret float %sel
}

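; A select between a constant and undef should fold to the constant, so
; the two tests below should just materialize 1.0 and store it.
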
; GCN-LABEL: {{^}}select_undef_n1:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n1(ptr addrspace(1) %a, i32 %c) {
  %cc = icmp eq i32 %c, 0
  %sel = select i1 %cc, float 1.000000e+00, float undef
  store float %sel, ptr addrspace(1) %a
  ret void
}

; GCN-LABEL: {{^}}select_undef_n2:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n2(ptr addrspace(1) %a, i32 %c) {
  %cc = icmp eq i32 %c, 0
  %sel = select i1 %cc, float undef, float 1.000000e+00
  store float %sel, ptr addrspace(1) %a
  ret void
}

declare float @llvm.amdgcn.rcp.f32(float)

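; The loop kernels below seed a phi with undef. If the undef input were
; expanded into a zero vector, zero-initializing v_mov_b32/s_mov_b32
; instructions would appear ahead of the loop; the GCN-NOT lines guard
; against that.
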
; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v6f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x float>, ptr addrspace(3) undef
  %add = fadd <6 x float> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x float> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v6i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x i32>, ptr addrspace(3) undef
  %add = add <6 x i32> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x i32> %add, ptr addrspace(3) undef
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v5f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <5 x float>, ptr addrspace(3) undef
  %add = fadd <5 x float> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <5 x float> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v5i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <5 x i32>, ptr addrspace(3) undef
  %add = add <5 x i32> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <5 x i32> %add, ptr addrspace(3) undef
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v3f64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x double>, ptr addrspace(3) %ptr
  %add = fadd <3 x double> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x double> %add, ptr addrspace(3) %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v3i64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x i64>, ptr addrspace(3) %ptr
  %add = add <3 x i64> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x i64> %add, ptr addrspace(3) %ptr
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v4f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x half>, ptr addrspace(3) %ptr
  %add = fadd <4 x half> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x half> %add, ptr addrspace(3) %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v4i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x i16>, ptr addrspace(3) %ptr
  %add = add <4 x i16> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x i16> %add, ptr addrspace(3) %ptr
  ret void
}

; Make sure the vector undef isn't lowered into 0s.
; GCN-LABEL: {{^}}undef_v2f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x half>, ptr addrspace(3) %ptr
  %add = fadd <2 x half> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x half> %add, ptr addrspace(3) %ptr
  ret void
}

; GCN-LABEL: {{^}}undef_v2i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x i16>, ptr addrspace(3) %ptr
  %add = add <2 x i16> %load, %phi
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x i16> %add, ptr addrspace(3) %ptr
  ret void
}

; We were expanding undef vectors into zero vectors. Optimizations
; would then see that no elements of the vector were used, and re-form
; the undef vector, resulting in a combiner loop.
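; With that fixed, the dead vector lanes are simply dropped: all that
; should remain is the 64-bit multiply-add of the two live lanes
; feeding the store.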
; GCN-LABEL: {{^}}inf_loop_undef_vector:
; GCN: s_waitcnt
; GCN-NEXT: v_mad_u64_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_add3_u32
; GCN-NEXT: global_store_dwordx2
define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
  %i = insertelement <6 x float> %arg, float %arg1, i64 2
  %i3 = bitcast <6 x float> %i to <3 x i64>
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = mul i64 %i5, %arg2
  %i7 = add i64 %i6, %i4
  store volatile i64 %i7, ptr addrspace(1) undef, align 4
  ret void
}

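; The bfloat variants below do the loop arithmetic on the underlying
; i16 bits (bitcast, integer add, bitcast back), presumably to avoid
; depending on bf16 FP arithmetic support; the undef phi input must
; still survive without being turned into zeroes.
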
; GCN-LABEL: {{^}}undef_bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi bfloat [ undef, %entry ], [ %add, %loop ]
  %load = load volatile bfloat, ptr addrspace(3) undef
  %bc.0 = bitcast bfloat %load to i16
  %bc.1 = bitcast bfloat %phi to i16
  %add.i = add i16 %bc.0, %bc.1
  %add = bitcast i16 %add.i to bfloat
  br i1 %cond, label %loop, label %ret

ret:
  store volatile bfloat %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v2bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <2 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <2 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <2 x bfloat> %load to <2 x i16>
  %bc.1 = bitcast <2 x bfloat> %phi to <2 x i16>
  %add.i = add <2 x i16> %bc.0, %bc.1
  %add = bitcast <2 x i16> %add.i to <2 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <2 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v3bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <3 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <3 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <3 x bfloat> %load to <3 x i16>
  %bc.1 = bitcast <3 x bfloat> %phi to <3 x i16>
  %add.i = add <3 x i16> %bc.0, %bc.1
  %add = bitcast <3 x i16> %add.i to <3 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <3 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v4bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <4 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <4 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <4 x bfloat> %load to <4 x i16>
  %bc.1 = bitcast <4 x bfloat> %phi to <4 x i16>
  %add.i = add <4 x i16> %bc.0, %bc.1
  %add = bitcast <4 x i16> %add.i to <4 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <4 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v6bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <6 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <6 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <6 x bfloat> %load to <6 x i16>
  %bc.1 = bitcast <6 x bfloat> %phi to <6 x i16>
  %add.i = add <6 x i16> %bc.0, %bc.1
  %add = bitcast <6 x i16> %add.i to <6 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <6 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v8bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <8 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <8 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <8 x bfloat> %load to <8 x i16>
  %bc.1 = bitcast <8 x bfloat> %phi to <8 x i16>
  %add.i = add <8 x i16> %bc.0, %bc.1
  %add = bitcast <8 x i16> %add.i to <8 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <8 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v16bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <16 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <16 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <16 x bfloat> %load to <16 x i16>
  %bc.1 = bitcast <16 x bfloat> %phi to <16 x i16>
  %add.i = add <16 x i16> %bc.0, %bc.1
  %add = bitcast <16 x i16> %add.i to <16 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <16 x bfloat> %add, ptr addrspace(3) undef
  ret void
}

; GCN-LABEL: {{^}}undef_v32bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
  br label %loop

loop:
  %phi = phi <32 x bfloat> [ undef, %entry ], [ %add, %loop ]
  %load = load volatile <32 x bfloat>, ptr addrspace(3) undef
  %bc.0 = bitcast <32 x bfloat> %load to <32 x i16>
  %bc.1 = bitcast <32 x bfloat> %phi to <32 x i16>
  %add.i = add <32 x i16> %bc.0, %bc.1
  %add = bitcast <32 x i16> %add.i to <32 x bfloat>
  br i1 %cond, label %loop, label %ret

ret:
  store volatile <32 x bfloat> %add, ptr addrspace(3) undef
  ret void
}