1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
3 ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
; Intrinsic declaration. Every kernel below that indexes its inputs calls this
; to obtain the i32 value it names %tid.
5 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Baseline pack test: two half values, each the result of an fadd, are inserted
; into lanes 0 and 1 of a <2 x half>, which is then bitcast to i32 and consumed
; by inline asm. Both check prefixes expect the pair of inserts to become a
; single `v_pack_b32_f16 v0, v0, v1`.
7 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
8 ; GCN-LABEL: v_pack_b32_v2f16:
10 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
11 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
12 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
13 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
14 ; GCN-NEXT: s_waitcnt vmcnt(0)
15 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
16 ; GCN-NEXT: s_waitcnt vmcnt(0)
17 ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
18 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
19 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
20 ; GCN-NEXT: ;;#ASMSTART
25 ; GISEL-LABEL: v_pack_b32_v2f16:
27 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
28 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
29 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
30 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
31 ; GISEL-NEXT: s_waitcnt vmcnt(0)
32 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
33 ; GISEL-NEXT: s_waitcnt vmcnt(0)
34 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
35 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
36 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
37 ; GISEL-NEXT: ;;#ASMSTART
38 ; GISEL-NEXT: ; use v0
39 ; GISEL-NEXT: ;;#ASMEND
40 ; GISEL-NEXT: s_endpgm
; Index both input arrays by the sign-extended work-item id (half elements).
41 %tid = call i32 @llvm.amdgcn.workitem.id.x()
42 %tid.ext = sext i32 %tid to i64
43 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
44 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
; Volatile loads: the expected output shows each load followed by its own
; vmcnt wait (presumably volatile is used to keep the two loads separate and
; in order - confirm).
45 %v0 = load volatile half, ptr addrspace(1) %in0.gep
46 %v1 = load volatile half, ptr addrspace(1) %in1.gep
; Each pack operand is produced by an f16 add rather than taken directly from
; the load.
47 %v0.add = fadd half %v0, 2.0
48 %v1.add = fadd half %v1, 2.0
; Build the vector: lane 0 <- %v0.add, lane 1 <- %v1.add.
49 %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
50 %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
51 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; Side-effecting asm consumes the packed i32 so the value stays live; the
; expected output prints the operand as v0.
52 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Variant of the baseline pack test in which lane 0 comes from an fsub instead
; of an fadd. Both check prefixes expect `v_subrev_f16_e32` for lane 0,
; `v_add_f16_e32` for lane 1, and still a single `v_pack_b32_f16 v0, v0, v1`.
56 define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
57 ; GCN-LABEL: v_pack_b32_v2f16_sub:
59 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
60 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
61 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
62 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
63 ; GCN-NEXT: s_waitcnt vmcnt(0)
64 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
65 ; GCN-NEXT: s_waitcnt vmcnt(0)
66 ; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
67 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
68 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
69 ; GCN-NEXT: ;;#ASMSTART
74 ; GISEL-LABEL: v_pack_b32_v2f16_sub:
76 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
77 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
78 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
79 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
80 ; GISEL-NEXT: s_waitcnt vmcnt(0)
81 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
82 ; GISEL-NEXT: s_waitcnt vmcnt(0)
83 ; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
84 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
85 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
86 ; GISEL-NEXT: ;;#ASMSTART
87 ; GISEL-NEXT: ; use v0
88 ; GISEL-NEXT: ;;#ASMEND
89 ; GISEL-NEXT: s_endpgm
; Index both input arrays by the sign-extended work-item id (half elements).
90 %tid = call i32 @llvm.amdgcn.workitem.id.x()
91 %tid.ext = sext i32 %tid to i64
92 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
93 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
94 %v0 = load volatile half, ptr addrspace(1) %in0.gep
95 %v1 = load volatile half, ptr addrspace(1) %in1.gep
; Despite its name, %v0.add holds %v0 - 2.0 here; %v1.add is %v1 + 2.0.
96 %v0.add = fsub half %v0, 2.0
97 %v1.add = fadd half %v1, 2.0
; Build the vector: lane 0 <- the difference, lane 1 <- the sum.
98 %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
99 %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
100 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; Side-effecting asm consumes the packed i32 so the value stays live.
101 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Vector fptrunc test: a <2 x float> is loaded from %a, truncated to
; <2 x half>, and stored through %r. Both check prefixes expect two
; `v_cvt_f16_f32_e32` conversions combined by one `v_pack_b32_f16`.
; The two prefixes differ in memory access: one expects
; buffer_load_dwordx2 / buffer_store_dword, the other a scalar
; s_load_dwordx2 followed by global_store_dword.
; NOTE(review): %r is stored to below but is not declared on any signature
; line visible here - confirm the parameter list in the full file.
105 define amdgpu_kernel void @fptrunc(
106 ; GCN-LABEL: fptrunc:
108 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
109 ; GCN-NEXT: s_mov_b32 s6, -1
110 ; GCN-NEXT: s_mov_b32 s7, 0x31016000
111 ; GCN-NEXT: s_mov_b32 s10, s6
112 ; GCN-NEXT: s_mov_b32 s11, s7
113 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
114 ; GCN-NEXT: s_mov_b32 s8, s2
115 ; GCN-NEXT: s_mov_b32 s9, s3
116 ; GCN-NEXT: s_mov_b32 s4, s0
117 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
118 ; GCN-NEXT: s_mov_b32 s5, s1
119 ; GCN-NEXT: s_waitcnt vmcnt(0)
120 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
121 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
122 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
123 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
126 ; GISEL-LABEL: fptrunc:
128 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
129 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
130 ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
131 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
132 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
133 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
134 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
135 ; GISEL-NEXT: v_mov_b32_e32 v1, 0
136 ; GISEL-NEXT: global_store_dword v1, v0, s[0:1]
137 ; GISEL-NEXT: s_endpgm
139 ptr addrspace(1) %a) {
; Non-volatile whole-vector load, element-wise float -> half truncation, and
; whole-vector store of the result.
140 %a.val = load <2 x float>, ptr addrspace(1) %a
141 %r.val = fptrunc <2 x float> %a.val to <2 x half>
142 store <2 x half> %r.val, ptr addrspace(1) %r
; Source-modifier test for fabs: each lane is llvm.fabs.f16 applied to an fadd
; result before being inserted into the <2 x half>. Both check prefixes expect
; no separate abs instruction; the absolute values appear as operand modifiers
; in `v_pack_b32_f16 v0, |v0|, |v1|`.
146 define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
147 ; GCN-LABEL: v_pack_b32.fabs:
149 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
150 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
151 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
152 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
153 ; GCN-NEXT: s_waitcnt vmcnt(0)
154 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
155 ; GCN-NEXT: s_waitcnt vmcnt(0)
156 ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
157 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
158 ; GCN-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
159 ; GCN-NEXT: ;;#ASMSTART
161 ; GCN-NEXT: ;;#ASMEND
164 ; GISEL-LABEL: v_pack_b32.fabs:
166 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
167 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
168 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
169 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
170 ; GISEL-NEXT: s_waitcnt vmcnt(0)
171 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
172 ; GISEL-NEXT: s_waitcnt vmcnt(0)
173 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
174 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
175 ; GISEL-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
176 ; GISEL-NEXT: ;;#ASMSTART
177 ; GISEL-NEXT: ; use v0
178 ; GISEL-NEXT: ;;#ASMEND
179 ; GISEL-NEXT: s_endpgm
; Index both input arrays by the sign-extended work-item id (half elements).
180 %tid = call i32 @llvm.amdgcn.workitem.id.x()
181 %tid.ext = sext i32 %tid to i64
182 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
183 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
184 %v0 = load volatile half, ptr addrspace(1) %in0.gep
185 %v1 = load volatile half, ptr addrspace(1) %in1.gep
186 %v0.add = fadd half %v0, 2.0
187 %v1.add = fadd half %v1, 2.0
; fabs of each sum; these are the values expected to fold into |v0| and |v1|.
188 %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
189 %v1.fabs = call half @llvm.fabs.f16(half %v1.add)
; Build the vector: lane 0 <- %v0.fabs, lane 1 <- %v1.fabs.
190 %vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
191 %vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
192 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; Side-effecting asm consumes the packed i32 so the value stays live.
193 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Source-modifier test for negation: each lane is an fadd result subtracted
; from -0.0 before being inserted into the <2 x half>. Both check prefixes
; expect no separate negate/subtract instruction; the negations appear as
; operand modifiers in `v_pack_b32_f16 v0, -v0, -v1`.
197 define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
198 ; GCN-LABEL: v_pack_b32.fneg:
200 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
201 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
202 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
203 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
204 ; GCN-NEXT: s_waitcnt vmcnt(0)
205 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
206 ; GCN-NEXT: s_waitcnt vmcnt(0)
207 ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
208 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
209 ; GCN-NEXT: v_pack_b32_f16 v0, -v0, -v1
210 ; GCN-NEXT: ;;#ASMSTART
212 ; GCN-NEXT: ;;#ASMEND
215 ; GISEL-LABEL: v_pack_b32.fneg:
217 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
218 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
219 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
220 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
221 ; GISEL-NEXT: s_waitcnt vmcnt(0)
222 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
223 ; GISEL-NEXT: s_waitcnt vmcnt(0)
224 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
225 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
226 ; GISEL-NEXT: v_pack_b32_f16 v0, -v0, -v1
227 ; GISEL-NEXT: ;;#ASMSTART
228 ; GISEL-NEXT: ; use v0
229 ; GISEL-NEXT: ;;#ASMEND
230 ; GISEL-NEXT: s_endpgm
; Index both input arrays by the sign-extended work-item id (half elements).
231 %tid = call i32 @llvm.amdgcn.workitem.id.x()
232 %tid.ext = sext i32 %tid to i64
233 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
234 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
235 %v0 = load volatile half, ptr addrspace(1) %in0.gep
236 %v1 = load volatile half, ptr addrspace(1) %in1.gep
237 %v0.add = fadd half %v0, 2.0
238 %v1.add = fadd half %v1, 2.0
; Negation written as (-0.0 - x); these are the values expected to fold into
; the -v0 and -v1 operands of the pack.
239 %v0.fneg = fsub half -0.0, %v0.add
240 %v1.fneg = fsub half -0.0, %v1.add
; Build the vector: lane 0 <- %v0.fneg, lane 1 <- %v1.fneg.
241 %vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
242 %vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
243 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; Side-effecting asm consumes the packed i32 so the value stays live.
244 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Intrinsic declaration used by the fabs kernel in this file.
248 declare half @llvm.fabs.f16(half) #1
; Attribute group #0 is attached to the kernels that take %in0/%in1 and to
; their inline-asm calls; group #1 is attached to the intrinsic declarations.
250 attributes #0 = { nounwind }
251 attributes #1 = { nounwind readnone }