1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
3 ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s
5 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Baseline pack test: two half values are computed separately (each loaded
; value plus 2.0), inserted into lanes 0 and 1 of a <2 x half>, and the vector
; is bitcast to i32. The expected output for both RUN configurations contains a
; single v_pack_b32_f16 that combines the two v_add_f16 results.
7 define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
8 ; GCN-LABEL: v_pack_b32_v2f16:
10 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
11 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
12 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
13 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
14 ; GCN-NEXT: s_waitcnt vmcnt(0)
15 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
16 ; GCN-NEXT: s_waitcnt vmcnt(0)
17 ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
18 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
19 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
20 ; GCN-NEXT: ;;#ASMSTART
25 ; GISEL-LABEL: v_pack_b32_v2f16:
27 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
28 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
29 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
30 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
31 ; GISEL-NEXT: s_waitcnt vmcnt(0)
32 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
33 ; GISEL-NEXT: s_waitcnt vmcnt(0)
34 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
35 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
36 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
37 ; GISEL-NEXT: ;;#ASMSTART
38 ; GISEL-NEXT: ; use v0
39 ; GISEL-NEXT: ;;#ASMEND
40 ; GISEL-NEXT: s_endpgm
; Index both input pointers by the workitem id x, sign-extended to i64.
41 %tid = call i32 @llvm.amdgcn.workitem.id.x()
42 %tid.ext = sext i32 %tid to i64
43 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
44 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
; Both loads are volatile; the expected output has one global_load_ushort per
; load, each followed by its own s_waitcnt vmcnt(0).
45 %v0 = load volatile half, ptr addrspace(1) %in0.gep
46 %v1 = load volatile half, ptr addrspace(1) %in1.gep
47 %v0.add = fadd half %v0, 2.0
48 %v1.add = fadd half %v1, 2.0
; The undef base vector is fully overwritten: lane 0 and lane 1 are both
; inserted before the vector is used.
49 %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
50 %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
51 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; The side-effecting inline asm consumes the packed i32 through a "v"
; constraint, so the packed value has a use.
52 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Variant of the baseline pack test in which lane 0 is produced by an fsub
; (loaded value minus 2.0) and lane 1 by an fadd. The expected output for both
; RUN configurations has v_subrev_f16 for lane 0, v_add_f16 for lane 1, and a
; single v_pack_b32_f16 combining them.
56 define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
57 ; GCN-LABEL: v_pack_b32_v2f16_sub:
59 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
60 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
61 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
62 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
63 ; GCN-NEXT: s_waitcnt vmcnt(0)
64 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
65 ; GCN-NEXT: s_waitcnt vmcnt(0)
66 ; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
67 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
68 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
69 ; GCN-NEXT: ;;#ASMSTART
74 ; GISEL-LABEL: v_pack_b32_v2f16_sub:
76 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
77 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
78 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
79 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
80 ; GISEL-NEXT: s_waitcnt vmcnt(0)
81 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
82 ; GISEL-NEXT: s_waitcnt vmcnt(0)
83 ; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
84 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
85 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
86 ; GISEL-NEXT: ;;#ASMSTART
87 ; GISEL-NEXT: ; use v0
88 ; GISEL-NEXT: ;;#ASMEND
89 ; GISEL-NEXT: s_endpgm
; Index both input pointers by the workitem id x, sign-extended to i64.
90 %tid = call i32 @llvm.amdgcn.workitem.id.x()
91 %tid.ext = sext i32 %tid to i64
92 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
93 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
; Volatile loads: the expected output keeps one global_load_ushort per load.
94 %v0 = load volatile half, ptr addrspace(1) %in0.gep
95 %v1 = load volatile half, ptr addrspace(1) %in1.gep
; Despite its name, %v0.add holds the result of an fsub; only %v1.add is an
; actual fadd.
96 %v0.add = fsub half %v0, 2.0
97 %v1.add = fadd half %v1, 2.0
; The undef base vector is fully overwritten by the two inserts.
98 %vec.0 = insertelement <2 x half> undef, half %v0.add, i32 0
99 %vec.1 = insertelement <2 x half> %vec.0, half %v1.add, i32 1
100 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; The side-effecting inline asm consumes the packed i32 through a "v"
; constraint, so the packed value has a use.
101 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Pack test driven by a vector fptrunc: a <2 x float> is loaded from %a,
; truncated to <2 x half>, and stored to %r. In the expected output of both RUN
; configurations the two v_cvt_f16_f32 results are combined by one
; v_pack_b32_f16 and written with a single buffer_store_dword.
; NOTE(review): %r is stored to below, but its parameter declaration is not
; visible in this view of the define header - confirm it is declared there.
105 define amdgpu_kernel void @fptrunc(
106 ; GCN-LABEL: fptrunc:
108 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
109 ; GCN-NEXT: s_mov_b32 s6, -1
110 ; GCN-NEXT: s_mov_b32 s7, 0x31016000
111 ; GCN-NEXT: s_mov_b32 s10, s6
112 ; GCN-NEXT: s_mov_b32 s11, s7
113 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
114 ; GCN-NEXT: s_mov_b32 s8, s2
115 ; GCN-NEXT: s_mov_b32 s9, s3
116 ; GCN-NEXT: s_mov_b32 s4, s0
117 ; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
118 ; GCN-NEXT: s_mov_b32 s5, s1
119 ; GCN-NEXT: s_waitcnt vmcnt(0)
120 ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
121 ; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
122 ; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
123 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
126 ; GISEL-LABEL: fptrunc:
128 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
129 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
130 ; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
131 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
132 ; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
133 ; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
134 ; GISEL-NEXT: s_mov_b32 s2, -1
135 ; GISEL-NEXT: s_mov_b32 s3, 0x31016000
136 ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
137 ; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
138 ; GISEL-NEXT: s_endpgm
140 ptr addrspace(1) %a) {
; Non-volatile load, unlike the other pack tests in this file. The two RUN
; configurations are expected to differ here: one uses buffer_load_dwordx2
; into VGPRs, the other s_load_dwordx2 into SGPRs that feed the conversions
; directly.
141 %a.val = load <2 x float>, ptr addrspace(1) %a
142 %r.val = fptrunc <2 x float> %a.val to <2 x half>
143 store <2 x half> %r.val, ptr addrspace(1) %r
; Pack test with an fabs on each lane: each loaded value plus 2.0 goes through
; llvm.fabs.f16 before being inserted into the <2 x half>. The expected output
; for both RUN configurations has no separate abs instruction; the absolute
; value appears as the |v0|, |v1| operands of a single v_pack_b32_f16.
147 define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
148 ; GCN-LABEL: v_pack_b32.fabs:
150 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
151 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
152 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
153 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
154 ; GCN-NEXT: s_waitcnt vmcnt(0)
155 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
156 ; GCN-NEXT: s_waitcnt vmcnt(0)
157 ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
158 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
159 ; GCN-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
160 ; GCN-NEXT: ;;#ASMSTART
162 ; GCN-NEXT: ;;#ASMEND
165 ; GISEL-LABEL: v_pack_b32.fabs:
167 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
168 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
169 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
170 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
171 ; GISEL-NEXT: s_waitcnt vmcnt(0)
172 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
173 ; GISEL-NEXT: s_waitcnt vmcnt(0)
174 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
175 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
176 ; GISEL-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
177 ; GISEL-NEXT: ;;#ASMSTART
178 ; GISEL-NEXT: ; use v0
179 ; GISEL-NEXT: ;;#ASMEND
180 ; GISEL-NEXT: s_endpgm
; Index both input pointers by the workitem id x, sign-extended to i64.
181 %tid = call i32 @llvm.amdgcn.workitem.id.x()
182 %tid.ext = sext i32 %tid to i64
183 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
184 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
; Volatile loads: the expected output keeps one global_load_ushort per load.
185 %v0 = load volatile half, ptr addrspace(1) %in0.gep
186 %v1 = load volatile half, ptr addrspace(1) %in1.gep
187 %v0.add = fadd half %v0, 2.0
188 %v1.add = fadd half %v1, 2.0
; fabs is applied to the sums, not to the loaded values, so the abs operands
; in the expected pack instruction are the v_add_f16 results.
189 %v0.fabs = call half @llvm.fabs.f16(half %v0.add)
190 %v1.fabs = call half @llvm.fabs.f16(half %v1.add)
; The undef base vector is fully overwritten by the two inserts.
191 %vec.0 = insertelement <2 x half> undef, half %v0.fabs, i32 0
192 %vec.1 = insertelement <2 x half> %vec.0, half %v1.fabs, i32 1
193 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; The side-effecting inline asm consumes the packed i32 through a "v"
; constraint, so the packed value has a use.
194 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
; Pack test with a negation on each lane: each loaded value plus 2.0 is negated
; (written as an fsub from -0.0) before being inserted into the <2 x half>. The
; expected output for both RUN configurations has no separate negate
; instruction; the negation appears as the -v0, -v1 operands of a single
; v_pack_b32_f16.
198 define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
199 ; GCN-LABEL: v_pack_b32.fneg:
201 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
202 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
203 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
204 ; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
205 ; GCN-NEXT: s_waitcnt vmcnt(0)
206 ; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
207 ; GCN-NEXT: s_waitcnt vmcnt(0)
208 ; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
209 ; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
210 ; GCN-NEXT: v_pack_b32_f16 v0, -v0, -v1
211 ; GCN-NEXT: ;;#ASMSTART
213 ; GCN-NEXT: ;;#ASMEND
216 ; GISEL-LABEL: v_pack_b32.fneg:
218 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
219 ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
220 ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
221 ; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
222 ; GISEL-NEXT: s_waitcnt vmcnt(0)
223 ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
224 ; GISEL-NEXT: s_waitcnt vmcnt(0)
225 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
226 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
227 ; GISEL-NEXT: v_pack_b32_f16 v0, -v0, -v1
228 ; GISEL-NEXT: ;;#ASMSTART
229 ; GISEL-NEXT: ; use v0
230 ; GISEL-NEXT: ;;#ASMEND
231 ; GISEL-NEXT: s_endpgm
; Index both input pointers by the workitem id x, sign-extended to i64.
232 %tid = call i32 @llvm.amdgcn.workitem.id.x()
233 %tid.ext = sext i32 %tid to i64
234 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext
235 %in1.gep = getelementptr inbounds half, ptr addrspace(1) %in1, i64 %tid.ext
; Volatile loads: the expected output keeps one global_load_ushort per load.
236 %v0 = load volatile half, ptr addrspace(1) %in0.gep
237 %v1 = load volatile half, ptr addrspace(1) %in1.gep
238 %v0.add = fadd half %v0, 2.0
239 %v1.add = fadd half %v1, 2.0
; The negation is spelled as "fsub half -0.0, x" rather than with the fneg
; instruction; the expected output still shows it as a negated pack operand.
240 %v0.fneg = fsub half -0.0, %v0.add
241 %v1.fneg = fsub half -0.0, %v1.add
; The undef base vector is fully overwritten by the two inserts.
242 %vec.0 = insertelement <2 x half> undef, half %v0.fneg, i32 0
243 %vec.1 = insertelement <2 x half> %vec.0, half %v1.fneg, i32 1
244 %vec.i32 = bitcast <2 x half> %vec.1 to i32
; The side-effecting inline asm consumes the packed i32 through a "v"
; constraint, so the packed value has a use.
245 call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
249 declare half @llvm.fabs.f16(half) #1
251 attributes #0 = { nounwind }
252 attributes #1 = { nounwind readnone }