1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11
7 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> vectors that each work-item loads (volatile) from
; %in0 and %in1 at an offset derived from the workitem id, then stores the
; difference through %out.
; Expected selection, per the assertions below: a single v_pk_sub_i16 on gfx900,
; gfx1010 and gfx1100; on tonga a v_sub_u16 for the low half plus an SDWA
; v_sub_u16 for the high half, merged with v_or_b32.
8 define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
9 ; GFX9-LABEL: v_test_sub_v2i16:
11 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
12 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
13 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
16 ; GFX9-NEXT: s_waitcnt vmcnt(0)
17 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
18 ; GFX9-NEXT: s_waitcnt vmcnt(0)
19 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
20 ; GFX9-NEXT: s_mov_b32 s6, -1
21 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2
22 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
25 ; VI-LABEL: v_test_sub_v2i16:
27 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
28 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
29 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
30 ; VI-NEXT: s_waitcnt lgkmcnt(0)
31 ; VI-NEXT: v_mov_b32_e32 v1, s7
32 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
33 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
34 ; VI-NEXT: v_mov_b32_e32 v3, s1
35 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
36 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
37 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
38 ; VI-NEXT: s_waitcnt vmcnt(0)
39 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
40 ; VI-NEXT: s_waitcnt vmcnt(0)
41 ; VI-NEXT: s_mov_b32 s7, 0xf000
42 ; VI-NEXT: s_mov_b32 s6, -1
43 ; VI-NEXT: v_sub_u16_e32 v2, v0, v1
44 ; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
45 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
46 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
49 ; GFX10-LABEL: v_test_sub_v2i16:
51 ; GFX10-NEXT: s_clause 0x1
52 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
53 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
54 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
55 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
56 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
57 ; GFX10-NEXT: s_waitcnt vmcnt(0)
58 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
59 ; GFX10-NEXT: s_waitcnt vmcnt(0)
60 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
61 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
62 ; GFX10-NEXT: s_mov_b32 s6, -1
63 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
64 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
65 ; GFX10-NEXT: s_endpgm
67 ; GFX11-LABEL: v_test_sub_v2i16:
69 ; GFX11-NEXT: s_clause 0x1
70 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
71 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
72 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
73 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
75 ; GFX11-NEXT: s_waitcnt vmcnt(0)
76 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
77 ; GFX11-NEXT: s_waitcnt vmcnt(0)
78 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
79 ; GFX11-NEXT: s_mov_b32 s6, -1
80 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
81 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
83 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
84 ; GFX11-NEXT: s_endpgm
85 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
86 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
87 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
88 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
89 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
90 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; The result is named %add although the operation is a sub.
91 %add = sub <2 x i16> %a, %b
92 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts two <2 x i16> vectors loaded through the addrspace(4) pointers %in0
; and %in1 (no workitem-id indexing) and stores the difference through %out.
; Per the assertions below, both operands are fetched with scalar s_load
; instructions on every target. gfx900 copies one operand into a VGPR before
; v_pk_sub_i16, gfx1010 and gfx1100 feed both SGPRs straight into v_pk_sub_i16,
; and tonga does the arithmetic on the scalar unit: s_lshr_b32 to split the
; halves, two s_sub_i32, then s_and_b32/s_lshl_b32/s_or_b32 to repack.
96 define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
97 ; GFX9-LABEL: s_test_sub_v2i16:
99 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
100 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
101 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
102 ; GFX9-NEXT: s_mov_b32 s2, -1
103 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
104 ; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
105 ; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0
106 ; GFX9-NEXT: s_mov_b32 s0, s4
107 ; GFX9-NEXT: s_mov_b32 s1, s5
108 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX9-NEXT: v_mov_b32_e32 v0, s10
110 ; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0
111 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
112 ; GFX9-NEXT: s_endpgm
114 ; VI-LABEL: s_test_sub_v2i16:
116 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
117 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
118 ; VI-NEXT: s_mov_b32 s3, 0xf000
119 ; VI-NEXT: s_mov_b32 s2, -1
120 ; VI-NEXT: s_waitcnt lgkmcnt(0)
121 ; VI-NEXT: s_load_dword s6, s[6:7], 0x0
122 ; VI-NEXT: s_load_dword s7, s[0:1], 0x0
123 ; VI-NEXT: s_mov_b32 s0, s4
124 ; VI-NEXT: s_mov_b32 s1, s5
125 ; VI-NEXT: s_waitcnt lgkmcnt(0)
126 ; VI-NEXT: s_lshr_b32 s4, s6, 16
127 ; VI-NEXT: s_lshr_b32 s5, s7, 16
128 ; VI-NEXT: s_sub_i32 s6, s6, s7
129 ; VI-NEXT: s_sub_i32 s4, s4, s5
130 ; VI-NEXT: s_and_b32 s5, s6, 0xffff
131 ; VI-NEXT: s_lshl_b32 s4, s4, 16
132 ; VI-NEXT: s_or_b32 s4, s5, s4
133 ; VI-NEXT: v_mov_b32_e32 v0, s4
134 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
137 ; GFX10-LABEL: s_test_sub_v2i16:
139 ; GFX10-NEXT: s_clause 0x1
140 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
141 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
142 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
143 ; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
144 ; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0
145 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
146 ; GFX10-NEXT: s_mov_b32 s6, -1
147 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
148 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1
149 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
150 ; GFX10-NEXT: s_endpgm
152 ; GFX11-LABEL: s_test_sub_v2i16:
154 ; GFX11-NEXT: s_clause 0x1
155 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
156 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
157 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
158 ; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
159 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
160 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
161 ; GFX11-NEXT: s_mov_b32 s6, -1
162 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
163 ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0
164 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
165 ; GFX11-NEXT: s_nop 0
166 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
167 ; GFX11-NEXT: s_endpgm
168 %a = load <2 x i16>, ptr addrspace(4) %in0
169 %b = load <2 x i16>, ptr addrspace(4) %in1
; The result is named %add although the operation is a sub.
170 %add = sub <2 x i16> %a, %b
171 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts a loaded <2 x i16> vector from itself and stores the result through
; %out. The assertions below expect the whole computation to fold away: no load
; from %in0 and no subtract is emitted on any target, only a store of the
; constant 0 (v_mov_b32 v0, 0).
; gfx900 and tonga produce identical code here, so they are covered by the
; check prefix the two llc invocations at the top of the file have in common.
175 define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
176 ; GCN-LABEL: s_test_sub_self_v2i16:
178 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
179 ; GCN-NEXT: s_mov_b32 s3, 0xf000
180 ; GCN-NEXT: s_mov_b32 s2, -1
181 ; GCN-NEXT: v_mov_b32_e32 v0, 0
182 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
183 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
186 ; GFX10-LABEL: s_test_sub_self_v2i16:
188 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
189 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
190 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
191 ; GFX10-NEXT: s_mov_b32 s2, -1
192 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
193 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
194 ; GFX10-NEXT: s_endpgm
196 ; GFX11-LABEL: s_test_sub_self_v2i16:
198 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
199 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
200 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
201 ; GFX11-NEXT: s_mov_b32 s2, -1
202 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
204 ; GFX11-NEXT: s_nop 0
205 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
206 ; GFX11-NEXT: s_endpgm
207 %a = load <2 x i16>, ptr addrspace(4) %in0
; x - x: both operands are the same value, so the result is all zeros.
208 %add = sub <2 x i16> %a, %a
209 store <2 x i16> %add, ptr addrspace(1) %out
213 ; FIXME: VI should not scalarize arg access.
; Subtracts two <2 x i16> vectors passed directly as kernel arguments and
; stores the difference through %out.
; Per the assertions below, a single 128-bit scalar load fetches %out, %a and
; %b together on every target. gfx900 copies one operand to a VGPR before
; v_pk_sub_i16; gfx1010 and gfx1100 use both SGPRs directly in v_pk_sub_i16.
; On tonga the halves are split with s_lshr_b32, subtracted with two s_sub_i32
; and repacked with s_lshl_b32/s_and_b32/s_or_b32.
214 define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
215 ; GFX9-LABEL: s_test_sub_v2i16_kernarg:
217 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
218 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
219 ; GFX9-NEXT: s_mov_b32 s6, -1
220 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
221 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
222 ; GFX9-NEXT: s_mov_b32 s4, s0
223 ; GFX9-NEXT: s_mov_b32 s5, s1
224 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0
225 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
226 ; GFX9-NEXT: s_endpgm
228 ; VI-LABEL: s_test_sub_v2i16_kernarg:
230 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
231 ; VI-NEXT: s_mov_b32 s7, 0xf000
232 ; VI-NEXT: s_mov_b32 s6, -1
233 ; VI-NEXT: s_waitcnt lgkmcnt(0)
234 ; VI-NEXT: s_mov_b32 s4, s0
235 ; VI-NEXT: s_mov_b32 s5, s1
236 ; VI-NEXT: s_lshr_b32 s0, s2, 16
237 ; VI-NEXT: s_lshr_b32 s1, s3, 16
238 ; VI-NEXT: s_sub_i32 s0, s0, s1
239 ; VI-NEXT: s_sub_i32 s1, s2, s3
240 ; VI-NEXT: s_lshl_b32 s0, s0, 16
241 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
242 ; VI-NEXT: s_or_b32 s0, s1, s0
243 ; VI-NEXT: v_mov_b32_e32 v0, s0
244 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
247 ; GFX10-LABEL: s_test_sub_v2i16_kernarg:
249 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
250 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
251 ; GFX10-NEXT: s_mov_b32 s6, -1
252 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3
254 ; GFX10-NEXT: s_mov_b32 s4, s0
255 ; GFX10-NEXT: s_mov_b32 s5, s1
256 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
257 ; GFX10-NEXT: s_endpgm
259 ; GFX11-LABEL: s_test_sub_v2i16_kernarg:
261 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
262 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
263 ; GFX11-NEXT: s_mov_b32 s6, -1
264 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
265 ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3
266 ; GFX11-NEXT: s_mov_b32 s4, s0
267 ; GFX11-NEXT: s_mov_b32 s5, s1
268 ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
269 ; GFX11-NEXT: s_nop 0
270 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
271 ; GFX11-NEXT: s_endpgm
; The result is named %add although the operation is a sub.
272 %add = sub <2 x i16> %a, %b
273 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts the constant vector <123, 456> from a <2 x i16> value that each
; work-item loads (volatile) from %in0, and stores the result through %out.
; The packed form of the constant is 0x1c8007b (456 = 0x1c8 in the high half,
; 123 = 0x7b in the low half). Per the assertions below, gfx900 materializes it
; in an SGPR with s_mov_b32 before v_pk_sub_i16, while gfx1010 and gfx1100 pass
; it as a literal operand of v_pk_sub_i16. On tonga the subtract is turned into
; adds of the negated halves: 0xff85 is -123 and 0xfe38 is -456 as 16-bit values.
277 define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
278 ; GFX9-LABEL: v_test_sub_v2i16_constant:
280 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
281 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
282 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
283 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
284 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
286 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
287 ; GFX9-NEXT: s_mov_b32 s2, -1
288 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
289 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
290 ; GFX9-NEXT: s_endpgm
292 ; VI-LABEL: v_test_sub_v2i16_constant:
294 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
295 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
296 ; VI-NEXT: s_waitcnt lgkmcnt(0)
297 ; VI-NEXT: v_mov_b32_e32 v1, s3
298 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
299 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
300 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
301 ; VI-NEXT: s_waitcnt vmcnt(0)
302 ; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
303 ; VI-NEXT: s_mov_b32 s3, 0xf000
304 ; VI-NEXT: s_mov_b32 s2, -1
305 ; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
306 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
307 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
308 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
311 ; GFX10-LABEL: v_test_sub_v2i16_constant:
313 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
314 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
315 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
316 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
317 ; GFX10-NEXT: s_waitcnt vmcnt(0)
318 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
319 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
320 ; GFX10-NEXT: s_mov_b32 s2, -1
321 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
322 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
323 ; GFX10-NEXT: s_endpgm
325 ; GFX11-LABEL: v_test_sub_v2i16_constant:
327 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
328 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
331 ; GFX11-NEXT: s_waitcnt vmcnt(0)
332 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
333 ; GFX11-NEXT: s_mov_b32 s2, -1
334 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
335 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
336 ; GFX11-NEXT: s_nop 0
337 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
338 ; GFX11-NEXT: s_endpgm
339 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
340 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
341 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
342 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
343 %add = sub <2 x i16> %a, <i16 123, i16 456>
344 store <2 x i16> %add, ptr addrspace(1) %out
348 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts the negative constant vector <-845, -991> from a <2 x i16> value
; that each work-item loads (volatile) from %in0, and stores the result
; through %out.
; The packed form of the constant is 0xfc21fcb3 (0xfc21 is -991, 0xfcb3 is -845
; as 16-bit values). Per the assertions below, gfx900 materializes it in an
; SGPR before v_pk_sub_i16, while gfx1010 and gfx1100 pass it as a literal
; operand. On tonga the subtract becomes adds of the positive magnitudes:
; 0x34d is 845 and 0x3df is 991.
349 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
350 ; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
352 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
353 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
354 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
355 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
356 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
357 ; GFX9-NEXT: s_waitcnt vmcnt(0)
358 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
359 ; GFX9-NEXT: s_mov_b32 s2, -1
360 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
361 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
362 ; GFX9-NEXT: s_endpgm
364 ; VI-LABEL: v_test_sub_v2i16_neg_constant:
366 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
367 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
368 ; VI-NEXT: s_waitcnt lgkmcnt(0)
369 ; VI-NEXT: v_mov_b32_e32 v1, s3
370 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
371 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
372 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
373 ; VI-NEXT: s_waitcnt vmcnt(0)
374 ; VI-NEXT: v_mov_b32_e32 v1, 0x3df
375 ; VI-NEXT: s_mov_b32 s3, 0xf000
376 ; VI-NEXT: s_mov_b32 s2, -1
377 ; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
378 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
379 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
380 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
383 ; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
385 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
386 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
387 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
388 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
389 ; GFX10-NEXT: s_waitcnt vmcnt(0)
390 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
391 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
392 ; GFX10-NEXT: s_mov_b32 s2, -1
393 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
394 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
395 ; GFX10-NEXT: s_endpgm
397 ; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
399 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
400 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
401 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
403 ; GFX11-NEXT: s_waitcnt vmcnt(0)
404 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
405 ; GFX11-NEXT: s_mov_b32 s2, -1
406 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
407 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
408 ; GFX11-NEXT: s_nop 0
409 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
410 ; GFX11-NEXT: s_endpgm
411 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
412 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
413 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
414 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
415 %add = sub <2 x i16> %a, <i16 -845, i16 -991>
416 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts the splat constant <-1, -1> from a <2 x i16> value that each
; work-item loads (volatile) from %in0, and stores the result through %out.
; Per the assertions below, gfx900, gfx1010 and gfx1100 encode the constant as
; the operand -1 with op_sel_hi:[1,0] on v_pk_sub_i16, so no separate
; instruction materializes it. On tonga the subtract of -1 becomes an add of 1
; on each half.
420 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
421 ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
423 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
424 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
425 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
426 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
427 ; GFX9-NEXT: s_waitcnt vmcnt(0)
428 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
429 ; GFX9-NEXT: s_mov_b32 s2, -1
430 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
431 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
432 ; GFX9-NEXT: s_endpgm
434 ; VI-LABEL: v_test_sub_v2i16_inline_neg1:
436 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
437 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
438 ; VI-NEXT: s_waitcnt lgkmcnt(0)
439 ; VI-NEXT: v_mov_b32_e32 v1, s3
440 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
441 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
442 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
443 ; VI-NEXT: s_waitcnt vmcnt(0)
444 ; VI-NEXT: v_mov_b32_e32 v1, 1
445 ; VI-NEXT: s_mov_b32 s3, 0xf000
446 ; VI-NEXT: s_mov_b32 s2, -1
447 ; VI-NEXT: v_add_u16_e32 v2, 1, v0
448 ; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
449 ; VI-NEXT: v_or_b32_e32 v0, v2, v0
450 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
453 ; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
455 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
456 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
457 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
458 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
459 ; GFX10-NEXT: s_waitcnt vmcnt(0)
460 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
461 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
462 ; GFX10-NEXT: s_mov_b32 s2, -1
463 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
464 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
465 ; GFX10-NEXT: s_endpgm
467 ; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
469 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
470 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
471 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
473 ; GFX11-NEXT: s_waitcnt vmcnt(0)
474 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
475 ; GFX11-NEXT: s_mov_b32 s2, -1
476 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
477 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
478 ; GFX11-NEXT: s_nop 0
479 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
480 ; GFX11-NEXT: s_endpgm
481 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
482 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
483 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
484 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
485 %add = sub <2 x i16> %a, <i16 -1, i16 -1>
486 store <2 x i16> %add, ptr addrspace(1) %out
; Subtracts <32, 0> (constant in the low element, zero in the high element)
; from a <2 x i16> value that each work-item loads (volatile) from %in0, and
; stores the result through %out.
; Per the assertions below, gfx900, gfx1010 and gfx1100 use the plain operand
; 32 on v_pk_sub_i16 with no op_sel modifiers. On tonga only the low half is
; computed (v_subrev_u16 with 32); the untouched high half is kept by masking
; the loaded value with 0xffff0000 and OR-ing it back in.
490 define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
491 ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
493 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
494 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
495 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
496 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
497 ; GFX9-NEXT: s_waitcnt vmcnt(0)
498 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
499 ; GFX9-NEXT: s_mov_b32 s2, -1
500 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
501 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
502 ; GFX9-NEXT: s_endpgm
504 ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
506 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
507 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
508 ; VI-NEXT: s_waitcnt lgkmcnt(0)
509 ; VI-NEXT: v_mov_b32_e32 v1, s3
510 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
511 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
512 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
513 ; VI-NEXT: s_waitcnt vmcnt(0)
514 ; VI-NEXT: s_mov_b32 s3, 0xf000
515 ; VI-NEXT: s_mov_b32 s2, -1
516 ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
517 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
518 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
519 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
522 ; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
524 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
525 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
526 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
527 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
528 ; GFX10-NEXT: s_waitcnt vmcnt(0)
529 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
530 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
531 ; GFX10-NEXT: s_mov_b32 s2, -1
532 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32
533 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
534 ; GFX10-NEXT: s_endpgm
536 ; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
538 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
539 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
540 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
541 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
542 ; GFX11-NEXT: s_waitcnt vmcnt(0)
543 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
544 ; GFX11-NEXT: s_mov_b32 s2, -1
545 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32
546 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
547 ; GFX11-NEXT: s_nop 0
548 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
549 ; GFX11-NEXT: s_endpgm
550 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
551 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
552 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
553 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
554 %add = sub <2 x i16> %a, <i16 32, i16 0>
555 store <2 x i16> %add, ptr addrspace(1) %out
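559 ; The high element (16256 = 0x3f80) sits in the upper 16 bits of the packed constant, giving 0x3f800000, which is the bit pattern of the f32 value 1.0.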
; Subtracts <0, 16256> from a <2 x i16> value that each work-item loads
; (volatile) from %in0, and stores the result through %out. 16256 is 0x3f80, so
; the packed 32-bit constant is 0x3f800000.
; Per the assertions below, gfx900 materializes the packed constant with
; s_mov_b32 using the operand 1.0 and feeds that SGPR to v_pk_sub_i16.
; gfx1010 and gfx1100 instead pass the 16-bit literal 0x3f80 with
; op_sel:[0,1] op_sel_hi:[1,0]. On tonga only the high half is computed, by
; adding 0xc080 (-16256 as a 16-bit value); the low half is taken unchanged
; from the loaded value by the SDWA v_or_b32.
560 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
561 ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
563 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
564 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
565 ; GFX9-NEXT: s_mov_b32 s4, 1.0
566 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
567 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
568 ; GFX9-NEXT: s_waitcnt vmcnt(0)
569 ; GFX9-NEXT: s_mov_b32 s3, 0xf000
570 ; GFX9-NEXT: s_mov_b32 s2, -1
571 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
572 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
573 ; GFX9-NEXT: s_endpgm
575 ; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
577 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
578 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
579 ; VI-NEXT: s_waitcnt lgkmcnt(0)
580 ; VI-NEXT: v_mov_b32_e32 v1, s3
581 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
582 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
583 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
584 ; VI-NEXT: s_waitcnt vmcnt(0)
585 ; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
586 ; VI-NEXT: s_mov_b32 s3, 0xf000
587 ; VI-NEXT: s_mov_b32 s2, -1
588 ; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
589 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
590 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
593 ; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
595 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
596 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
597 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
598 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
599 ; GFX10-NEXT: s_waitcnt vmcnt(0)
600 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
601 ; GFX10-NEXT: s_mov_b32 s3, 0x31016000
602 ; GFX10-NEXT: s_mov_b32 s2, -1
603 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
604 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
605 ; GFX10-NEXT: s_endpgm
607 ; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
609 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
610 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
611 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
612 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
613 ; GFX11-NEXT: s_waitcnt vmcnt(0)
614 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000
615 ; GFX11-NEXT: s_mov_b32 s2, -1
616 ; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x3f80 op_sel:[0,1] op_sel_hi:[1,0]
617 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
618 ; GFX11-NEXT: s_nop 0
619 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
620 ; GFX11-NEXT: s_endpgm
621 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
622 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
623 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
624 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
625 %add = sub <2 x i16> %a, <i16 0, i16 16256>
626 store <2 x i16> %add, ptr addrspace(1) %out
630 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> vectors that each work-item loads (volatile) from
; %in0 and %in1, zero-extends the difference to <2 x i32> and stores the
; 64-bit result through %out.
; Per the assertions below, gfx900, gfx1010 and gfx1100 do one v_pk_sub_i16 and
; then unpack it: v_lshrrev_b32 by 16 for the high element and v_and_b32 with
; 0xffff for the low element. On tonga the two 16-bit subtracts write their
; results directly into separate dwords (the SDWA form uses dst_sel:DWORD), and
; no separate shift or mask instruction is expected.
631 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
632 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
634 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
635 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
636 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
637 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
638 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
640 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
642 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
643 ; GFX9-NEXT: s_mov_b32 s6, -1
644 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2
645 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
646 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
647 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
648 ; GFX9-NEXT: s_endpgm
650 ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
652 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
653 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
654 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
655 ; VI-NEXT: s_waitcnt lgkmcnt(0)
656 ; VI-NEXT: v_mov_b32_e32 v1, s7
657 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
658 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
659 ; VI-NEXT: v_mov_b32_e32 v3, s1
660 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
661 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
662 ; VI-NEXT: flat_load_dword v1, v[0:1] glc
663 ; VI-NEXT: s_waitcnt vmcnt(0)
664 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
665 ; VI-NEXT: s_waitcnt vmcnt(0)
666 ; VI-NEXT: s_mov_b32 s7, 0xf000
667 ; VI-NEXT: s_mov_b32 s6, -1
668 ; VI-NEXT: v_sub_u16_e32 v0, v1, v2
669 ; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
670 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
673 ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32:
675 ; GFX10-NEXT: s_clause 0x1
676 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
677 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
678 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
679 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
680 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
681 ; GFX10-NEXT: s_waitcnt vmcnt(0)
682 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
683 ; GFX10-NEXT: s_waitcnt vmcnt(0)
684 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
685 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
686 ; GFX10-NEXT: s_mov_b32 s6, -1
687 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
688 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
689 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
690 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
691 ; GFX10-NEXT: s_endpgm
693 ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32:
695 ; GFX11-NEXT: s_clause 0x1
696 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
697 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
698 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
699 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
700 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
701 ; GFX11-NEXT: s_waitcnt vmcnt(0)
702 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
703 ; GFX11-NEXT: s_waitcnt vmcnt(0)
704 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
705 ; GFX11-NEXT: s_mov_b32 s6, -1
706 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
707 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
708 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
709 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
710 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
711 ; GFX11-NEXT: s_nop 0
712 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
713 ; GFX11-NEXT: s_endpgm
714 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
715 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
716 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
717 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
718 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
719 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
720 %add = sub <2 x i16> %a, %b
721 %ext = zext <2 x i16> %add to <2 x i32>
722 store <2 x i32> %ext, ptr addrspace(1) %out
726 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Subtracts two <2 x i16> vectors that each work-item loads (volatile) from
; %in0 and %in1, zero-extends the difference to <2 x i64> and stores the
; 128-bit result through %out.
; Per the assertions below, every target fills the upper dword of each 64-bit
; element with zero (a v_mov_b32 of 0 that is then copied to the second upper
; dword). gfx900, gfx1010 and gfx1100 do one v_pk_sub_i16 and unpack it with
; v_alignbit_b32 (high element) and v_and_b32 with 0xffff (low element);
; gfx1100 pairs the copy and the mask in one dual-issue instruction. On tonga
; the two 16-bit subtracts write directly into separate dwords.
727 define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
728 ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
730 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
731 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
732 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
733 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
734 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
735 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
736 ; GFX9-NEXT: s_waitcnt vmcnt(0)
737 ; GFX9-NEXT: global_load_dword v3, v0, s[2:3] glc
738 ; GFX9-NEXT: s_waitcnt vmcnt(0)
739 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
740 ; GFX9-NEXT: s_mov_b32 s6, -1
741 ; GFX9-NEXT: v_pk_sub_i16 v0, v2, v3
742 ; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16
743 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
744 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
745 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
746 ; GFX9-NEXT: s_endpgm
748 ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
750 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
751 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
752 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
753 ; VI-NEXT: s_waitcnt lgkmcnt(0)
754 ; VI-NEXT: v_mov_b32_e32 v1, s7
755 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
756 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
757 ; VI-NEXT: v_mov_b32_e32 v3, s1
758 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
759 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
760 ; VI-NEXT: flat_load_dword v4, v[0:1] glc
761 ; VI-NEXT: s_waitcnt vmcnt(0)
762 ; VI-NEXT: flat_load_dword v2, v[2:3] glc
763 ; VI-NEXT: s_waitcnt vmcnt(0)
764 ; VI-NEXT: v_mov_b32_e32 v1, 0
765 ; VI-NEXT: s_mov_b32 s7, 0xf000
766 ; VI-NEXT: s_mov_b32 s6, -1
767 ; VI-NEXT: v_mov_b32_e32 v3, v1
768 ; VI-NEXT: v_sub_u16_e32 v0, v4, v2
769 ; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
770 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
773 ; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64:
775 ; GFX10-NEXT: s_clause 0x1
776 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
777 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
778 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
779 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
780 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
781 ; GFX10-NEXT: s_waitcnt vmcnt(0)
782 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
783 ; GFX10-NEXT: s_waitcnt vmcnt(0)
784 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
785 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
786 ; GFX10-NEXT: s_mov_b32 s6, -1
787 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
788 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
789 ; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16
790 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
791 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
792 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
793 ; GFX10-NEXT: s_endpgm
795 ; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64:
797 ; GFX11-NEXT: s_clause 0x1
798 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
799 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
800 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
801 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
802 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
803 ; GFX11-NEXT: s_waitcnt vmcnt(0)
804 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
805 ; GFX11-NEXT: s_waitcnt vmcnt(0)
806 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
807 ; GFX11-NEXT: s_mov_b32 s6, -1
808 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
809 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
810 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
811 ; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
812 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
813 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
814 ; GFX11-NEXT: s_nop 0
815 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
816 ; GFX11-NEXT: s_endpgm
817 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but never used; the store at the end of
; the body writes through %out itself.
818 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
819 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
820 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
821 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
822 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
823 %add = sub <2 x i16> %a, %b
824 %ext = zext <2 x i16> %add to <2 x i64>
825 store <2 x i64> %ext, ptr addrspace(1) %out
829 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: subtracts two <2 x i16> vectors and stores the result
; sign-extended to <2 x i32>.
;   %out - global (addrspace 1) pointer the <2 x i32> result is stored to
;   %in0 - global pointer to the minuend data, indexed by the workitem id
;   %in1 - global pointer to the subtrahend data, indexed by the workitem id
; What the assertions below pin down, per target of the llc invocations at the
; top of the file:
;   gfx900, gfx1010, gfx1100 - one packed v_pk_sub_i16, then the high half is
;     sign-extended with v_ashrrev_i32 by 16 and the low half with v_bfe_i32.
;   tonga - two v_sub_u16 (the high halves through SDWA WORD_1 selects), each
;     result sign-extended with v_bfe_i32.
830 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
831 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
833 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
834 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
835 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
836 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
837 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
838 ; GFX9-NEXT: s_waitcnt vmcnt(0)
839 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
840 ; GFX9-NEXT: s_waitcnt vmcnt(0)
841 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
842 ; GFX9-NEXT: s_mov_b32 s6, -1
843 ; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2
844 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
845 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
846 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
847 ; GFX9-NEXT: s_endpgm
849 ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
851 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
852 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
853 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
854 ; VI-NEXT: s_waitcnt lgkmcnt(0)
855 ; VI-NEXT: v_mov_b32_e32 v1, s7
856 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
857 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
858 ; VI-NEXT: v_mov_b32_e32 v3, s1
859 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
860 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
861 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
862 ; VI-NEXT: s_waitcnt vmcnt(0)
863 ; VI-NEXT: flat_load_dword v1, v[2:3] glc
864 ; VI-NEXT: s_waitcnt vmcnt(0)
865 ; VI-NEXT: s_mov_b32 s7, 0xf000
866 ; VI-NEXT: s_mov_b32 s6, -1
867 ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
868 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1
869 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
870 ; VI-NEXT: v_bfe_i32 v1, v2, 0, 16
871 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
874 ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32:
876 ; GFX10-NEXT: s_clause 0x1
877 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
878 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
879 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
880 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
881 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
882 ; GFX10-NEXT: s_waitcnt vmcnt(0)
883 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
884 ; GFX10-NEXT: s_waitcnt vmcnt(0)
885 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
886 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
887 ; GFX10-NEXT: s_mov_b32 s6, -1
888 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
889 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0
890 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
891 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
892 ; GFX10-NEXT: s_endpgm
894 ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32:
896 ; GFX11-NEXT: s_clause 0x1
897 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
898 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
899 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
900 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
901 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
902 ; GFX11-NEXT: s_waitcnt vmcnt(0)
903 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
904 ; GFX11-NEXT: s_waitcnt vmcnt(0)
905 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
906 ; GFX11-NEXT: s_mov_b32 s6, -1
907 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
908 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
909 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0
910 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
911 ; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
912 ; GFX11-NEXT: s_nop 0
913 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
914 ; GFX11-NEXT: s_endpgm
; Per-workitem index used to address both inputs.
915 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but has no user in this body; the store
; at the end writes through %out itself.
916 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
917 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
918 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; Both inputs are loaded volatile. The assertions above expect every load to
; carry glc (plus dlc on gfx1010/gfx1100) and to be followed by its own
; s_waitcnt vmcnt(0) - presumably a consequence of the volatile marker; confirm
; before relying on it.
919 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
920 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
; The value is named %add, but the operation is a subtraction (%a - %b).
921 %add = sub <2 x i16> %a, %b
; Sign-extend each 16-bit lane to 32 bits before storing.
922 %ext = sext <2 x i16> %add to <2 x i32>
923 store <2 x i32> %ext, ptr addrspace(1) %out
927 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel under test: subtracts two <2 x i16> vectors and stores the result
; sign-extended to <2 x i64>.
;   %out - global (addrspace 1) pointer the <2 x i64> result is stored to
;   %in0 - global pointer to the minuend data, indexed by the workitem id
;   %in1 - global pointer to the subtrahend data, indexed by the workitem id
; What the assertions below pin down, per target of the llc invocations at the
; top of the file:
;   gfx900, gfx1010, gfx1100 - one packed v_pk_sub_i16; the high half is moved
;     down with v_lshrrev_b32 by 16, both halves are sign-extended to 32 bits
;     with v_bfe_i32, and the upper dword of each i64 is produced with
;     v_ashrrev_i32 by 31.
;   tonga - two v_sub_u16 (the high halves through SDWA WORD_1 selects), then
;     the same v_bfe_i32 / v_ashrrev_i32 by 31 sequence.
; All targets are expected to write the four dwords with one 128-bit buffer
; store.
928 define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
929 ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
931 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
932 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
933 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
934 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
935 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
936 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
937 ; GFX9-NEXT: s_mov_b32 s7, 0xf000
938 ; GFX9-NEXT: s_mov_b32 s6, -1
939 ; GFX9-NEXT: s_waitcnt vmcnt(0)
940 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2
941 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
942 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16
943 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
944 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
945 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
946 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
947 ; GFX9-NEXT: s_endpgm
949 ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
951 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
952 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
953 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
954 ; VI-NEXT: s_waitcnt lgkmcnt(0)
955 ; VI-NEXT: v_mov_b32_e32 v1, s7
956 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
957 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
958 ; VI-NEXT: v_mov_b32_e32 v3, s1
959 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
960 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
961 ; VI-NEXT: flat_load_dword v0, v[0:1]
962 ; VI-NEXT: flat_load_dword v1, v[2:3]
963 ; VI-NEXT: s_mov_b32 s7, 0xf000
964 ; VI-NEXT: s_mov_b32 s6, -1
965 ; VI-NEXT: s_waitcnt vmcnt(0)
966 ; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
967 ; VI-NEXT: v_sub_u16_e32 v0, v0, v1
968 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
969 ; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
970 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
971 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
972 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
975 ; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64:
977 ; GFX10-NEXT: s_clause 0x1
978 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
979 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
980 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
981 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
982 ; GFX10-NEXT: s_clause 0x1
983 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
984 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
985 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
986 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000
987 ; GFX10-NEXT: s_mov_b32 s6, -1
988 ; GFX10-NEXT: s_waitcnt vmcnt(0)
989 ; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
990 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
991 ; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
992 ; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16
993 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
994 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
995 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
996 ; GFX10-NEXT: s_endpgm
998 ; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64:
1000 ; GFX11-NEXT: s_clause 0x1
1001 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
1002 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
1003 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1004 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1005 ; GFX11-NEXT: s_clause 0x1
1006 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1007 ; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
1008 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000
1009 ; GFX11-NEXT: s_mov_b32 s6, -1
1010 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1011 ; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
1012 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1013 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1014 ; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
1015 ; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
1016 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1017 ; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1018 ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
1019 ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
1020 ; GFX11-NEXT: s_nop 0
1021 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1022 ; GFX11-NEXT: s_endpgm
; Per-workitem index used to address both inputs.
1023 %tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but has no user in this body; the store
; at the end writes through %out itself.
1024 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
1025 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
1026 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
; These loads are not volatile. The assertions above expect the two loads
; back to back without glc/dlc and a single s_waitcnt vmcnt(0) placed just
; before the subtraction; gfx1010 and gfx1100 additionally expect an
; s_clause 0x1 in front of the load pair.
1027 %a = load <2 x i16>, ptr addrspace(1) %gep.in0
1028 %b = load <2 x i16>, ptr addrspace(1) %gep.in1
; The value is named %add, but the operation is a subtraction (%a - %b).
1029 %add = sub <2 x i16> %a, %b
; Sign-extend each 16-bit lane to 64 bits before storing.
1030 %ext = sext <2 x i16> %add to <2 x i64>
1031 store <2 x i64> %ext, ptr addrspace(1) %out
; Intrinsic the kernels call to obtain %tid; it takes no arguments and
; returns an i32.
1035 declare i32 @llvm.amdgcn.workitem.id.x() #0
; Attribute group #0 is attached to the intrinsic declaration above.
1037 attributes #0 = { nounwind readnone }
; Attribute group #1 is attached to the kernel definitions in this file.
1038 attributes #1 = { nounwind }