; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,GFX9
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX89,VI

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v2, v0, v1
; VI-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s7
; GFX9-NEXT:    v_pk_sub_i16 v0, s6, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
; VI-NEXT:    s_load_dword s7, s[0:1], 0x0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s4, s6, 16
; VI-NEXT:    s_lshr_b32 s5, s7, 16
; VI-NEXT:    s_sub_i32 s4, s4, s5
; VI-NEXT:    s_sub_i32 s6, s6, s7
; VI-NEXT:    s_and_b32 s5, s6, 0xffff
; VI-NEXT:    s_lshl_b32 s4, s4, 16
; VI-NEXT:    s_or_b32 s4, s5, s4
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
  %add = sub <2 x i16> %a, %a
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; FIXME: VI should not scalarize arg access.
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_pk_sub_i16 v0, s2, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_lshr_b32 s3, s0, 16
; VI-NEXT:    s_sub_i32 s1, s1, s3
; VI-NEXT:    s_sub_i32 s0, s2, s0
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %add = sub <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s8, 0x1c8007b
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s8
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    v_mov_b32_e32 v2, 0xfffffe38
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u16_e32 v1, 0xffffff85, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s8, 0xfc21fcb3
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s8
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    v_mov_b32_e32 v2, 0x3df
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u16_e32 v1, 0x34d, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    v_mov_b32_e32 v2, 1
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u16_e32 v1, 1, v0
; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_add_u16_e32 v0, 0xffffffe0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; The high element gives fp
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_mov_b32 s8, 1.0
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, s8
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    v_mov_b32_e32 v2, 0xffffc080
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %add = sub <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v0, v1, v2
; VI-NEXT:    v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v0, v1
; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    flat_load_dword v4, v[4:5]
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_mov_b32_e32 v3, v1
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_u16_e32 v0, v2, v4
; VI-NEXT:    v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    global_load_dword v1, v[2:3], off
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v1
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_bfe_i32 v1, v2, 0, 16
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
  ret void
}
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT:    global_load_dword v2, v[2:3], off
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v0, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT:    v_bfe_i32 v0, v1, 0, 16
; GFX9-NEXT:    v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_u16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_sub_u16_e32 v0, v0, v2
; VI-NEXT:    v_bfe_i32 v2, v1, 0, 16
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
  %add = sub <2 x i16> %a, %b
  %ext = sext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
  ret void
}
; Intrinsic used by the v_test_* kernels to index per-thread elements.
declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }