1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
3 ; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
; Scalar case: insert the constant 999 (0x3e7) into element 0 of a <2 x i16>
; loaded through an addrspace(2) pointer. The CIVI checks expect the high half
; to be kept with a 0xffff0000 mask and the constant or'ed in; the GFX9 check
; expects a single s_pack_lh_b32_b16.
5 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
6 ; GCN: s_load_dword [[VEC:s[0-9]+]]
8 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
9 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
12 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]]
13 define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
14 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
; Replace the low element only; element 1 of %vec must survive.
15 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
16 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case: insert the i16 kernel argument %elt into element 0 of a
; <2 x i16> loaded through an addrspace(2) pointer. The CIVI checks expect the
; argument masked to 0xffff, the vector masked to 0xffff0000, and the two
; or'ed together; the GFX9 check expects s_pack_lh_b32_b16.
20 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg:
21 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
22 ; GCN: s_load_dword [[VEC:s[0-9]+]]
24 ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
25 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
26 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
30 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
31 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
32 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
; The inserted value comes from a kernel argument rather than a constant.
33 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
34 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case: insert the i16 argument %elt into element 0 while element 1 of
; the original vector has a second use (zero-extended and passed to an inline
; asm with an "s" constraint). The checks expect the shifted-right high half to
; be materialized once and shared between the insert and the asm use; on GFX9
; the insert is expected to become s_pack_ll_b32_b16 of the two low halves.
38 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
39 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
40 ; GCN: s_load_dword [[VEC:s[0-9]+]]
42 ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
43 ; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
44 ; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
45 ; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
46 ; CIVI-DAG: ; use [[SHR]]
48 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
49 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
50 ; GFX9-DAG: ; use [[ELT1]]
51 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
52 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
; Extra use of the untouched high element.
53 %elt1 = extractelement <2 x i16> %vec, i32 1
54 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
55 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
56 %use1 = zext i16 %elt1 to i32
; The side-effecting asm keeps the zero-extended high element live in an SGPR.
57 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Scalar case: the inserted element is the high 16 bits of an i32 kernel
; argument (lshr by 16, then trunc), written to element 0. The GFX9 check
; expects this to fold into one s_pack_hh_b32_b16 taking the high halves of
; both the argument and the vector.
; NOTE(review): the CIVI or check consumes [[ELT_ARG]] directly and no shift
; check is visible here - confirm against actual llc output.
61 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
62 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
63 ; GCN: s_load_dword [[VEC:s[0-9]+]]
65 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
66 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
70 ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
71 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
72 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
; Take the upper half of the 32-bit argument as the i16 to insert.
73 %elt.hi = lshr i32 %elt.arg, 16
74 %elt = trunc i32 %elt.hi to i16
75 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
76 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case: same as the reghi test (insert the high 16 bits of an i32
; argument into element 0), but the truncated element also has a second use
; through an inline asm. The checks expect an explicit s_lshr_b32 of the
; argument whose result feeds both the insert and, on GFX9, the asm use.
80 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
81 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
82 ; GCN: s_load_dword [[VEC:s[0-9]+]],
84 ; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
85 ; CIVI-DAG: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
86 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], [[ELT0]]
88 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
89 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]]
90 ; GFX9: ; use [[ELT1]]
91 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
92 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
93 %elt.hi = lshr i32 %elt.arg, 16
94 %elt = trunc i32 %elt.hi to i16
95 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
96 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Second use of the inserted element, kept alive by the side-effecting asm.
97 %use1 = zext i16 %elt to i32
98 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
; Scalar case: the inserted element (high 16 bits of an i32 argument) and the
; preserved high element of the vector both have an additional use through
; inline asm. The checks expect both right-shifts to be materialized; on GFX9
; the insert is expected to be s_pack_ll_b32_b16 of the two shifted values,
; followed by the two asm uses.
102 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
103 ; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
104 ; GCN: s_load_dword [[VEC:s[0-9]+]],
106 ; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
107 ; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
108 ; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
109 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
111 ; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
112 ; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
113 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
114 ; GFX9: ; use [[ELT_HI]]
115 ; GFX9: ; use [[VEC_HI]]
116 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
117 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
118 %elt.hi = lshr i32 %elt.arg, 16
119 %elt = trunc i32 %elt.hi to i16
; Extra use of the vector's untouched high element.
120 %vec.hi = extractelement <2 x i16> %vec, i32 1
121 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
122 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
123 %use1 = zext i16 %elt to i32
124 %vec.hi.use1 = zext i16 %vec.hi to i32
; Two separate asm uses: first the inserted element, then the old high element.
126 call void asm sideeffect "; use $0", "s"(i32 %use1) #0
127 call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
; Scalar case: insert the constant 999 into element 1 (the high half) of a
; <2 x i16> loaded through an addrspace(2) pointer. The CIVI checks expect the
; low half kept with a 0xffff mask and or'ed with 0x3e70000 (999 << 16); the
; GFX9 check expects s_pack_ll_b32_b16 with the vector as the low operand.
131 ; GCN-LABEL: {{^}}s_insertelement_v2i16_1:
132 ; GCN: s_load_dword [[VEC:s[0-9]+]]
136 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
137 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
139 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7
140 define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
141 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
; Replace the high element only; element 0 of %vec must survive.
142 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
143 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case: insert the i16 kernel argument %elt into element 1 (the high
; half) of a <2 x i16> loaded through an addrspace(2) pointer. The GFX9 check
; expects s_pack_ll_b32_b16 with the vector low and the argument high.
147 ; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg:
148 ; GCN: s_load_dword [[ELT1:s[0-9]+]]
149 ; GCN: s_load_dword [[VEC:s[0-9]+]]
151 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
152 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
155 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
156 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
157 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
; The inserted value comes from a kernel argument rather than a constant.
158 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
159 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Scalar case: insert half 5.0 (bit pattern 0x4500) into element 0 of a
; <2 x half> loaded through an addrspace(2) pointer. The CIVI checks expect the
; high half kept with a 0xffff0000 mask and the constant or'ed in; the GFX9
; checks expect the high half shifted down and re-packed with the constant via
; s_pack_ll_b32_b16.
; The CIVI mask check reuses the [[VEC]] captured from the load (rather than
; defining a fresh capture) so that it verifies the mask is applied to the
; loaded vector, matching the i16 variant of this test.
163 ; GCN-LABEL: {{^}}s_insertelement_v2f16_0:
164 ; GCN: s_load_dword [[VEC:s[0-9]+]]
165 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000
166 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x4500
168 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
169 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]]
170 define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
171 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
; Replace the low element only; element 1 of %vec must survive.
172 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
173 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
; Scalar case: insert half 5.0 (bit pattern 0x4500) into element 1 (the high
; half) of a <2 x half> loaded through an addrspace(2) pointer. The CIVI
; checks expect the low half kept with a 0xffff mask and or'ed with 0x45000000
; (0x4500 << 16); the GFX9 check expects s_pack_ll_b32_b16.
177 ; GCN-LABEL: {{^}}s_insertelement_v2f16_1:
178 ; GCN: s_load_dword [[VEC:s[0-9]+]]
181 ; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
182 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
184 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500
185 define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
186 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
; Replace the high element only; element 0 of %vec must survive.
187 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
188 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
; Vector case: each work-item loads a <2 x i16> from %in indexed by its
; workitem id, inserts the constant 999 (0x3e7) into element 0, and stores the
; result to the matching slot of %out. The CIVI checks expect a v_and/v_or
; pair; the GFX9 checks expect v_bfi_b32 with a 0xffff mask. All targets are
; expected to store the register captured as [[RES]].
192 ; GCN-LABEL: {{^}}v_insertelement_v2i16_0:
193 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
194 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
195 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]]
197 ; GFX9-DAG: s_movk_i32 [[ELT0:s[0-9]+]], 0x3e7{{$}}
198 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
199 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]]
200 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
201 define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; Per-work-item addressing: the load address depends on the workitem id.
202 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
203 %tid.ext = sext i32 %tid to i64
204 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
205 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
206 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
207 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
208 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Vector case: each work-item loads a <2 x i16> indexed by its workitem id and
; inserts the high 16 bits of the i32 kernel argument %elt.arg into element 0.
; The CIVI checks expect a scalar shift of the argument followed by v_and/v_or;
; the GFX9 checks expect a vector shift and a single v_and_or_b32 with a
; 0xffff0000 mask.
212 ; GCN-LABEL: {{^}}v_insertelement_v2i16_0_reghi:
213 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
214 ; GCN-DAG: s_load_dword [[ELT0:s[0-9]+]]
216 ; CIVI-DAG: s_lshr_b32 [[ELT0_SHIFT:s[0-9]+]], [[ELT0]], 16
217 ; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
218 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]]
220 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff0000{{$}}
221 ; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]]
222 ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]]
224 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
225 define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
226 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
227 %tid.ext = sext i32 %tid to i64
228 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
229 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
230 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; Take the upper half of the 32-bit argument as the i16 to insert.
231 %elt.hi = lshr i32 %elt.arg, 16
232 %elt = trunc i32 %elt.hi to i16
233 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
234 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Vector case: same shape as v_insertelement_v2i16_0 but the inserted constant
; is 53, which the checks expect to appear directly as an instruction operand
; (no separate move of the constant into a register).
238 ; GCN-LABEL: {{^}}v_insertelement_v2i16_0_inlineimm:
239 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
241 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
242 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
244 ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
245 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]]
247 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
248 define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
249 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
250 %tid.ext = sext i32 %tid to i64
251 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
252 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
253 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; 53 is small enough that the checks expect it as an immediate operand.
254 %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
255 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Vector case: each work-item loads a <2 x i16> indexed by its workitem id and
; inserts the constant 999 into element 1 (the high half). Expected lowering
; differs per target: v_and/v_or with 0x3e70000 (999 << 16) on CI, an SDWA
; v_or on VI, and v_and plus v_lshl_or_b32 on GFX9 (the FIXME below refers to
; that GFX9 sequence).
259 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
261 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
262 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
263 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
265 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
266 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
267 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
269 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
270 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
271 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
273 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
274 define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
275 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
276 %tid.ext = sext i32 %tid to i64
277 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
278 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
279 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; Replace the high element only; element 0 of %vec must survive.
280 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
281 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Vector case: insert the constant -15 into element 1 (the high half) of a
; per-work-item <2 x i16>. On CI and VI the checks expect the pre-shifted
; literal 0xfff10000 (i16 -15 is 0xfff1, placed in the upper 16 bits); on GFX9
; the check expects -15 to appear directly as an operand of v_lshl_or_b32.
285 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1_inlineimm:
286 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xfff10000
287 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
288 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
289 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
290 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
291 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
292 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
293 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
294 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
295 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
296 %tid.ext = sext i32 %tid to i64
297 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
298 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
299 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; Negative small constant written to the high element.
300 %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
301 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Vector case: each work-item loads a <2 x half> indexed by its workitem id
; and inserts half 5.0 (bit pattern 0x4500) into element 0. The CIVI checks
; expect a v_and/v_or pair; the GFX9 checks expect the high half shifted down
; and recombined with the constant via v_lshl_or_b32.
305 ; GCN-LABEL: {{^}}v_insertelement_v2f16_0:
306 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
308 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
309 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]]
311 ; GFX9-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0x4500{{$}}
312 ; GFX9-DAG: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
313 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]]
315 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
316 define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
317 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
318 %tid.ext = sext i32 %tid to i64
319 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
320 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
321 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; Replace the low element only; element 1 of %vec must survive.
322 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
323 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Vector case: insert the half with bit pattern 0x0035 (decimal 53) into
; element 0 of a per-work-item <2 x half>. The checks expect 53 to appear
; directly as an instruction operand on every target.
327 ; GCN-LABEL: {{^}}v_insertelement_v2f16_0_inlineimm:
328 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
330 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
331 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
333 ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
334 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53
335 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
336 define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
337 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
338 %tid.ext = sext i32 %tid to i64
339 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
340 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
341 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; 0xH0035 is a raw half bit pattern; as an integer it is 53.
342 %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
343 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Vector case: insert half 5.0 (bit pattern 0x4500) into element 1 (the high
; half) of a per-work-item <2 x half>. Expected lowering differs per target:
; v_and/v_or with 0x45000000 (0x4500 << 16) on CI, an SDWA v_or on VI, and
; v_and plus v_lshl_or_b32 on GFX9.
347 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
348 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
349 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
351 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
352 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
353 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
355 ; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
356 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
358 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
360 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
361 define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
362 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
363 %tid.ext = sext i32 %tid to i64
364 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
365 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
366 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; Replace the high element only; element 0 of %vec must survive.
367 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
368 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Vector case: insert the half with bit pattern 0x0023 (decimal 35) into
; element 1 (the high half) of a per-work-item <2 x half>. On CI and VI the
; checks expect the pre-shifted literal 0x230000; on GFX9 the check expects 35
; to appear directly as an operand of v_lshl_or_b32.
372 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1_inlineimm:
373 ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x230000
374 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
375 ; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
376 ; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
377 ; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
378 ; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
379 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
380 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
381 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
382 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
383 %tid.ext = sext i32 %tid to i64
384 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
385 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
386 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; 0xH0023 is a raw half bit pattern; as an integer it is 35.
387 %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
388 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Scalar dynamic-index case: both the <2 x i16> vector and the insert index are
; loaded through addrspace(2) pointers, and the constant 999 (0x3e7) is
; inserted at that runtime index. The checks expect a 0xffff mask shifted by
; the scaled index and a v_bfi_b32 selecting between the constant and the
; original vector.
; NOTE(review): the checks expect the index to be scaled with a shift amount
; of 16; for 16-bit elements a multiply by 16 (shift by 4) might be expected -
; confirm against the intended lowering.
392 ; FIXME: Enable for others when argument load not split
393 ; GCN-LABEL: {{^}}s_insertelement_v2i16_dynamic:
394 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
395 ; GCN: s_load_dword [[IDX:s[0-9]+]]
396 ; GCN: s_load_dword [[VEC:s[0-9]+]]
397 ; GCN-DAG: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]]
398 ; GCN-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
399 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
400 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]]
401 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
402 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
; The index load is volatile and is issued before the vector load.
403 %idx = load volatile i32, i32 addrspace(2)* %idx.ptr
404 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
; Insert position is only known at run time.
405 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
406 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
; Vector data, scalar index: each work-item loads a <2 x i16> indexed by its
; workitem id, while the insert index is the i32 kernel argument %idx. The
; checks expect the mask to be built with scalar shifts and applied with
; v_bfi_b32 to the vector register holding the loaded value.
; NOTE(review): the checks expect the index to be scaled with a shift amount
; of 16; confirm this matches the intended lowering for 16-bit elements.
410 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_sgpr:
411 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
412 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
413 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
414 ; GCN-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
415 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
416 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
417 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
418 define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
419 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
420 %tid.ext = sext i32 %tid to i64
421 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
422 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
423 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
; Index comes from a kernel argument, so it is the same for every work-item.
424 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
425 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Vector data, vector index: each work-item loads both its <2 x i16> value and
; its insert index from memory indexed by the workitem id, then inserts the
; constant 999 (0x3e7) at that index. The checks expect the mask shift to be
; done with vector instructions (the 0xffff mask coming from an SGPR on
; GFX8/GFX9 and from a literal on CI) followed by v_bfi_b32.
; NOTE(review): the checks expect the index to be scaled with a shift amount
; of 16; confirm this matches the intended lowering for 16-bit elements.
429 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
430 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
431 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
433 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
434 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
436 ; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
437 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
438 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
440 ; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
441 ; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
443 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
444 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
445 define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
446 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
447 %tid.ext = sext i32 %tid to i64
448 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
449 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
450 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
; The index is loaded per work-item, before the vector itself.
451 %idx = load i32, i32 addrspace(1)* %idx.gep
452 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
453 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
454 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
; Vector data, vector index, half elements: each work-item loads both its
; <2 x half> value and its insert index from memory indexed by the workitem
; id, then inserts the half with bit pattern 0x1234 at that index. The expected
; instruction sequence mirrors the <2 x i16> dynamic-vgpr test, with 0x1234 as
; the inserted constant.
; NOTE(review): the checks expect the index to be scaled with a shift amount
; of 16; confirm this matches the intended lowering for 16-bit elements.
458 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
459 ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
460 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
462 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
463 ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
465 ; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
466 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
467 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
469 ; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
470 ; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
472 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
473 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
474 define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
475 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
476 %tid.ext = sext i32 %tid to i64
477 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
478 %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
479 %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
; The index is loaded per work-item, before the vector itself.
480 %idx = load i32, i32 addrspace(1)* %idx.gep
481 %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
; 0xH1234 is a raw half bit pattern used as a recognizable constant.
482 %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
483 store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
; Intrinsic called by the v_* tests above to obtain a per-work-item i32 id,
; which they sign-extend and use as the index into %in / %out.
487 declare i32 @llvm.amdgcn.workitem.id.x() #1
; Attribute group #0 is applied to every kernel definition and inline asm call
; in this file; #1 is applied to the workitem-id intrinsic and its call sites.
489 attributes #0 = { nounwind }
490 attributes #1 = { nounwind readnone }