1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; Test: insertelement with a runtime index into a <2 x i8> vector loaded
; through an addrspace(4) pointer. %ptr, %val and %idx are all marked inreg;
; the expected code below reads them from scalar registers (s[2:3], s4, s5).
; The expected sequences compare the index against 0 and 1, use v_cndmask to
; pick %val for the matching byte, then repack both bytes into 16 bits for the
; short store.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
8 define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 inreg %idx) {
9 ; GFX9-LABEL: insertelement_s_v2i8_s_s:
11 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
12 ; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
13 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
14 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
15 ; GFX9-NEXT: s_waitcnt vmcnt(0)
16 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
17 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
18 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
19 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
20 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
21 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
22 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
23 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
24 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
25 ; GFX9-NEXT: global_store_short v[0:1], v2, off
28 ; GFX8-LABEL: insertelement_s_v2i8_s_s:
30 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
31 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
32 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
33 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
34 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
35 ; GFX8-NEXT: s_waitcnt vmcnt(0)
36 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0
37 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
38 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
39 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
40 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
41 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
42 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
43 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
44 ; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
45 ; GFX8-NEXT: flat_store_short v[0:1], v2
48 ; GFX7-LABEL: insertelement_s_v2i8_s_s:
50 ; GFX7-NEXT: s_mov_b32 s0, s2
51 ; GFX7-NEXT: s_mov_b32 s1, s3
52 ; GFX7-NEXT: s_mov_b32 s2, -1
53 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
54 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
55 ; GFX7-NEXT: v_mov_b32_e32 v1, s4
56 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
57 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
58 ; GFX7-NEXT: s_waitcnt vmcnt(0)
59 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0
60 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
61 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
62 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
63 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
64 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
65 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
66 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
67 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
70 ; GFX10-LABEL: insertelement_s_v2i8_s_s:
72 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
73 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
74 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 0
75 ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
76 ; GFX10-NEXT: s_waitcnt vmcnt(0)
77 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
78 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s1
79 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
80 ; GFX10-NEXT: s_movk_i32 s0, 0xff
81 ; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
82 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
83 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
84 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
85 ; GFX10-NEXT: global_store_short v[0:1], v2, off
86 ; GFX10-NEXT: s_endpgm
88 ; GFX11-LABEL: insertelement_s_v2i8_s_s:
90 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
91 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
92 ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
93 ; GFX11-NEXT: s_waitcnt vmcnt(0)
94 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
95 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
96 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
97 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s5, 0
98 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
99 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
100 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s0
101 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
102 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
103 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
104 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
105 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
106 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
107 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
108 ; GFX11-NEXT: s_nop 0
109 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
110 ; GFX11-NEXT: s_endpgm
; IR under test: load the <2 x i8>, insert %val at runtime index %idx, and
; store the result through a null addrspace(1) pointer.
111 %vec = load <2 x i8>, ptr addrspace(4) %ptr
112 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
113 store <2 x i8> %insert, ptr addrspace(1) null
; Test: insertelement with a runtime index into a <2 x i8> vector loaded
; through an addrspace(1) pointer. %ptr is not inreg and is read from v[0:1]
; in the expected code; %val and %idx are inreg and are read from s2 and s3.
; The expected sequences compare the index against 0 and 1, use v_cndmask to
; pick %val for the matching byte, then repack both bytes into 16 bits for the
; short store.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
117 define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg %val, i32 inreg %idx) {
118 ; GFX9-LABEL: insertelement_v_v2i8_s_s:
120 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
121 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
122 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
123 ; GFX9-NEXT: s_waitcnt vmcnt(0)
124 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
125 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
126 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
127 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
128 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
129 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
130 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
131 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
132 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
133 ; GFX9-NEXT: global_store_short v[0:1], v2, off
134 ; GFX9-NEXT: s_endpgm
136 ; GFX8-LABEL: insertelement_v_v2i8_s_s:
138 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
139 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
140 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
141 ; GFX8-NEXT: s_waitcnt vmcnt(0)
142 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0
143 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
144 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
145 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
146 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
147 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
148 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
149 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
150 ; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
151 ; GFX8-NEXT: flat_store_short v[0:1], v2
152 ; GFX8-NEXT: s_endpgm
154 ; GFX7-LABEL: insertelement_v_v2i8_s_s:
156 ; GFX7-NEXT: s_mov_b32 s6, 0
157 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
158 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
159 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
160 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
161 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
162 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
163 ; GFX7-NEXT: s_mov_b32 s6, -1
164 ; GFX7-NEXT: s_waitcnt vmcnt(0)
165 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0
166 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
167 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
168 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
169 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
170 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
171 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
172 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
173 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
174 ; GFX7-NEXT: s_endpgm
176 ; GFX10-LABEL: insertelement_v_v2i8_s_s:
178 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
179 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
180 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 0
181 ; GFX10-NEXT: s_waitcnt vmcnt(0)
182 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
183 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s1
184 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
185 ; GFX10-NEXT: s_movk_i32 s0, 0xff
186 ; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
187 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
188 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
189 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
190 ; GFX10-NEXT: global_store_short v[0:1], v2, off
191 ; GFX10-NEXT: s_endpgm
193 ; GFX11-LABEL: insertelement_v_v2i8_s_s:
195 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
196 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
197 ; GFX11-NEXT: s_waitcnt vmcnt(0)
198 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
200 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
201 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 0
202 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
203 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
204 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
205 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
206 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
207 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
208 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
209 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
210 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
211 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
212 ; GFX11-NEXT: s_nop 0
213 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
214 ; GFX11-NEXT: s_endpgm
; IR under test: load the <2 x i8>, insert %val at runtime index %idx, and
; store the result through a null addrspace(1) pointer.
215 %vec = load <2 x i8>, ptr addrspace(1) %ptr
216 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
217 store <2 x i8> %insert, ptr addrspace(1) null
; Test: insertelement with a runtime index into a <2 x i8> vector loaded
; through an addrspace(4) pointer. %ptr and %idx are inreg and are read from
; s[2:3] and s4 in the expected code; %val is not inreg and is read from v0.
; The expected sequences compare the index against 0 and 1, use v_cndmask to
; pick %val for the matching byte, then repack both bytes into 16 bits for the
; short store.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
221 define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8 %val, i32 inreg %idx) {
222 ; GFX9-LABEL: insertelement_s_v2i8_v_s:
224 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
225 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
226 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
227 ; GFX9-NEXT: s_waitcnt vmcnt(0)
228 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1
229 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
230 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
231 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
232 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
233 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
234 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
235 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
236 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
237 ; GFX9-NEXT: global_store_short v[0:1], v2, off
238 ; GFX9-NEXT: s_endpgm
240 ; GFX8-LABEL: insertelement_s_v2i8_v_s:
242 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
243 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
244 ; GFX8-NEXT: flat_load_ushort v1, v[1:2]
245 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
246 ; GFX8-NEXT: s_waitcnt vmcnt(0)
247 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1
248 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc
249 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
250 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
251 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
252 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
253 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
254 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
255 ; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
256 ; GFX8-NEXT: flat_store_short v[0:1], v2
257 ; GFX8-NEXT: s_endpgm
259 ; GFX7-LABEL: insertelement_s_v2i8_v_s:
261 ; GFX7-NEXT: s_mov_b32 s0, s2
262 ; GFX7-NEXT: s_mov_b32 s1, s3
263 ; GFX7-NEXT: s_mov_b32 s2, -1
264 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
265 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
266 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
267 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
268 ; GFX7-NEXT: s_waitcnt vmcnt(0)
269 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v1
270 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
271 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
272 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
273 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
274 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
275 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
276 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
277 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
278 ; GFX7-NEXT: s_endpgm
280 ; GFX10-LABEL: insertelement_s_v2i8_v_s:
282 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
283 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
284 ; GFX10-NEXT: s_movk_i32 s0, 0xff
285 ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
286 ; GFX10-NEXT: s_waitcnt vmcnt(0)
287 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
288 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
289 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
290 ; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
291 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc_lo
292 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
293 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
294 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
295 ; GFX10-NEXT: global_store_short v[0:1], v2, off
296 ; GFX10-NEXT: s_endpgm
298 ; GFX11-LABEL: insertelement_s_v2i8_v_s:
300 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
301 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
302 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
303 ; GFX11-NEXT: s_waitcnt vmcnt(0)
304 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1
305 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
306 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
307 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 0
308 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, 0xff, v2
309 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
310 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
311 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
312 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
313 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
314 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
315 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
316 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
317 ; GFX11-NEXT: s_nop 0
318 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
319 ; GFX11-NEXT: s_endpgm
; IR under test: load the <2 x i8>, insert %val at runtime index %idx, and
; store the result through a null addrspace(1) pointer.
320 %vec = load <2 x i8>, ptr addrspace(4) %ptr
321 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
322 store <2 x i8> %insert, ptr addrspace(1) null
; Test: insertelement with a runtime index into a <2 x i8> vector loaded
; through an addrspace(4) pointer. %ptr and %val are inreg and are read from
; s[2:3] and s4 in the expected code; %idx is not inreg and is read from v0.
; The expected sequences compare the index against 0 and 1, use v_cndmask to
; pick %val for the matching byte, then repack both bytes into 16 bits for the
; short store.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
326 define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 %idx) {
327 ; GFX9-LABEL: insertelement_s_v2i8_s_v:
329 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
330 ; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
331 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
332 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
333 ; GFX9-NEXT: s_waitcnt vmcnt(0)
334 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
335 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
336 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
337 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
338 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
339 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
340 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
341 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
342 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
343 ; GFX9-NEXT: global_store_short v[0:1], v2, off
344 ; GFX9-NEXT: s_endpgm
346 ; GFX8-LABEL: insertelement_s_v2i8_s_v:
348 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
349 ; GFX8-NEXT: v_mov_b32_e32 v2, s3
350 ; GFX8-NEXT: flat_load_ushort v1, v[1:2]
351 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
352 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
353 ; GFX8-NEXT: s_waitcnt vmcnt(0)
354 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1
355 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc
356 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
357 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
358 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
359 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
360 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
361 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
362 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
363 ; GFX8-NEXT: flat_store_short v[0:1], v2
364 ; GFX8-NEXT: s_endpgm
366 ; GFX7-LABEL: insertelement_s_v2i8_s_v:
368 ; GFX7-NEXT: s_mov_b32 s0, s2
369 ; GFX7-NEXT: s_mov_b32 s1, s3
370 ; GFX7-NEXT: s_mov_b32 s2, -1
371 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
372 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
373 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
374 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
375 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
376 ; GFX7-NEXT: s_waitcnt vmcnt(0)
377 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1
378 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
379 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
380 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
381 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
382 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
383 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
384 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
385 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
386 ; GFX7-NEXT: s_endpgm
388 ; GFX10-LABEL: insertelement_s_v2i8_s_v:
390 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
391 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
392 ; GFX10-NEXT: s_movk_i32 s0, 0xff
393 ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
394 ; GFX10-NEXT: s_waitcnt vmcnt(0)
395 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
396 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
397 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
398 ; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
399 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo
400 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
401 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
402 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
403 ; GFX10-NEXT: global_store_short v[0:1], v2, off
404 ; GFX10-NEXT: s_endpgm
406 ; GFX11-LABEL: insertelement_s_v2i8_s_v:
408 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
409 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
410 ; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
411 ; GFX11-NEXT: s_waitcnt vmcnt(0)
412 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1
413 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
414 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
415 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
416 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo
417 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
418 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
419 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
420 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
421 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
422 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
423 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
424 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
425 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
426 ; GFX11-NEXT: s_nop 0
427 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
428 ; GFX11-NEXT: s_endpgm
; IR under test: load the <2 x i8>, insert %val at runtime index %idx, and
; store the result through a null addrspace(1) pointer.
429 %vec = load <2 x i8>, ptr addrspace(4) %ptr
430 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
431 store <2 x i8> %insert, ptr addrspace(1) null
; Test: insertelement with a runtime index into a <2 x i8> vector loaded
; through an addrspace(4) pointer. Only %ptr is inreg and is read from s[2:3]
; in the expected code; %val and %idx are read from v0 and v1.
; The expected sequences compare the index against 0 and 1, use v_cndmask to
; pick %val for the matching byte, then repack both bytes into 16 bits for the
; short store.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
435 define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8 %val, i32 %idx) {
436 ; GFX9-LABEL: insertelement_s_v2i8_v_v:
438 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
439 ; GFX9-NEXT: global_load_ushort v2, v2, s[2:3]
440 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
441 ; GFX9-NEXT: s_waitcnt vmcnt(0)
442 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
443 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
444 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
445 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
446 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
447 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v0
448 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
449 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
450 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
451 ; GFX9-NEXT: global_store_short v[0:1], v2, off
452 ; GFX9-NEXT: s_endpgm
454 ; GFX8-LABEL: insertelement_s_v2i8_v_v:
456 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
457 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
458 ; GFX8-NEXT: flat_load_ushort v2, v[2:3]
459 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
460 ; GFX8-NEXT: s_waitcnt vmcnt(0)
461 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2
462 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
463 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
464 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
465 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
466 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v0
467 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
468 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
469 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
470 ; GFX8-NEXT: flat_store_short v[0:1], v2
471 ; GFX8-NEXT: s_endpgm
473 ; GFX7-LABEL: insertelement_s_v2i8_v_v:
475 ; GFX7-NEXT: s_mov_b32 s0, s2
476 ; GFX7-NEXT: s_mov_b32 s1, s3
477 ; GFX7-NEXT: s_mov_b32 s2, -1
478 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
479 ; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
480 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
481 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
482 ; GFX7-NEXT: s_waitcnt vmcnt(0)
483 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v2
484 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
485 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
486 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
487 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
488 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
489 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
490 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
491 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
492 ; GFX7-NEXT: s_endpgm
494 ; GFX10-LABEL: insertelement_s_v2i8_v_v:
496 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
497 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
498 ; GFX10-NEXT: s_movk_i32 s0, 0xff
499 ; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
500 ; GFX10-NEXT: s_waitcnt vmcnt(0)
501 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
502 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
503 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
504 ; GFX10-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
505 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
506 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
507 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
508 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
509 ; GFX10-NEXT: global_store_short v[0:1], v2, off
510 ; GFX10-NEXT: s_endpgm
512 ; GFX11-LABEL: insertelement_s_v2i8_v_v:
514 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
515 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
516 ; GFX11-NEXT: global_load_u16 v2, v2, s[2:3]
517 ; GFX11-NEXT: s_waitcnt vmcnt(0)
518 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2
519 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
520 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
521 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
522 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xff, v3
523 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
524 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
525 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
526 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
527 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
528 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
529 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
530 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
531 ; GFX11-NEXT: s_nop 0
532 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
533 ; GFX11-NEXT: s_endpgm
; IR under test: load the <2 x i8>, insert %val at runtime index %idx, and
; store the result through a null addrspace(1) pointer.
534 %vec = load <2 x i8>, ptr addrspace(4) %ptr
535 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
536 store <2 x i8> %insert, ptr addrspace(1) null
; Test: insertelement with a runtime index into a <2 x i8> vector loaded
; through an addrspace(1) pointer. Only %val is inreg and is read from s2 in
; the expected code; %ptr is read from v[0:1] and %idx from v2.
; The expected sequences compare the index against 0 and 1, use v_cndmask to
; pick %val for the matching byte, then repack both bytes into 16 bits for the
; short store.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
540 define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg %val, i32 %idx) {
541 ; GFX9-LABEL: insertelement_v_v2i8_s_v:
543 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
544 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
545 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
546 ; GFX9-NEXT: s_waitcnt vmcnt(0)
547 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
548 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
549 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
550 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
551 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
552 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
553 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
554 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
555 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
556 ; GFX9-NEXT: global_store_short v[0:1], v2, off
557 ; GFX9-NEXT: s_endpgm
559 ; GFX8-LABEL: insertelement_v_v2i8_s_v:
561 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
562 ; GFX8-NEXT: v_mov_b32_e32 v1, s2
563 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
564 ; GFX8-NEXT: s_waitcnt vmcnt(0)
565 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
566 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
567 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
568 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
569 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
570 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
571 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
572 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
573 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
574 ; GFX8-NEXT: flat_store_short v[0:1], v2
575 ; GFX8-NEXT: s_endpgm
577 ; GFX7-LABEL: insertelement_v_v2i8_s_v:
579 ; GFX7-NEXT: s_mov_b32 s6, 0
580 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
581 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
582 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
583 ; GFX7-NEXT: v_mov_b32_e32 v1, s2
584 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
585 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
586 ; GFX7-NEXT: s_mov_b32 s6, -1
587 ; GFX7-NEXT: s_waitcnt vmcnt(0)
588 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0
589 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
590 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
591 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
592 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
593 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
594 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
595 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
596 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
597 ; GFX7-NEXT: s_endpgm
599 ; GFX10-LABEL: insertelement_v_v2i8_s_v:
601 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
602 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
603 ; GFX10-NEXT: s_movk_i32 s0, 0xff
604 ; GFX10-NEXT: s_waitcnt vmcnt(0)
605 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
606 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
607 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
608 ; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
609 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, vcc_lo
610 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
611 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
612 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
613 ; GFX10-NEXT: global_store_short v[0:1], v2, off
614 ; GFX10-NEXT: s_endpgm
616 ; GFX11-LABEL: insertelement_v_v2i8_s_v:
618 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
619 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
620 ; GFX11-NEXT: s_waitcnt vmcnt(0)
621 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
622 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
623 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
624 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
625 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
626 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
627 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
628 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
629 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
630 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
631 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
632 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
633 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
634 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
635 ; GFX11-NEXT: s_nop 0
636 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
637 ; GFX11-NEXT: s_endpgm
; IR under test: load the <2 x i8>, insert %val at runtime index %idx, and
; store the result through a null addrspace(1) pointer.
638 %vec = load <2 x i8>, ptr addrspace(1) %ptr
639 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
640 store <2 x i8> %insert, ptr addrspace(1) null
; Test: insertelement with a runtime index into a <2 x i8> vector loaded
; through an addrspace(1) pointer. Only %idx is inreg and is read from s2 in
; the expected code; %ptr is read from v[0:1] and %val from v2.
; The expected sequences compare the index against 0 and 1, use v_cndmask to
; pick %val for the matching byte, then repack both bytes into 16 bits for the
; short store.
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
644 define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val, i32 inreg %idx) {
645 ; GFX9-LABEL: insertelement_v_v2i8_v_s:
647 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
648 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
649 ; GFX9-NEXT: s_waitcnt vmcnt(0)
650 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
651 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
652 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
653 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
654 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
655 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
656 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
657 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
658 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
659 ; GFX9-NEXT: global_store_short v[0:1], v2, off
660 ; GFX9-NEXT: s_endpgm
662 ; GFX8-LABEL: insertelement_v_v2i8_v_s:
664 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
665 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
666 ; GFX8-NEXT: s_waitcnt vmcnt(0)
667 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0
668 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc
669 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
670 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
671 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
672 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
673 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
674 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
675 ; GFX8-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
676 ; GFX8-NEXT: flat_store_short v[0:1], v2
677 ; GFX8-NEXT: s_endpgm
679 ; GFX7-LABEL: insertelement_v_v2i8_v_s:
681 ; GFX7-NEXT: s_mov_b32 s6, 0
682 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
683 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
684 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
685 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
686 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
687 ; GFX7-NEXT: s_mov_b32 s6, -1
688 ; GFX7-NEXT: s_waitcnt vmcnt(0)
689 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0
690 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
691 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
692 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
693 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
694 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
695 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
696 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
697 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
698 ; GFX7-NEXT: s_endpgm
700 ; GFX10-LABEL: insertelement_v_v2i8_v_s:
702 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
703 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
704 ; GFX10-NEXT: s_movk_i32 s0, 0xff
705 ; GFX10-NEXT: s_waitcnt vmcnt(0)
706 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
707 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
708 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
709 ; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
710 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
711 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
712 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
713 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
714 ; GFX10-NEXT: global_store_short v[0:1], v2, off
715 ; GFX10-NEXT: s_endpgm
717 ; GFX11-LABEL: insertelement_v_v2i8_v_s:
719 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
720 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
721 ; GFX11-NEXT: s_waitcnt vmcnt(0)
722 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
723 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
724 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
725 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
726 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
727 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
728 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
729 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
730 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
731 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
732 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
733 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
734 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
735 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
736 ; GFX11-NEXT: s_nop 0
737 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
738 ; GFX11-NEXT: s_endpgm
; IR under test: load the <2 x i8>, insert %val at runtime index %idx, and
; store the result through a null addrspace(1) pointer.
739 %vec = load <2 x i8>, ptr addrspace(1) %ptr
740 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
741 store <2 x i8> %insert, ptr addrspace(1) null
; insertelement_v_v2i8_v_v: load a <2 x i8> from the addrspace(1) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. No argument is marked inreg; the checks below read the
; pointer from v[0:1], the value from v2 and the index from v3.
; Expected lowering: the index is compared against 0 and 1 and each byte is
; chosen with v_cndmask, then the two bytes are masked, shifted and OR-ed back
; into one 16-bit value for the store.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
745 define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val, i32 %idx) {
746 ; GFX9-LABEL: insertelement_v_v2i8_v_v:
748 ; GFX9-NEXT: global_load_ushort v0, v[0:1], off
749 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
750 ; GFX9-NEXT: s_waitcnt vmcnt(0)
751 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
752 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
753 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
754 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
755 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
756 ; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v0
757 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
758 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
759 ; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
760 ; GFX9-NEXT: global_store_short v[0:1], v2, off
761 ; GFX9-NEXT: s_endpgm
763 ; GFX8-LABEL: insertelement_v_v2i8_v_v:
765 ; GFX8-NEXT: flat_load_ushort v0, v[0:1]
766 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
767 ; GFX8-NEXT: s_waitcnt vmcnt(0)
768 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0
769 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
770 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
771 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
772 ; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
773 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v0
774 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
775 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
776 ; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
777 ; GFX8-NEXT: flat_store_short v[0:1], v2
778 ; GFX8-NEXT: s_endpgm
780 ; GFX7-LABEL: insertelement_v_v2i8_v_v:
782 ; GFX7-NEXT: s_mov_b32 s2, 0
783 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
784 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
785 ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
786 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
787 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
788 ; GFX7-NEXT: s_mov_b32 s2, -1
789 ; GFX7-NEXT: s_waitcnt vmcnt(0)
790 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0
791 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
792 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
793 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
794 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v1
795 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
796 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
797 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
798 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
799 ; GFX7-NEXT: s_endpgm
801 ; GFX10-LABEL: insertelement_v_v2i8_v_v:
803 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
804 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
805 ; GFX10-NEXT: s_movk_i32 s0, 0xff
806 ; GFX10-NEXT: s_waitcnt vmcnt(0)
807 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
808 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
809 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
810 ; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
811 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo
812 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
813 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
814 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
815 ; GFX10-NEXT: global_store_short v[0:1], v2, off
816 ; GFX10-NEXT: s_endpgm
818 ; GFX11-LABEL: insertelement_v_v2i8_v_v:
820 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off
821 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
822 ; GFX11-NEXT: s_waitcnt vmcnt(0)
823 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0
824 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
825 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
826 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
827 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
828 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
829 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
830 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
831 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v1
832 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
833 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
834 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
835 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3
836 ; GFX11-NEXT: global_store_b16 v[0:1], v2, off
837 ; GFX11-NEXT: s_nop 0
838 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
839 ; GFX11-NEXT: s_endpgm
840 %vec = load <2 x i8>, ptr addrspace(1) %ptr
841 %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx
842 store <2 x i8> %insert, ptr addrspace(1) null
846 ; FIXME: The <3 x i8> tests below are commented out pending 3-element load/store legalization.
847 ; define amdgpu_ps void @insertelement_s_v3i8_s_s(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 inreg %idx) {
848 ; %vec = load <3 x i8>, ptr addrspace(4) %ptr
849 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
850 ; store <3 x i8> %insert, ptr addrspace(1) null
854 ; define amdgpu_ps void @insertelement_v_v3i8_s_s(ptr addrspace(1) %ptr, i8 inreg %val, i32 inreg %idx) {
855 ; %vec = load <3 x i8>, ptr addrspace(1) %ptr
856 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
857 ; store <3 x i8> %insert, ptr addrspace(1) null
861 ; define amdgpu_ps void @insertelement_s_v3i8_v_s(ptr addrspace(4) inreg %ptr, i8 %val, i32 inreg %idx) {
862 ; %vec = load <3 x i8>, ptr addrspace(4) %ptr
863 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
864 ; store <3 x i8> %insert, ptr addrspace(1) null
868 ; define amdgpu_ps void @insertelement_s_v3i8_s_v(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 %idx) {
869 ; %vec = load <3 x i8>, ptr addrspace(4) %ptr
870 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
871 ; store <3 x i8> %insert, ptr addrspace(1) null
875 ; define amdgpu_ps void @insertelement_s_v3i8_v_v(ptr addrspace(4) inreg %ptr, i8 %val, i32 %idx) {
876 ; %vec = load <3 x i8>, ptr addrspace(4) %ptr
877 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
878 ; store <3 x i8> %insert, ptr addrspace(1) null
882 ; define amdgpu_ps void @insertelement_v_v3i8_s_v(ptr addrspace(1) %ptr, i8 inreg %val, i32 %idx) {
883 ; %vec = load <3 x i8>, ptr addrspace(1) %ptr
884 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
885 ; store <3 x i8> %insert, ptr addrspace(1) null
889 ; define amdgpu_ps void @insertelement_v_v3i8_v_s(ptr addrspace(1) %ptr, i8 %val, i32 inreg %idx) {
890 ; %vec = load <3 x i8>, ptr addrspace(1) %ptr
891 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
892 ; store <3 x i8> %insert, ptr addrspace(1) null
896 ; define amdgpu_ps void @insertelement_v_v3i8_v_v(ptr addrspace(1) %ptr, i8 %val, i32 %idx) {
897 ; %vec = load <3 x i8>, ptr addrspace(1) %ptr
898 ; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx
899 ; store <3 x i8> %insert, ptr addrspace(1) null
; insertelement_v_v4i8_s_s: load a <4 x i8> from the addrspace(1) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. %val and %idx are inreg; the checks below read the
; pointer from v[0:1], the value from s2 and the index from s3.
; Expected lowering: the index is masked with 3 and shifted left by 3 to form a
; bit offset; 0xff shifted by that offset is inverted and used to clear the
; selected byte of the loaded dword, and (%val & 0xff) shifted by the same
; offset is OR-ed in.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
903 define amdgpu_ps void @insertelement_v_v4i8_s_s(ptr addrspace(1) %ptr, i8 inreg %val, i32 inreg %idx) {
904 ; GFX9-LABEL: insertelement_v_v4i8_s_s:
906 ; GFX9-NEXT: global_load_dword v2, v[0:1], off
907 ; GFX9-NEXT: s_and_b32 s0, s3, 3
908 ; GFX9-NEXT: s_lshl_b32 s0, s0, 3
909 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff
910 ; GFX9-NEXT: s_lshl_b32 s1, s1, s0
911 ; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0
912 ; GFX9-NEXT: s_not_b32 s0, s0
913 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
914 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
915 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
916 ; GFX9-NEXT: s_waitcnt vmcnt(0)
917 ; GFX9-NEXT: v_and_or_b32 v2, v2, s0, v3
918 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
919 ; GFX9-NEXT: s_endpgm
921 ; GFX8-LABEL: insertelement_v_v4i8_s_s:
923 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
924 ; GFX8-NEXT: s_and_b32 s0, s3, 3
925 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff
926 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3
927 ; GFX8-NEXT: s_lshl_b32 s1, s1, s0
928 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
929 ; GFX8-NEXT: s_not_b32 s0, s0
930 ; GFX8-NEXT: s_waitcnt vmcnt(0)
931 ; GFX8-NEXT: v_and_b32_e32 v2, s0, v0
932 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
933 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
934 ; GFX8-NEXT: v_or_b32_e32 v2, s1, v2
935 ; GFX8-NEXT: flat_store_dword v[0:1], v2
936 ; GFX8-NEXT: s_endpgm
938 ; GFX7-LABEL: insertelement_v_v4i8_s_s:
940 ; GFX7-NEXT: s_mov_b32 s6, 0
941 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
942 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
943 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
944 ; GFX7-NEXT: s_and_b32 s0, s3, 3
945 ; GFX7-NEXT: s_and_b32 s1, s2, 0xff
946 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3
947 ; GFX7-NEXT: s_lshl_b32 s1, s1, s0
948 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
949 ; GFX7-NEXT: s_not_b32 s0, s0
950 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
951 ; GFX7-NEXT: s_mov_b32 s6, -1
952 ; GFX7-NEXT: s_waitcnt vmcnt(0)
953 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
954 ; GFX7-NEXT: v_or_b32_e32 v0, s1, v0
955 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
956 ; GFX7-NEXT: s_endpgm
958 ; GFX10-LABEL: insertelement_v_v4i8_s_s:
960 ; GFX10-NEXT: global_load_dword v2, v[0:1], off
961 ; GFX10-NEXT: s_and_b32 s0, s3, 3
962 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff
963 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3
964 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
965 ; GFX10-NEXT: s_lshl_b32 s2, 0xff, s0
966 ; GFX10-NEXT: s_lshl_b32 s0, s1, s0
967 ; GFX10-NEXT: s_not_b32 s1, s2
968 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
969 ; GFX10-NEXT: s_waitcnt vmcnt(0)
970 ; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0
971 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
972 ; GFX10-NEXT: s_endpgm
974 ; GFX11-LABEL: insertelement_v_v4i8_s_s:
976 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off
977 ; GFX11-NEXT: s_and_b32 s0, s3, 3
978 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff
979 ; GFX11-NEXT: s_lshl_b32 s0, s0, 3
980 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
981 ; GFX11-NEXT: s_lshl_b32 s2, 0xff, s0
982 ; GFX11-NEXT: s_lshl_b32 s0, s1, s0
983 ; GFX11-NEXT: s_not_b32 s1, s2
984 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
985 ; GFX11-NEXT: s_waitcnt vmcnt(0)
986 ; GFX11-NEXT: v_and_or_b32 v2, v2, s1, s0
987 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
988 ; GFX11-NEXT: s_nop 0
989 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
990 ; GFX11-NEXT: s_endpgm
991 %vec = load <4 x i8>, ptr addrspace(1) %ptr
992 %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
993 store <4 x i8> %insert, ptr addrspace(1) null
; insertelement_s_v4i8_v_s: load a <4 x i8> from the addrspace(4) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. %ptr and %idx are inreg, %val is not; the checks below
; read the pointer from s[2:3], the value from v0 and the index from s4.
; Expected lowering: the index is masked with 3 and shifted left by 3 to form a
; bit offset; 0xff shifted by that offset clears the selected byte of the
; loaded dword, and the low byte of %val shifted by the same offset is OR-ed in.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
997 define amdgpu_ps void @insertelement_s_v4i8_v_s(ptr addrspace(4) inreg %ptr, i8 %val, i32 inreg %idx) {
998 ; GFX9-LABEL: insertelement_s_v4i8_v_s:
1000 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1001 ; GFX9-NEXT: s_and_b32 s1, s4, 3
1002 ; GFX9-NEXT: s_lshl_b32 s1, s1, 3
1003 ; GFX9-NEXT: s_lshl_b32 s2, 0xff, s1
1004 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0
1005 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1006 ; GFX9-NEXT: s_andn2_b32 s0, s0, s2
1007 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
1008 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1009 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1010 ; GFX9-NEXT: v_lshl_or_b32 v2, v2, s1, v3
1011 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1012 ; GFX9-NEXT: s_endpgm
1014 ; GFX8-LABEL: insertelement_s_v4i8_v_s:
1016 ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
1017 ; GFX8-NEXT: s_and_b32 s1, s4, 3
1018 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3
1019 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1020 ; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
1021 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1022 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1023 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1
1024 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1025 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
1026 ; GFX8-NEXT: v_or_b32_e32 v2, s0, v2
1027 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1028 ; GFX8-NEXT: s_endpgm
1030 ; GFX7-LABEL: insertelement_s_v4i8_v_s:
1032 ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
1033 ; GFX7-NEXT: s_and_b32 s1, s4, 3
1034 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3
1035 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
1036 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0
1037 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
1038 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1039 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1
1040 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
1041 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
1042 ; GFX7-NEXT: s_mov_b32 s2, -1
1043 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1044 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1045 ; GFX7-NEXT: s_endpgm
1047 ; GFX10-LABEL: insertelement_s_v4i8_v_s:
1049 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
1050 ; GFX10-NEXT: s_and_b32 s1, s4, 3
1051 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0
1052 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3
1053 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1054 ; GFX10-NEXT: s_lshl_b32 s2, 0xff, s1
1055 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1056 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1057 ; GFX10-NEXT: s_andn2_b32 s0, s0, s2
1058 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, s1, s0
1059 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1060 ; GFX10-NEXT: s_endpgm
1062 ; GFX11-LABEL: insertelement_s_v4i8_v_s:
1064 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
1065 ; GFX11-NEXT: s_and_b32 s1, s4, 3
1066 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
1067 ; GFX11-NEXT: s_lshl_b32 s1, s1, 3
1068 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1069 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1070 ; GFX11-NEXT: s_lshl_b32 s2, 0xff, s1
1071 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1072 ; GFX11-NEXT: s_and_not1_b32 s0, s0, s2
1073 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1074 ; GFX11-NEXT: v_lshl_or_b32 v2, v2, s1, s0
1075 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1076 ; GFX11-NEXT: s_nop 0
1077 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1078 ; GFX11-NEXT: s_endpgm
1079 %vec = load <4 x i8>, ptr addrspace(4) %ptr
1080 %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
1081 store <4 x i8> %insert, ptr addrspace(1) null
; insertelement_s_v4i8_s_v: load a <4 x i8> from the addrspace(4) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. %ptr and %val are inreg, %idx is not; the checks below
; read the pointer from s[2:3], the value from s4 and the index from v0.
; Expected lowering: the index is masked with 3 and shifted left by 3 to form a
; bit offset; 0xff shifted by that offset is inverted and used to clear the
; selected byte of the loaded dword, and (%val & 0xff) shifted by the same
; offset is OR-ed in.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
1085 define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 %idx) {
1086 ; GFX9-LABEL: insertelement_s_v4i8_s_v:
1088 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1089 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
1090 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1091 ; GFX9-NEXT: s_movk_i32 s1, 0xff
1092 ; GFX9-NEXT: s_and_b32 s2, s4, 0xff
1093 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2
1094 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1
1095 ; GFX9-NEXT: v_not_b32_e32 v3, v0
1096 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1097 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1098 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1099 ; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2
1100 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1101 ; GFX9-NEXT: s_endpgm
1103 ; GFX8-LABEL: insertelement_s_v4i8_s_v:
1105 ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
1106 ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
1107 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1108 ; GFX8-NEXT: s_movk_i32 s1, 0xff
1109 ; GFX8-NEXT: s_and_b32 s2, s4, 0xff
1110 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2
1111 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1
1112 ; GFX8-NEXT: v_not_b32_e32 v0, v0
1113 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1114 ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
1115 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1116 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
1117 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
1118 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1119 ; GFX8-NEXT: s_endpgm
1121 ; GFX7-LABEL: insertelement_s_v4i8_s_v:
1123 ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
1124 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
1125 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1126 ; GFX7-NEXT: s_and_b32 s1, s4, 0xff
1127 ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0
1128 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
1129 ; GFX7-NEXT: v_not_b32_e32 v0, v0
1130 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1131 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
1132 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
1133 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
1134 ; GFX7-NEXT: s_mov_b32 s2, -1
1135 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1136 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1137 ; GFX7-NEXT: s_endpgm
1139 ; GFX10-LABEL: insertelement_s_v4i8_s_v:
1141 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
1142 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
1143 ; GFX10-NEXT: s_and_b32 s1, s4, 0xff
1144 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1145 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
1146 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1
1147 ; GFX10-NEXT: v_not_b32_e32 v3, v1
1148 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1149 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1150 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1151 ; GFX10-NEXT: v_and_or_b32 v2, s0, v3, v2
1152 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1153 ; GFX10-NEXT: s_endpgm
1155 ; GFX11-LABEL: insertelement_s_v4i8_s_v:
1157 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
1158 ; GFX11-NEXT: v_and_b32_e32 v0, 3, v0
1159 ; GFX11-NEXT: s_and_b32 s1, s4, 0xff
1160 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1161 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1162 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
1163 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s1
1164 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1165 ; GFX11-NEXT: v_not_b32_e32 v3, v1
1166 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1167 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1168 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX11-NEXT: v_and_or_b32 v2, s0, v3, v2
1170 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1171 ; GFX11-NEXT: s_nop 0
1172 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1173 ; GFX11-NEXT: s_endpgm
1174 %vec = load <4 x i8>, ptr addrspace(4) %ptr
1175 %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
1176 store <4 x i8> %insert, ptr addrspace(1) null
; insertelement_s_v4i8_v_v: load a <4 x i8> from the addrspace(4) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. Only %ptr is inreg; the checks below read the pointer
; from s[2:3], the value from v0 and the index from v1.
; Expected lowering: the index is masked with 3 and shifted left by 3 to form a
; bit offset; 0xff shifted by that offset is inverted and used to clear the
; selected byte of the loaded dword, and the low byte of %val shifted by the
; same offset is OR-ed in.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
1180 define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8 %val, i32 %idx) {
1181 ; GFX9-LABEL: insertelement_s_v4i8_v_v:
1183 ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
1184 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1
1185 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1186 ; GFX9-NEXT: s_movk_i32 s1, 0xff
1187 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1188 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v1, s1
1189 ; GFX9-NEXT: v_not_b32_e32 v3, v0
1190 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1191 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1192 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1193 ; GFX9-NEXT: v_and_or_b32 v2, s0, v3, v2
1194 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1195 ; GFX9-NEXT: s_endpgm
1197 ; GFX8-LABEL: insertelement_s_v4i8_v_v:
1199 ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
1200 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
1201 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1202 ; GFX8-NEXT: s_movk_i32 s1, 0xff
1203 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1204 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v1, s1
1205 ; GFX8-NEXT: v_not_b32_e32 v0, v0
1206 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1207 ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
1208 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1209 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
1210 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
1211 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1212 ; GFX8-NEXT: s_endpgm
1214 ; GFX7-LABEL: insertelement_s_v4i8_v_v:
1216 ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0
1217 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
1218 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1219 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
1220 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
1221 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
1222 ; GFX7-NEXT: v_not_b32_e32 v1, v1
1223 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1224 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v1
1225 ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
1226 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
1227 ; GFX7-NEXT: s_mov_b32 s2, -1
1228 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1229 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1230 ; GFX7-NEXT: s_endpgm
1232 ; GFX10-LABEL: insertelement_s_v4i8_v_v:
1234 ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
1235 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1
1236 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1237 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
1238 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1239 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1240 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1241 ; GFX10-NEXT: v_not_b32_e32 v2, v2
1242 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1243 ; GFX10-NEXT: v_and_or_b32 v2, s0, v2, v3
1244 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1245 ; GFX10-NEXT: s_endpgm
1247 ; GFX11-LABEL: insertelement_s_v4i8_v_v:
1249 ; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
1250 ; GFX11-NEXT: v_and_b32_e32 v1, 3, v1
1251 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
1252 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1253 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1254 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
1255 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1256 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
1257 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1258 ; GFX11-NEXT: v_not_b32_e32 v2, v2
1259 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1260 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1261 ; GFX11-NEXT: v_and_or_b32 v2, s0, v2, v3
1262 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1263 ; GFX11-NEXT: s_nop 0
1264 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1265 ; GFX11-NEXT: s_endpgm
1266 %vec = load <4 x i8>, ptr addrspace(4) %ptr
1267 %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
1268 store <4 x i8> %insert, ptr addrspace(1) null
; insertelement_v_v4i8_s_v: load a <4 x i8> from the addrspace(1) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. Only %val is inreg; the checks below read the pointer
; from v[0:1], the value from s2 and the index from v2.
; Expected lowering: the index is masked with 3 and shifted left by 3 to form a
; bit offset; 0xff shifted by that offset is inverted and used to clear the
; selected byte of the loaded dword, and (%val & 0xff) shifted by the same
; offset is OR-ed in.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
1272 define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg %val, i32 %idx) {
1273 ; GFX9-LABEL: insertelement_v_v4i8_s_v:
1275 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1276 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v2
1277 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1278 ; GFX9-NEXT: s_movk_i32 s0, 0xff
1279 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff
1280 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1
1281 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
1282 ; GFX9-NEXT: v_not_b32_e32 v4, v0
1283 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1284 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1286 ; GFX9-NEXT: v_and_or_b32 v2, v3, v4, v2
1287 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1288 ; GFX9-NEXT: s_endpgm
1290 ; GFX8-LABEL: insertelement_v_v4i8_s_v:
1292 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1293 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v2
1294 ; GFX8-NEXT: s_movk_i32 s0, 0xff
1295 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff
1296 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1297 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
1298 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
1299 ; GFX8-NEXT: v_not_b32_e32 v1, v1
1300 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1301 ; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
1302 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1303 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
1304 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
1305 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1306 ; GFX8-NEXT: s_endpgm
1308 ; GFX7-LABEL: insertelement_v_v4i8_s_v:
1310 ; GFX7-NEXT: s_mov_b32 s6, 0
1311 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1312 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1313 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1314 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v2
1315 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff
1316 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1317 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
1318 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
1319 ; GFX7-NEXT: v_not_b32_e32 v1, v1
1320 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1321 ; GFX7-NEXT: s_mov_b32 s6, -1
1322 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1323 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
1324 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
1325 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1326 ; GFX7-NEXT: s_endpgm
1328 ; GFX10-LABEL: insertelement_v_v4i8_s_v:
1330 ; GFX10-NEXT: global_load_dword v3, v[0:1], off
1331 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2
1332 ; GFX10-NEXT: s_and_b32 s0, s2, 0xff
1333 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1334 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
1335 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0
1336 ; GFX10-NEXT: v_not_b32_e32 v4, v1
1337 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1338 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1339 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1340 ; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2
1341 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1342 ; GFX10-NEXT: s_endpgm
1344 ; GFX11-LABEL: insertelement_v_v4i8_s_v:
1346 ; GFX11-NEXT: global_load_b32 v3, v[0:1], off
1347 ; GFX11-NEXT: v_and_b32_e32 v0, 3, v2
1348 ; GFX11-NEXT: s_and_b32 s0, s2, 0xff
1349 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1350 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1351 ; GFX11-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
1352 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, s0
1353 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1354 ; GFX11-NEXT: v_not_b32_e32 v4, v1
1355 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1356 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1357 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1358 ; GFX11-NEXT: v_and_or_b32 v2, v3, v4, v2
1359 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1360 ; GFX11-NEXT: s_nop 0
1361 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1362 ; GFX11-NEXT: s_endpgm
1363 %vec = load <4 x i8>, ptr addrspace(1) %ptr
1364 %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
1365 store <4 x i8> %insert, ptr addrspace(1) null
; insertelement_v_v4i8_v_s: load a <4 x i8> from the addrspace(1) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. Only %idx is inreg; the checks below read the pointer
; from v[0:1], the value from v2 and the index from s2.
; Expected lowering: the index is masked with 3 and shifted left by 3 to form a
; bit offset; 0xff shifted by that offset is inverted and used to clear the
; selected byte of the loaded dword, and the low byte of %val shifted by the
; same offset is OR-ed in.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
1369 define amdgpu_ps void @insertelement_v_v4i8_v_s(ptr addrspace(1) %ptr, i8 %val, i32 inreg %idx) {
1370 ; GFX9-LABEL: insertelement_v_v4i8_v_s:
1372 ; GFX9-NEXT: global_load_dword v3, v[0:1], off
1373 ; GFX9-NEXT: s_and_b32 s0, s2, 3
1374 ; GFX9-NEXT: s_lshl_b32 s0, s0, 3
1375 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1376 ; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0
1377 ; GFX9-NEXT: s_not_b32 s0, s0
1378 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1379 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1380 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1381 ; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2
1382 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1383 ; GFX9-NEXT: s_endpgm
1385 ; GFX8-LABEL: insertelement_v_v4i8_v_s:
1387 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1388 ; GFX8-NEXT: s_and_b32 s0, s2, 3
1389 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3
1390 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
1391 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
1392 ; GFX8-NEXT: s_not_b32 s0, s0
1393 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1394 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1395 ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0
1396 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1397 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
1398 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
1399 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1400 ; GFX8-NEXT: s_endpgm
1402 ; GFX7-LABEL: insertelement_v_v4i8_v_s:
1404 ; GFX7-NEXT: s_mov_b32 s6, 0
1405 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1406 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1407 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1408 ; GFX7-NEXT: s_and_b32 s0, s2, 3
1409 ; GFX7-NEXT: v_and_b32_e32 v1, 0xff, v2
1410 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3
1411 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s0, v1
1412 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
1413 ; GFX7-NEXT: s_not_b32 s0, s0
1414 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1415 ; GFX7-NEXT: s_mov_b32 s6, -1
1416 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1417 ; GFX7-NEXT: v_and_b32_e32 v0, s0, v0
1418 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
1419 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
1420 ; GFX7-NEXT: s_endpgm
1422 ; GFX10-LABEL: insertelement_v_v4i8_v_s:
1424 ; GFX10-NEXT: global_load_dword v3, v[0:1], off
1425 ; GFX10-NEXT: s_and_b32 s0, s2, 3
1426 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1427 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3
1428 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1429 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1430 ; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0
1431 ; GFX10-NEXT: s_not_b32 s0, s0
1432 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1433 ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2
1434 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1435 ; GFX10-NEXT: s_endpgm
1437 ; GFX11-LABEL: insertelement_v_v4i8_v_s:
1439 ; GFX11-NEXT: global_load_b32 v3, v[0:1], off
1440 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v2
1441 ; GFX11-NEXT: s_and_b32 s0, s2, 3
1442 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1443 ; GFX11-NEXT: s_lshl_b32 s0, s0, 3
1444 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1445 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, s0, v0
1446 ; GFX11-NEXT: s_lshl_b32 s0, 0xff, s0
1447 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1448 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1449 ; GFX11-NEXT: s_not_b32 s0, s0
1450 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1451 ; GFX11-NEXT: v_and_or_b32 v2, v3, s0, v2
1452 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1453 ; GFX11-NEXT: s_nop 0
1454 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1455 ; GFX11-NEXT: s_endpgm
1456 %vec = load <4 x i8>, ptr addrspace(1) %ptr
1457 %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
1458 store <4 x i8> %insert, ptr addrspace(1) null
; insertelement_v_v4i8_v_v: load a <4 x i8> from the addrspace(1) pointer %ptr,
; insert %val at the dynamic index %idx, and store the result to a null
; addrspace(1) pointer. No argument is marked inreg; the checks below read the
; pointer from v[0:1], the value from v2 and the index from v3.
; Expected lowering: the index is masked with 3 and shifted left by 3 to form a
; bit offset; 0xff shifted by that offset is inverted and used to clear the
; selected byte of the loaded dword, and the low byte of %val shifted by the
; same offset is OR-ed in.
; The GFX* check lines are autogenerated (see the NOTE at the top of the file).
1462 define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val, i32 %idx) {
1463 ; GFX9-LABEL: insertelement_v_v4i8_v_v:
1465 ; GFX9-NEXT: global_load_dword v4, v[0:1], off
1466 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v3
1467 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1468 ; GFX9-NEXT: s_movk_i32 s0, 0xff
1469 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1470 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0
1471 ; GFX9-NEXT: v_not_b32_e32 v3, v0
1472 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1473 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1474 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1475 ; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2
1476 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
1477 ; GFX9-NEXT: s_endpgm
1479 ; GFX8-LABEL: insertelement_v_v4i8_v_v:
1481 ; GFX8-NEXT: flat_load_dword v0, v[0:1]
1482 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v3
1483 ; GFX8-NEXT: s_movk_i32 s0, 0xff
1484 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1485 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1486 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
1487 ; GFX8-NEXT: v_not_b32_e32 v1, v1
1488 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1489 ; GFX8-NEXT: v_and_b32_e32 v3, v0, v1
1490 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1491 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
1492 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
1493 ; GFX8-NEXT: flat_store_dword v[0:1], v2
1494 ; GFX8-NEXT: s_endpgm
1496 ; GFX7-LABEL: insertelement_v_v4i8_v_v:
1498 ; GFX7-NEXT: s_mov_b32 s2, 0
1499 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1500 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
1501 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
1502 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
1503 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
1504 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
1505 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
1506 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
1507 ; GFX7-NEXT: v_not_b32_e32 v1, v1
1508 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
1509 ; GFX7-NEXT: s_mov_b32 s2, -1
1510 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1511 ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
1512 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
1513 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1514 ; GFX7-NEXT: s_endpgm
1516 ; GFX10-LABEL: insertelement_v_v4i8_v_v:
1518 ; GFX10-NEXT: global_load_dword v4, v[0:1], off
1519 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v3
1520 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1521 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff
1522 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1523 ; GFX10-NEXT: v_not_b32_e32 v3, v1
1524 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1525 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1526 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1527 ; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2
1528 ; GFX10-NEXT: global_store_dword v[0:1], v2, off
1529 ; GFX10-NEXT: s_endpgm
1531 ; GFX11-LABEL: insertelement_v_v4i8_v_v:
1533 ; GFX11-NEXT: global_load_b32 v4, v[0:1], off
1534 ; GFX11-NEXT: v_and_b32_e32 v0, 3, v3
1535 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
1536 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1537 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1538 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xff
1539 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1540 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
1541 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1542 ; GFX11-NEXT: v_not_b32_e32 v2, v2
1543 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1544 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1545 ; GFX11-NEXT: v_and_or_b32 v2, v4, v2, v3
1546 ; GFX11-NEXT: global_store_b32 v[0:1], v2, off
1547 ; GFX11-NEXT: s_nop 0
1548 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1549 ; GFX11-NEXT: s_endpgm
1550 %vec = load <4 x i8>, ptr addrspace(1) %ptr
1551 %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx
1552 store <4 x i8> %insert, ptr addrspace(1) null
; Dynamic-index insertelement into <8 x i8> with every operand marked inreg:
; the vector is loaded from the addrspace(4) pointer %ptr, the i8 %val is
; inserted at position %idx, and the result is stored to a null addrspace(1)
; pointer.
; The per-target assertion lines below are autogenerated (see the NOTE at the
; top of this file); regenerate them with the update script instead of
; editing them by hand.
1556 define amdgpu_ps void @insertelement_s_v8i8_s_s(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 inreg %idx) {
1557 ; GFX9-LABEL: insertelement_s_v8i8_s_s:
1559 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1560 ; GFX9-NEXT: s_lshr_b32 s2, s5, 2
1561 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
1562 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1563 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
1564 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1565 ; GFX9-NEXT: s_cselect_b32 s3, s1, s0
1566 ; GFX9-NEXT: s_and_b32 s5, s5, 3
1567 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3
1568 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff
1569 ; GFX9-NEXT: s_lshl_b32 s4, s4, s5
1570 ; GFX9-NEXT: s_lshl_b32 s5, 0xff, s5
1571 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5
1572 ; GFX9-NEXT: s_or_b32 s3, s3, s4
1573 ; GFX9-NEXT: s_cmp_eq_u32 s2, 0
1574 ; GFX9-NEXT: s_cselect_b32 s0, s3, s0
1575 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
1576 ; GFX9-NEXT: s_cselect_b32 s1, s3, s1
1577 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1578 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
1579 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
1580 ; GFX9-NEXT: s_endpgm
1582 ; GFX8-LABEL: insertelement_s_v8i8_s_s:
1584 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1585 ; GFX8-NEXT: s_lshr_b32 s2, s5, 2
1586 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1
1587 ; GFX8-NEXT: v_mov_b32_e32 v0, 0
1588 ; GFX8-NEXT: v_mov_b32_e32 v1, 0
1589 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1590 ; GFX8-NEXT: s_cselect_b32 s3, s1, s0
1591 ; GFX8-NEXT: s_and_b32 s5, s5, 3
1592 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3
1593 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff
1594 ; GFX8-NEXT: s_lshl_b32 s4, s4, s5
1595 ; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5
1596 ; GFX8-NEXT: s_andn2_b32 s3, s3, s5
1597 ; GFX8-NEXT: s_or_b32 s3, s3, s4
1598 ; GFX8-NEXT: s_cmp_eq_u32 s2, 0
1599 ; GFX8-NEXT: s_cselect_b32 s0, s3, s0
1600 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1
1601 ; GFX8-NEXT: s_cselect_b32 s1, s3, s1
1602 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1603 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
1604 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
1605 ; GFX8-NEXT: s_endpgm
1607 ; GFX7-LABEL: insertelement_s_v8i8_s_s:
1609 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1610 ; GFX7-NEXT: s_lshr_b32 s3, s5, 2
1611 ; GFX7-NEXT: s_cmp_eq_u32 s3, 1
1612 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1613 ; GFX7-NEXT: s_cselect_b32 s2, s1, s0
1614 ; GFX7-NEXT: s_and_b32 s5, s5, 3
1615 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3
1616 ; GFX7-NEXT: s_and_b32 s4, s4, 0xff
1617 ; GFX7-NEXT: s_lshl_b32 s4, s4, s5
1618 ; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5
1619 ; GFX7-NEXT: s_andn2_b32 s2, s2, s5
1620 ; GFX7-NEXT: s_or_b32 s4, s2, s4
1621 ; GFX7-NEXT: s_cmp_eq_u32 s3, 0
1622 ; GFX7-NEXT: s_cselect_b32 s2, s4, s0
1623 ; GFX7-NEXT: s_cmp_eq_u32 s3, 1
1624 ; GFX7-NEXT: s_cselect_b32 s3, s4, s1
1625 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
1626 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
1627 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
1628 ; GFX7-NEXT: s_mov_b32 s2, -1
1629 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1630 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1631 ; GFX7-NEXT: s_endpgm
1633 ; GFX10-LABEL: insertelement_s_v8i8_s_s:
1635 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1636 ; GFX10-NEXT: s_lshr_b32 s2, s5, 2
1637 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1638 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1
1639 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1640 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1641 ; GFX10-NEXT: s_cselect_b32 s3, s1, s0
1642 ; GFX10-NEXT: s_and_b32 s5, s5, 3
1643 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff
1644 ; GFX10-NEXT: s_lshl_b32 s5, s5, 3
1645 ; GFX10-NEXT: s_lshl_b32 s6, 0xff, s5
1646 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5
1647 ; GFX10-NEXT: s_andn2_b32 s3, s3, s6
1648 ; GFX10-NEXT: s_or_b32 s3, s3, s4
1649 ; GFX10-NEXT: s_cmp_eq_u32 s2, 0
1650 ; GFX10-NEXT: s_cselect_b32 s0, s3, s0
1651 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1
1652 ; GFX10-NEXT: s_cselect_b32 s1, s3, s1
1653 ; GFX10-NEXT: v_mov_b32_e32 v3, s1
1654 ; GFX10-NEXT: v_mov_b32_e32 v2, s0
1655 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
1656 ; GFX10-NEXT: s_endpgm
1658 ; GFX11-LABEL: insertelement_s_v8i8_s_s:
1660 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
1661 ; GFX11-NEXT: s_lshr_b32 s2, s5, 2
1662 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1663 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1
1664 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1665 ; GFX11-NEXT: s_cselect_b32 s3, s1, s0
1666 ; GFX11-NEXT: s_and_b32 s5, s5, 3
1667 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff
1668 ; GFX11-NEXT: s_lshl_b32 s5, s5, 3
1669 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1670 ; GFX11-NEXT: s_lshl_b32 s6, 0xff, s5
1671 ; GFX11-NEXT: s_lshl_b32 s4, s4, s5
1672 ; GFX11-NEXT: s_and_not1_b32 s3, s3, s6
1673 ; GFX11-NEXT: s_or_b32 s3, s3, s4
1674 ; GFX11-NEXT: s_cmp_eq_u32 s2, 0
1675 ; GFX11-NEXT: s_cselect_b32 s0, s3, s0
1676 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1
1677 ; GFX11-NEXT: s_cselect_b32 s1, s3, s1
1678 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1679 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
1680 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
1681 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
1682 ; GFX11-NEXT: s_nop 0
1683 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1684 ; GFX11-NEXT: s_endpgm
1685 %vec = load <8 x i8>, ptr addrspace(4) %ptr
1686 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
1687 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic-index insertelement into <8 x i8> where the vector is loaded from a
; non-inreg addrspace(1) pointer %ptr, while the inserted i8 %val and the
; index %idx are both inreg. The result is stored to a null addrspace(1)
; pointer.
; The per-target assertion lines below are autogenerated (see the NOTE at the
; top of this file); regenerate them with the update script instead of
; editing them by hand.
1691 define amdgpu_ps void @insertelement_v_v8i8_s_s(ptr addrspace(1) %ptr, i8 inreg %val, i32 inreg %idx) {
1692 ; GFX9-LABEL: insertelement_v_v8i8_s_s:
1694 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1695 ; GFX9-NEXT: s_and_b32 s1, s3, 3
1696 ; GFX9-NEXT: s_lshr_b32 s0, s3, 2
1697 ; GFX9-NEXT: s_and_b32 s2, s2, 0xff
1698 ; GFX9-NEXT: s_lshl_b32 s1, s1, 3
1699 ; GFX9-NEXT: s_lshl_b32 s2, s2, s1
1700 ; GFX9-NEXT: s_lshl_b32 s1, 0xff, s1
1701 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
1702 ; GFX9-NEXT: s_not_b32 s1, s1
1703 ; GFX9-NEXT: v_mov_b32_e32 v4, s2
1704 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1705 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1706 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1707 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
1708 ; GFX9-NEXT: v_and_or_b32 v4, v5, s1, v4
1709 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
1710 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1711 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1712 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1713 ; GFX9-NEXT: s_endpgm
1715 ; GFX8-LABEL: insertelement_v_v8i8_s_s:
1717 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1718 ; GFX8-NEXT: s_and_b32 s1, s3, 3
1719 ; GFX8-NEXT: s_lshr_b32 s0, s3, 2
1720 ; GFX8-NEXT: s_and_b32 s2, s2, 0xff
1721 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3
1722 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1
1723 ; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
1724 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
1725 ; GFX8-NEXT: s_not_b32 s1, s1
1726 ; GFX8-NEXT: v_mov_b32_e32 v2, 0
1727 ; GFX8-NEXT: v_mov_b32_e32 v3, 0
1728 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1729 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
1730 ; GFX8-NEXT: v_and_b32_e32 v4, s1, v4
1731 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4
1732 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
1733 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1734 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1735 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1736 ; GFX8-NEXT: s_endpgm
1738 ; GFX7-LABEL: insertelement_v_v8i8_s_s:
1740 ; GFX7-NEXT: s_mov_b32 s6, 0
1741 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1742 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1743 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
1744 ; GFX7-NEXT: s_and_b32 s1, s3, 3
1745 ; GFX7-NEXT: s_lshr_b32 s0, s3, 2
1746 ; GFX7-NEXT: s_and_b32 s2, s2, 0xff
1747 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3
1748 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1
1749 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
1750 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
1751 ; GFX7-NEXT: s_not_b32 s1, s1
1752 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1753 ; GFX7-NEXT: s_mov_b32 s6, -1
1754 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1755 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
1756 ; GFX7-NEXT: v_and_b32_e32 v2, s1, v2
1757 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2
1758 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
1759 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1760 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1761 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1762 ; GFX7-NEXT: s_endpgm
1764 ; GFX10-LABEL: insertelement_v_v8i8_s_s:
1766 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
1767 ; GFX10-NEXT: s_lshr_b32 s0, s3, 2
1768 ; GFX10-NEXT: s_and_b32 s1, s3, 3
1769 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
1770 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3
1771 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff
1772 ; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1
1773 ; GFX10-NEXT: s_lshl_b32 s1, s2, s1
1774 ; GFX10-NEXT: s_not_b32 s2, s3
1775 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0
1776 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1777 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo
1778 ; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1
1779 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1780 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
1781 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
1782 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1783 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1784 ; GFX10-NEXT: s_endpgm
1786 ; GFX11-LABEL: insertelement_v_v8i8_s_s:
1788 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
1789 ; GFX11-NEXT: s_lshr_b32 s0, s3, 2
1790 ; GFX11-NEXT: s_and_b32 s1, s3, 3
1791 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
1792 ; GFX11-NEXT: s_lshl_b32 s1, s1, 3
1793 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff
1794 ; GFX11-NEXT: s_lshl_b32 s3, 0xff, s1
1795 ; GFX11-NEXT: s_lshl_b32 s1, s2, s1
1796 ; GFX11-NEXT: s_not_b32 s2, s3
1797 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s0, 0
1798 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1799 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo
1800 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1801 ; GFX11-NEXT: v_and_or_b32 v4, v2, s2, s1
1802 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1803 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
1804 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1805 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
1806 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
1807 ; GFX11-NEXT: s_nop 0
1808 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1809 ; GFX11-NEXT: s_endpgm
1810 %vec = load <8 x i8>, ptr addrspace(1) %ptr
1811 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
1812 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic-index insertelement into <8 x i8> where the vector is loaded from an
; inreg addrspace(4) pointer %ptr and the index %idx is inreg, but the
; inserted i8 %val is a non-inreg argument. The result is stored to a null
; addrspace(1) pointer.
; The per-target assertion lines below are autogenerated (see the NOTE at the
; top of this file); regenerate them with the update script instead of
; editing them by hand.
1816 define amdgpu_ps void @insertelement_s_v8i8_v_s(ptr addrspace(4) inreg %ptr, i8 %val, i32 inreg %idx) {
1817 ; GFX9-LABEL: insertelement_s_v8i8_v_s:
1819 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1820 ; GFX9-NEXT: s_lshr_b32 s2, s4, 2
1821 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1
1822 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
1823 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
1824 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1825 ; GFX9-NEXT: s_cselect_b32 s3, s1, s0
1826 ; GFX9-NEXT: s_and_b32 s4, s4, 3
1827 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3
1828 ; GFX9-NEXT: s_lshl_b32 s5, 0xff, s4
1829 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5
1830 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1831 ; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1
1832 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1833 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1834 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1835 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1836 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
1837 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1838 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1839 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1840 ; GFX9-NEXT: s_endpgm
1842 ; GFX8-LABEL: insertelement_s_v8i8_v_s:
1844 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1845 ; GFX8-NEXT: s_lshr_b32 s2, s4, 2
1846 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1
1847 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
1848 ; GFX8-NEXT: v_mov_b32_e32 v2, 0
1849 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1850 ; GFX8-NEXT: s_cselect_b32 s3, s1, s0
1851 ; GFX8-NEXT: s_and_b32 s4, s4, 3
1852 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3
1853 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
1854 ; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4
1855 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1856 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4
1857 ; GFX8-NEXT: v_or_b32_e32 v4, s3, v0
1858 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1859 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1860 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
1861 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
1862 ; GFX8-NEXT: v_mov_b32_e32 v3, 0
1863 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1864 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1865 ; GFX8-NEXT: s_endpgm
1867 ; GFX7-LABEL: insertelement_s_v8i8_v_s:
1869 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1870 ; GFX7-NEXT: s_lshr_b32 s2, s4, 2
1871 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1
1872 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
1873 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
1874 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1875 ; GFX7-NEXT: s_cselect_b32 s3, s1, s0
1876 ; GFX7-NEXT: s_and_b32 s4, s4, 3
1877 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3
1878 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0
1879 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4
1880 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4
1881 ; GFX7-NEXT: v_or_b32_e32 v2, s3, v0
1882 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
1883 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
1884 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
1885 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
1886 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
1887 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
1888 ; GFX7-NEXT: s_mov_b32 s2, -1
1889 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
1890 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1891 ; GFX7-NEXT: s_endpgm
1893 ; GFX10-LABEL: insertelement_s_v8i8_v_s:
1895 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1896 ; GFX10-NEXT: s_lshr_b32 s2, s4, 2
1897 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0
1898 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1
1899 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
1900 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1901 ; GFX10-NEXT: s_cselect_b32 s3, s1, s0
1902 ; GFX10-NEXT: s_and_b32 s4, s4, 3
1903 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1904 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3
1905 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1906 ; GFX10-NEXT: s_lshl_b32 s5, 0xff, s4
1907 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5
1908 ; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3
1909 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1910 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
1911 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
1912 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
1913 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1914 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1915 ; GFX10-NEXT: s_endpgm
1917 ; GFX11-LABEL: insertelement_s_v8i8_v_s:
1919 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
1920 ; GFX11-NEXT: s_lshr_b32 s2, s4, 2
1921 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v0
1922 ; GFX11-NEXT: s_cmp_eq_u32 s2, 1
1923 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
1924 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1925 ; GFX11-NEXT: s_cselect_b32 s3, s1, s0
1926 ; GFX11-NEXT: s_and_b32 s4, s4, 3
1927 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
1928 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3
1929 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
1930 ; GFX11-NEXT: s_lshl_b32 s5, 0xff, s4
1931 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1932 ; GFX11-NEXT: s_and_not1_b32 s3, s3, s5
1933 ; GFX11-NEXT: v_lshl_or_b32 v4, v2, s4, s3
1934 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1935 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1936 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_cndmask_b32 v0, v0, v4
1937 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
1938 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1939 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
1940 ; GFX11-NEXT: s_nop 0
1941 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1942 ; GFX11-NEXT: s_endpgm
1943 %vec = load <8 x i8>, ptr addrspace(4) %ptr
1944 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
1945 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic-index insertelement into <8 x i8> where the vector is loaded from an
; inreg addrspace(4) pointer %ptr and the inserted i8 %val is inreg, but the
; index %idx is a non-inreg argument. The result is stored to a null
; addrspace(1) pointer.
; The per-target assertion lines below are autogenerated (see the NOTE at the
; top of this file); regenerate them with the update script instead of
; editing them by hand.
1949 define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 %idx) {
1950 ; GFX9-LABEL: insertelement_s_v8i8_s_v:
1952 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1953 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v0
1954 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
1955 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
1956 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1957 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1958 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
1959 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
1960 ; GFX9-NEXT: s_movk_i32 s2, 0xff
1961 ; GFX9-NEXT: s_and_b32 s3, s4, 0xff
1962 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
1963 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3
1964 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2
1965 ; GFX9-NEXT: v_not_b32_e32 v0, v0
1966 ; GFX9-NEXT: v_and_or_b32 v4, v1, v0, v3
1967 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1968 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1969 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
1970 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1971 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1972 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1973 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
1974 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
1975 ; GFX9-NEXT: s_endpgm
1977 ; GFX8-LABEL: insertelement_s_v8i8_s_v:
1979 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
1980 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v0
1981 ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
1982 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
1983 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1984 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1985 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
1986 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
1987 ; GFX8-NEXT: s_movk_i32 s2, 0xff
1988 ; GFX8-NEXT: s_and_b32 s3, s4, 0xff
1989 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
1990 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3
1991 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2
1992 ; GFX8-NEXT: v_not_b32_e32 v0, v0
1993 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
1994 ; GFX8-NEXT: v_or_b32_e32 v4, v0, v3
1995 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
1996 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
1997 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
1998 ; GFX8-NEXT: v_mov_b32_e32 v2, 0
1999 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
2000 ; GFX8-NEXT: v_mov_b32_e32 v3, 0
2001 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2002 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2003 ; GFX8-NEXT: s_endpgm
2005 ; GFX7-LABEL: insertelement_s_v8i8_s_v:
2007 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
2008 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v0
2009 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
2010 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
2011 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
2012 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2013 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
2014 ; GFX7-NEXT: v_mov_b32_e32 v3, s1
2015 ; GFX7-NEXT: s_and_b32 s2, s4, 0xff
2016 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2017 ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0
2018 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
2019 ; GFX7-NEXT: v_not_b32_e32 v0, v0
2020 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
2021 ; GFX7-NEXT: v_or_b32_e32 v3, v0, v3
2022 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
2023 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
2024 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
2025 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
2026 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2027 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
2028 ; GFX7-NEXT: s_mov_b32 s2, -1
2029 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2030 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2031 ; GFX7-NEXT: s_endpgm
2033 ; GFX10-LABEL: insertelement_s_v8i8_s_v:
2035 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
2036 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v0
2037 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0
2038 ; GFX10-NEXT: s_and_b32 s2, s4, 0xff
2039 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1
2040 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
2041 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
2042 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2
2043 ; GFX10-NEXT: v_not_b32_e32 v2, v2
2044 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2045 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
2046 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
2047 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
2048 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
2049 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
2050 ; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3
2051 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2052 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
2053 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
2054 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
2055 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2056 ; GFX10-NEXT: s_endpgm
2058 ; GFX11-LABEL: insertelement_s_v8i8_s_v:
2060 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
2061 ; GFX11-NEXT: v_and_b32_e32 v1, 3, v0
2062 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 2, v0
2063 ; GFX11-NEXT: s_and_b32 s2, s4, 0xff
2064 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2065 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
2066 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2067 ; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_lshlrev_b32 v1, 3, v1
2068 ; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo
2069 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2070 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
2071 ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v1, s2
2072 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2073 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
2074 ; GFX11-NEXT: v_not_b32_e32 v2, v2
2075 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2076 ; GFX11-NEXT: v_and_or_b32 v5, v5, v2, v3
2077 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2078 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2079 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
2080 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
2081 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2082 ; GFX11-NEXT: s_nop 0
2083 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2084 ; GFX11-NEXT: s_endpgm
2085 %vec = load <8 x i8>, ptr addrspace(4) %ptr
2086 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
2087 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic-index insertelement into <8 x i8> where the vector is loaded from an
; inreg addrspace(4) pointer %ptr, while both the inserted i8 %val and the
; index %idx are non-inreg arguments. The result is stored to a null
; addrspace(1) pointer.
; The per-target assertion lines below are autogenerated (see the NOTE at the
; top of this file); regenerate them with the update script instead of
; editing them by hand.
2091 define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8 %val, i32 %idx) {
2092 ; GFX9-LABEL: insertelement_s_v8i8_v_v:
2094 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
2095 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v1
2096 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1
2097 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
2098 ; GFX9-NEXT: s_movk_i32 s2, 0xff
2099 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2100 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
2101 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
2102 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
2103 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2104 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2
2105 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2106 ; GFX9-NEXT: v_not_b32_e32 v1, v1
2107 ; GFX9-NEXT: v_and_or_b32 v4, v3, v1, v0
2108 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2109 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2110 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
2111 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2112 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
2113 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2114 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2115 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2116 ; GFX9-NEXT: s_endpgm
2118 ; GFX8-LABEL: insertelement_s_v8i8_v_v:
2120 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
2121 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v1
2122 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
2123 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
2124 ; GFX8-NEXT: s_movk_i32 s2, 0xff
2125 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2126 ; GFX8-NEXT: v_mov_b32_e32 v3, s0
2127 ; GFX8-NEXT: v_mov_b32_e32 v4, s1
2128 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
2129 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2130 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2
2131 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2132 ; GFX8-NEXT: v_not_b32_e32 v1, v1
2133 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
2134 ; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
2135 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2136 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2137 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
2138 ; GFX8-NEXT: v_mov_b32_e32 v2, 0
2139 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
2140 ; GFX8-NEXT: v_mov_b32_e32 v3, 0
2141 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2142 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
2143 ; GFX8-NEXT: s_endpgm
2145 ; GFX7-LABEL: insertelement_s_v8i8_v_v:
2147 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
2148 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1
2149 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
2150 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
2151 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
2152 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2153 ; GFX7-NEXT: v_mov_b32_e32 v3, s0
2154 ; GFX7-NEXT: v_mov_b32_e32 v4, s1
2155 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
2156 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
2157 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
2158 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
2159 ; GFX7-NEXT: v_not_b32_e32 v1, v1
2160 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
2161 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v0
2162 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
2163 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
2164 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2
2165 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1]
2166 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
2167 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
2168 ; GFX7-NEXT: s_mov_b32 s2, -1
2169 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
2170 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2171 ; GFX7-NEXT: s_endpgm
2173 ; GFX10-LABEL: insertelement_s_v8i8_v_v:
2175 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
2176 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v1
2177 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1
2178 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2
2179 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
2180 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
2181 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2182 ; GFX10-NEXT: v_not_b32_e32 v3, v3
2183 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2184 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
2185 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
2186 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
2187 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
2188 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
2189 ; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2
2190 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2191 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
2192 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
2193 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
2194 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2195 ; GFX10-NEXT: s_endpgm
2197 ; GFX11-LABEL: insertelement_s_v8i8_v_v:
2199 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
2200 ; GFX11-NEXT: v_and_b32_e32 v2, 3, v1
2201 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 2, v1
2202 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
2203 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2204 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
2205 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2206 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v2, 3, v2
2207 ; GFX11-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo
2208 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2209 ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
2210 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v0
2211 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2212 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v4
2213 ; GFX11-NEXT: v_not_b32_e32 v3, v3
2214 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2215 ; GFX11-NEXT: v_and_or_b32 v5, v5, v3, v2
2216 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2217 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2218 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
2219 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
2220 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2221 ; GFX11-NEXT: s_nop 0
2222 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2223 ; GFX11-NEXT: s_endpgm
2224 %vec = load <8 x i8>, ptr addrspace(4) %ptr
2225 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
2226 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic-index insertelement into <8 x i8> where the vector is loaded from a
; non-inreg addrspace(1) pointer %ptr, the inserted i8 %val is inreg, and the
; index %idx is a non-inreg argument. The result is stored to a null
; addrspace(1) pointer.
; The per-target assertion lines below are autogenerated (see the NOTE at the
; top of this file); regenerate them with the update script instead of
; editing them by hand.
2230 define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg %val, i32 %idx) {
2231 ; GFX9-LABEL: insertelement_v_v8i8_s_v:
2233 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2234 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v2
2235 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2
2236 ; GFX9-NEXT: s_movk_i32 s0, 0xff
2237 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff
2238 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2
2239 ; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1
2240 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0
2241 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
2242 ; GFX9-NEXT: v_not_b32_e32 v2, v2
2243 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2244 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
2245 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2246 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2247 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
2248 ; GFX9-NEXT: v_and_or_b32 v2, v7, v2, v6
2249 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2250 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2251 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off
2252 ; GFX9-NEXT: s_endpgm
2254 ; GFX8-LABEL: insertelement_v_v8i8_s_v:
2256 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2257 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 2, v2
2258 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2
2259 ; GFX8-NEXT: s_movk_i32 s0, 0xff
2260 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff
2261 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2
2262 ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1
2263 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
2264 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
2265 ; GFX8-NEXT: v_not_b32_e32 v2, v2
2266 ; GFX8-NEXT: v_mov_b32_e32 v3, 0
2267 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5
2268 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
2269 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2270 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
2271 ; GFX8-NEXT: v_and_b32_e32 v2, v7, v2
2272 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
2273 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2274 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2275 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
2276 ; GFX8-NEXT: s_endpgm
2278 ; GFX7-LABEL: insertelement_v_v8i8_s_v:
2280 ; GFX7-NEXT: s_mov_b32 s6, 0
2281 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2282 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2283 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2284 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2
2285 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2
2286 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff
2287 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
2288 ; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2
2289 ; GFX7-NEXT: v_lshl_b32_e32 v2, 0xff, v2
2290 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
2291 ; GFX7-NEXT: v_not_b32_e32 v2, v2
2292 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3
2293 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2294 ; GFX7-NEXT: s_mov_b32 s6, -1
2295 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2296 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
2297 ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2
2298 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
2299 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2300 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2301 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2302 ; GFX7-NEXT: s_endpgm
2304 ; GFX10-LABEL: insertelement_v_v8i8_s_v:
2306 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2307 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2
2308 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2
2309 ; GFX10-NEXT: s_and_b32 s0, s2, 0xff
2310 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
2311 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
2312 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xff
2313 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0
2314 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5
2315 ; GFX10-NEXT: v_not_b32_e32 v3, v4
2316 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2317 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
2318 ; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2
2319 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2320 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
2321 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
2322 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
2323 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2324 ; GFX10-NEXT: s_endpgm
2326 ; GFX11-LABEL: insertelement_v_v8i8_s_v:
2328 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2329 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 2, v2
2330 ; GFX11-NEXT: s_and_b32 s0, s2, 0xff
2331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2332 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
2333 ; GFX11-NEXT: v_and_b32_e32 v3, 3, v2
2334 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
2335 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2336 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xff
2337 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v3, s0
2338 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v5
2339 ; GFX11-NEXT: v_not_b32_e32 v3, v4
2340 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2341 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
2342 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2343 ; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2
2344 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2345 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2346 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
2347 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
2348 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2349 ; GFX11-NEXT: s_nop 0
2350 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2351 ; GFX11-NEXT: s_endpgm
2352 %vec = load <8 x i8>, ptr addrspace(1) %ptr
2353 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
2354 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic insertelement into <8 x i8> where the vector is loaded through a
; non-inreg global pointer (v[0:1] in the checks), the inserted value is in a
; VGPR (v2) and the index is an `inreg` argument (s2 in the checks).
; The expected code handles the vector as two dwords: idx >> 2 picks the dword
; and (idx & 3) << 3 is the bit offset of the byte inside it. The picked dword
; has that byte cleared with ~(0xff << offset), is OR'ed with the shifted value,
; and is written back into v0 or v1 with v_cndmask. Because the index is
; scalar, the offset and mask are computed with s_* instructions.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2358 define amdgpu_ps void @insertelement_v_v8i8_v_s(ptr addrspace(1) %ptr, i8 %val, i32 inreg %idx) {
2359 ; GFX9-LABEL: insertelement_v_v8i8_v_s:
2361 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2362 ; GFX9-NEXT: s_and_b32 s1, s2, 3
2363 ; GFX9-NEXT: s_lshr_b32 s0, s2, 2
2364 ; GFX9-NEXT: s_lshl_b32 s1, s1, 3
2365 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2366 ; GFX9-NEXT: s_lshl_b32 s1, 0xff, s1
2367 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
2368 ; GFX9-NEXT: s_not_b32 s1, s1
2369 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2370 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2371 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2372 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
2373 ; GFX9-NEXT: v_and_or_b32 v2, v5, s1, v2
2374 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
2375 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2376 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2377 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off
2378 ; GFX9-NEXT: s_endpgm
2380 ; GFX8-LABEL: insertelement_v_v8i8_v_s:
2382 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2383 ; GFX8-NEXT: s_and_b32 s1, s2, 3
2384 ; GFX8-NEXT: s_lshr_b32 s0, s2, 2
2385 ; GFX8-NEXT: s_lshl_b32 s1, s1, 3
2386 ; GFX8-NEXT: v_mov_b32_e32 v5, s1
2387 ; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1
2388 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
2389 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2390 ; GFX8-NEXT: s_not_b32 s1, s1
2391 ; GFX8-NEXT: v_mov_b32_e32 v3, 0
2392 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
2393 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2394 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
2395 ; GFX8-NEXT: v_and_b32_e32 v5, s1, v5
2396 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2
2397 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
2398 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2399 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2400 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1]
2401 ; GFX8-NEXT: s_endpgm
2403 ; GFX7-LABEL: insertelement_v_v8i8_v_s:
2405 ; GFX7-NEXT: s_mov_b32 s6, 0
2406 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2407 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2408 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2409 ; GFX7-NEXT: s_and_b32 s1, s2, 3
2410 ; GFX7-NEXT: s_lshr_b32 s0, s2, 2
2411 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
2412 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3
2413 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2
2414 ; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1
2415 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
2416 ; GFX7-NEXT: s_not_b32 s1, s1
2417 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2418 ; GFX7-NEXT: s_mov_b32 s6, -1
2419 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2420 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
2421 ; GFX7-NEXT: v_and_b32_e32 v3, s1, v3
2422 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
2423 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0
2424 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2425 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2426 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2427 ; GFX7-NEXT: s_endpgm
2429 ; GFX10-LABEL: insertelement_v_v8i8_v_s:
2431 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2432 ; GFX10-NEXT: s_lshr_b32 s1, s2, 2
2433 ; GFX10-NEXT: s_and_b32 s0, s2, 3
2434 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1
2435 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3
2436 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2437 ; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0
2438 ; GFX10-NEXT: s_not_b32 s0, s0
2439 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2440 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo
2441 ; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2
2442 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0
2443 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2444 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
2445 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
2446 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
2447 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2448 ; GFX10-NEXT: s_endpgm
2450 ; GFX11-LABEL: insertelement_v_v8i8_v_s:
2452 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2453 ; GFX11-NEXT: s_lshr_b32 s1, s2, 2
2454 ; GFX11-NEXT: s_and_b32 s0, s2, 3
2455 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1
2456 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
2457 ; GFX11-NEXT: s_lshl_b32 s0, s0, 3
2458 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2460 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v0, v1 :: v_dual_lshlrev_b32 v2, s0, v2
2461 ; GFX11-NEXT: s_lshl_b32 s0, 0xff, s0
2462 ; GFX11-NEXT: s_not_b32 s0, s0
2463 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
2464 ; GFX11-NEXT: v_and_or_b32 v4, v3, s0, v2
2465 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s1, 0
2466 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2467 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2468 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2469 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
2470 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
2471 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2472 ; GFX11-NEXT: s_nop 0
2473 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2474 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector, insert %val at the dynamic index %idx, and
; store the result to the null global pointer.
2475 %vec = load <8 x i8>, ptr addrspace(1) %ptr
2476 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
2477 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic insertelement into <8 x i8> with no `inreg` arguments: in the checks
; the pointer is v[0:1], the inserted value is v2 and the index is v3.
; The expected code handles the vector as two dwords: idx >> 2 picks the dword
; and (idx & 3) << 3 is the bit offset of the byte inside it. Since the index
; is in a VGPR, the offset, the 0xff << offset mask and its inversion are all
; computed with v_* instructions; the merged dword is then written back into
; v0 or v1 with v_cndmask.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2481 define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val, i32 %idx) {
2482 ; GFX9-LABEL: insertelement_v_v8i8_v_v:
2484 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2485 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v3
2486 ; GFX9-NEXT: v_and_b32_e32 v3, 3, v3
2487 ; GFX9-NEXT: s_movk_i32 s0, 0xff
2488 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3
2489 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2490 ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0
2491 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
2492 ; GFX9-NEXT: v_not_b32_e32 v3, v3
2493 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2494 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
2495 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
2496 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2497 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
2498 ; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2
2499 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2500 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2501 ; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
2502 ; GFX9-NEXT: s_endpgm
2504 ; GFX8-LABEL: insertelement_v_v8i8_v_v:
2506 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
2507 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 2, v3
2508 ; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
2509 ; GFX8-NEXT: s_movk_i32 s0, 0xff
2510 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
2511 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2512 ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0
2513 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
2514 ; GFX8-NEXT: v_not_b32_e32 v3, v3
2515 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
2516 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
2517 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
2518 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2519 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
2520 ; GFX8-NEXT: v_and_b32_e32 v3, v7, v3
2521 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
2522 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2523 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2524 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
2525 ; GFX8-NEXT: s_endpgm
2527 ; GFX7-LABEL: insertelement_v_v8i8_v_v:
2529 ; GFX7-NEXT: s_mov_b32 s6, 0
2530 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2531 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2532 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
2533 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v3
2534 ; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
2535 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
2536 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
2537 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2
2538 ; GFX7-NEXT: v_lshl_b32_e32 v3, 0xff, v3
2539 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
2540 ; GFX7-NEXT: v_not_b32_e32 v3, v3
2541 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
2542 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2543 ; GFX7-NEXT: s_mov_b32 s6, -1
2544 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2545 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
2546 ; GFX7-NEXT: v_and_b32_e32 v3, v5, v3
2547 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2
2548 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
2549 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
2550 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2551 ; GFX7-NEXT: s_endpgm
2553 ; GFX10-LABEL: insertelement_v_v8i8_v_v:
2555 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
2556 ; GFX10-NEXT: v_and_b32_e32 v4, 3, v3
2557 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 2, v3
2558 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
2559 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
2560 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6
2561 ; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xff
2562 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2563 ; GFX10-NEXT: v_not_b32_e32 v3, v5
2564 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2565 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
2566 ; GFX10-NEXT: v_and_or_b32 v4, v4, v3, v2
2567 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
2568 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
2569 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
2570 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
2571 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
2572 ; GFX10-NEXT: s_endpgm
2574 ; GFX11-LABEL: insertelement_v_v8i8_v_v:
2576 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off
2577 ; GFX11-NEXT: v_and_b32_e32 v4, 3, v3
2578 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 2, v3
2579 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
2580 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
2581 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
2582 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
2583 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v6
2584 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
2585 ; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xff
2586 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
2587 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2588 ; GFX11-NEXT: v_not_b32_e32 v3, v5
2589 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2590 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
2591 ; GFX11-NEXT: v_and_or_b32 v4, v4, v3, v2
2592 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
2593 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
2594 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
2595 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
2596 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
2597 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off
2598 ; GFX11-NEXT: s_nop 0
2599 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2600 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector, insert %val at the dynamic index %idx, and
; store the result to the null global pointer.
2601 %vec = load <8 x i8>, ptr addrspace(1) %ptr
2602 %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx
2603 store <8 x i8> %insert, ptr addrspace(1) null
; Dynamic insertelement into <16 x i8> where every argument is `inreg`: in the
; checks the addrspace(4) pointer is s[2:3], the inserted value is s4 and the
; index is s5.
; The expected code loads the vector as four dwords into s[0:3] with a scalar
; load and keeps the whole update scalar: idx >> 2 picks the dword through a
; chain of s_cmp_eq / s_cselect, (idx & 3) << 3 is the bit offset of the byte,
; s_andn2 (s_and_not1 on GFX11) clears that byte, s_or merges the shifted
; value, and a second s_cmp_eq / s_cselect chain writes the dword back. The
; result is copied to v[0:3] only for the final store.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2607 define amdgpu_ps void @insertelement_s_v16i8_s_s(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 inreg %idx) {
2608 ; GFX9-LABEL: insertelement_s_v16i8_s_s:
2610 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
2611 ; GFX9-NEXT: s_lshr_b32 s6, s5, 2
2612 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1
2613 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2614 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
2615 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2616 ; GFX9-NEXT: s_cselect_b32 s7, s1, s0
2617 ; GFX9-NEXT: s_cmp_eq_u32 s6, 2
2618 ; GFX9-NEXT: s_cselect_b32 s7, s2, s7
2619 ; GFX9-NEXT: s_cmp_eq_u32 s6, 3
2620 ; GFX9-NEXT: s_cselect_b32 s7, s3, s7
2621 ; GFX9-NEXT: s_and_b32 s5, s5, 3
2622 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3
2623 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff
2624 ; GFX9-NEXT: s_lshl_b32 s4, s4, s5
2625 ; GFX9-NEXT: s_lshl_b32 s5, 0xff, s5
2626 ; GFX9-NEXT: s_andn2_b32 s5, s7, s5
2627 ; GFX9-NEXT: s_or_b32 s4, s5, s4
2628 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0
2629 ; GFX9-NEXT: s_cselect_b32 s0, s4, s0
2630 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1
2631 ; GFX9-NEXT: s_cselect_b32 s1, s4, s1
2632 ; GFX9-NEXT: s_cmp_eq_u32 s6, 2
2633 ; GFX9-NEXT: s_cselect_b32 s2, s4, s2
2634 ; GFX9-NEXT: s_cmp_eq_u32 s6, 3
2635 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3
2636 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2637 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2638 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2639 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2640 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
2641 ; GFX9-NEXT: s_endpgm
2643 ; GFX8-LABEL: insertelement_s_v16i8_s_s:
2645 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
2646 ; GFX8-NEXT: s_lshr_b32 s6, s5, 2
2647 ; GFX8-NEXT: s_cmp_eq_u32 s6, 1
2648 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
2649 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
2650 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2651 ; GFX8-NEXT: s_cselect_b32 s7, s1, s0
2652 ; GFX8-NEXT: s_cmp_eq_u32 s6, 2
2653 ; GFX8-NEXT: s_cselect_b32 s7, s2, s7
2654 ; GFX8-NEXT: s_cmp_eq_u32 s6, 3
2655 ; GFX8-NEXT: s_cselect_b32 s7, s3, s7
2656 ; GFX8-NEXT: s_and_b32 s5, s5, 3
2657 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3
2658 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff
2659 ; GFX8-NEXT: s_lshl_b32 s4, s4, s5
2660 ; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5
2661 ; GFX8-NEXT: s_andn2_b32 s5, s7, s5
2662 ; GFX8-NEXT: s_or_b32 s4, s5, s4
2663 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0
2664 ; GFX8-NEXT: s_cselect_b32 s0, s4, s0
2665 ; GFX8-NEXT: s_cmp_eq_u32 s6, 1
2666 ; GFX8-NEXT: s_cselect_b32 s1, s4, s1
2667 ; GFX8-NEXT: s_cmp_eq_u32 s6, 2
2668 ; GFX8-NEXT: s_cselect_b32 s2, s4, s2
2669 ; GFX8-NEXT: s_cmp_eq_u32 s6, 3
2670 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3
2671 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
2672 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
2673 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
2674 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
2675 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2676 ; GFX8-NEXT: s_endpgm
2678 ; GFX7-LABEL: insertelement_s_v16i8_s_s:
2680 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
2681 ; GFX7-NEXT: s_lshr_b32 s6, s5, 2
2682 ; GFX7-NEXT: s_cmp_eq_u32 s6, 1
2683 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2684 ; GFX7-NEXT: s_cselect_b32 s7, s1, s0
2685 ; GFX7-NEXT: s_cmp_eq_u32 s6, 2
2686 ; GFX7-NEXT: s_cselect_b32 s7, s2, s7
2687 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3
2688 ; GFX7-NEXT: s_cselect_b32 s7, s3, s7
2689 ; GFX7-NEXT: s_and_b32 s5, s5, 3
2690 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3
2691 ; GFX7-NEXT: s_and_b32 s4, s4, 0xff
2692 ; GFX7-NEXT: s_lshl_b32 s4, s4, s5
2693 ; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5
2694 ; GFX7-NEXT: s_andn2_b32 s5, s7, s5
2695 ; GFX7-NEXT: s_or_b32 s4, s5, s4
2696 ; GFX7-NEXT: s_cmp_eq_u32 s6, 0
2697 ; GFX7-NEXT: s_cselect_b32 s0, s4, s0
2698 ; GFX7-NEXT: s_cmp_eq_u32 s6, 1
2699 ; GFX7-NEXT: s_cselect_b32 s1, s4, s1
2700 ; GFX7-NEXT: s_cmp_eq_u32 s6, 2
2701 ; GFX7-NEXT: s_cselect_b32 s2, s4, s2
2702 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3
2703 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3
2704 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
2705 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2706 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
2707 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
2708 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
2709 ; GFX7-NEXT: s_mov_b32 s6, -1
2710 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
2711 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2712 ; GFX7-NEXT: s_endpgm
2714 ; GFX10-LABEL: insertelement_s_v16i8_s_s:
2716 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
2717 ; GFX10-NEXT: s_lshr_b32 s6, s5, 2
2718 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
2719 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1
2720 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
2721 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2722 ; GFX10-NEXT: s_cselect_b32 s7, s1, s0
2723 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2
2724 ; GFX10-NEXT: s_cselect_b32 s7, s2, s7
2725 ; GFX10-NEXT: s_cmp_eq_u32 s6, 3
2726 ; GFX10-NEXT: s_cselect_b32 s7, s3, s7
2727 ; GFX10-NEXT: s_and_b32 s5, s5, 3
2728 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff
2729 ; GFX10-NEXT: s_lshl_b32 s5, s5, 3
2730 ; GFX10-NEXT: s_lshl_b32 s8, 0xff, s5
2731 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5
2732 ; GFX10-NEXT: s_andn2_b32 s5, s7, s8
2733 ; GFX10-NEXT: s_or_b32 s4, s5, s4
2734 ; GFX10-NEXT: s_cmp_eq_u32 s6, 0
2735 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0
2736 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1
2737 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1
2738 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2
2739 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2
2740 ; GFX10-NEXT: s_cmp_eq_u32 s6, 3
2741 ; GFX10-NEXT: s_cselect_b32 s3, s4, s3
2742 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
2743 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
2744 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
2745 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
2746 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
2747 ; GFX10-NEXT: s_endpgm
2749 ; GFX11-LABEL: insertelement_s_v16i8_s_s:
2751 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
2752 ; GFX11-NEXT: s_lshr_b32 s6, s5, 2
2753 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2754 ; GFX11-NEXT: s_cmp_eq_u32 s6, 1
2755 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2756 ; GFX11-NEXT: s_cselect_b32 s7, s1, s0
2757 ; GFX11-NEXT: s_cmp_eq_u32 s6, 2
2758 ; GFX11-NEXT: s_cselect_b32 s7, s2, s7
2759 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3
2760 ; GFX11-NEXT: s_cselect_b32 s7, s3, s7
2761 ; GFX11-NEXT: s_and_b32 s5, s5, 3
2762 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff
2763 ; GFX11-NEXT: s_lshl_b32 s5, s5, 3
2764 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
2765 ; GFX11-NEXT: s_lshl_b32 s8, 0xff, s5
2766 ; GFX11-NEXT: s_lshl_b32 s4, s4, s5
2767 ; GFX11-NEXT: s_and_not1_b32 s5, s7, s8
2768 ; GFX11-NEXT: s_or_b32 s4, s5, s4
2769 ; GFX11-NEXT: s_cmp_eq_u32 s6, 0
2770 ; GFX11-NEXT: s_cselect_b32 s0, s4, s0
2771 ; GFX11-NEXT: s_cmp_eq_u32 s6, 1
2772 ; GFX11-NEXT: s_cselect_b32 s1, s4, s1
2773 ; GFX11-NEXT: s_cmp_eq_u32 s6, 2
2774 ; GFX11-NEXT: s_cselect_b32 s2, s4, s2
2775 ; GFX11-NEXT: s_cmp_eq_u32 s6, 3
2776 ; GFX11-NEXT: s_cselect_b32 s3, s4, s3
2777 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2778 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s0
2779 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
2780 ; GFX11-NEXT: v_mov_b32_e32 v3, s3
2781 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2782 ; GFX11-NEXT: s_nop 0
2783 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2784 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector from the addrspace(4) pointer, insert %val at
; the dynamic index %idx, and store the result to the null global pointer.
2785 %vec = load <16 x i8>, ptr addrspace(4) %ptr
2786 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
2787 store <16 x i8> %insert, ptr addrspace(1) null
; Dynamic insertelement into <16 x i8> where the vector is loaded through a
; non-inreg global pointer (v[0:1] in the checks) while the inserted value and
; the index are `inreg` arguments (s2 and s3 in the checks).
; The expected code handles the vector as four dwords in v[0:3]: idx >> 2 picks
; the dword and (idx & 3) << 3 is the bit offset of the byte inside it. The
; shifted value and the inverted 0xff << offset mask are computed with s_*
; instructions; because the data is in VGPRs, the dword is picked and written
; back with v_cmp_eq / v_cndmask chains.
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
2791 define amdgpu_ps void @insertelement_v_v16i8_s_s(ptr addrspace(1) %ptr, i8 inreg %val, i32 inreg %idx) {
2792 ; GFX9-LABEL: insertelement_v_v16i8_s_s:
2794 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
2795 ; GFX9-NEXT: s_and_b32 s0, s3, 3
2796 ; GFX9-NEXT: s_lshr_b32 s4, s3, 2
2797 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff
2798 ; GFX9-NEXT: s_lshl_b32 s0, s0, 3
2799 ; GFX9-NEXT: s_lshl_b32 s1, s1, s0
2800 ; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0
2801 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
2802 ; GFX9-NEXT: s_not_b32 s5, s0
2803 ; GFX9-NEXT: v_mov_b32_e32 v6, s1
2804 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
2805 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
2806 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2807 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
2808 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2809 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc
2810 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v2, s[0:1]
2811 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[2:3]
2812 ; GFX9-NEXT: v_and_or_b32 v6, v7, s5, v6
2813 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
2814 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2815 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
2816 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2817 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2818 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
2819 ; GFX9-NEXT: s_endpgm
2821 ; GFX8-LABEL: insertelement_v_v16i8_s_s:
2823 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
2824 ; GFX8-NEXT: s_and_b32 s0, s3, 3
2825 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2
2826 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff
2827 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3
2828 ; GFX8-NEXT: s_lshl_b32 s5, s1, s0
2829 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
2830 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
2831 ; GFX8-NEXT: s_not_b32 s6, s0
2832 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
2833 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
2834 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
2835 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
2836 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2837 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
2838 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
2839 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3]
2840 ; GFX8-NEXT: v_and_b32_e32 v6, s6, v6
2841 ; GFX8-NEXT: v_or_b32_e32 v6, s5, v6
2842 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
2843 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2844 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
2845 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2846 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2847 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2848 ; GFX8-NEXT: s_endpgm
2850 ; GFX7-LABEL: insertelement_v_v16i8_s_s:
2852 ; GFX7-NEXT: s_mov_b32 s10, 0
2853 ; GFX7-NEXT: s_mov_b32 s11, 0xf000
2854 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
2855 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
2856 ; GFX7-NEXT: s_and_b32 s0, s3, 3
2857 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2
2858 ; GFX7-NEXT: s_and_b32 s1, s2, 0xff
2859 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3
2860 ; GFX7-NEXT: s_lshl_b32 s5, s1, s0
2861 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
2862 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
2863 ; GFX7-NEXT: s_not_b32 s6, s0
2864 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
2865 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
2866 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
2867 ; GFX7-NEXT: s_mov_b32 s10, -1
2868 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2869 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
2870 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1]
2871 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3]
2872 ; GFX7-NEXT: v_and_b32_e32 v4, s6, v4
2873 ; GFX7-NEXT: v_or_b32_e32 v4, s5, v4
2874 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
2875 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
2876 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
2877 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
2878 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3]
2879 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2880 ; GFX7-NEXT: s_endpgm
2882 ; GFX10-LABEL: insertelement_v_v16i8_s_s:
2884 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
2885 ; GFX10-NEXT: s_lshr_b32 s4, s3, 2
2886 ; GFX10-NEXT: s_and_b32 s1, s3, 3
2887 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
2888 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2
2889 ; GFX10-NEXT: s_lshl_b32 s3, s1, 3
2890 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3
2891 ; GFX10-NEXT: s_and_b32 s2, s2, 0xff
2892 ; GFX10-NEXT: s_lshl_b32 s5, 0xff, s3
2893 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3
2894 ; GFX10-NEXT: s_not_b32 s3, s5
2895 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2896 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
2897 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0
2898 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1
2899 ; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2
2900 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0
2901 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
2902 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
2903 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
2904 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2
2905 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
2906 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1
2907 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
2908 ; GFX10-NEXT: s_endpgm
2910 ; GFX11-LABEL: insertelement_v_v16i8_s_s:
2912 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
2913 ; GFX11-NEXT: s_lshr_b32 s4, s3, 2
2914 ; GFX11-NEXT: s_and_b32 s1, s3, 3
2915 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1
2916 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s4, 2
2917 ; GFX11-NEXT: s_lshl_b32 s3, s1, 3
2918 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s4, 3
2919 ; GFX11-NEXT: s_and_b32 s2, s2, 0xff
2920 ; GFX11-NEXT: s_lshl_b32 s5, 0xff, s3
2921 ; GFX11-NEXT: s_lshl_b32 s2, s2, s3
2922 ; GFX11-NEXT: s_not_b32 s3, s5
2923 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2924 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo
2925 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2926 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v2, s0
2927 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v3, s1
2928 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
2929 ; GFX11-NEXT: v_and_or_b32 v6, v4, s3, s2
2930 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s4, 0
2931 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
2932 ; GFX11-NEXT: v_mov_b32_e32 v5, 0
2933 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
2934 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
2935 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2
2936 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
2937 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s1
2938 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
2939 ; GFX11-NEXT: s_nop 0
2940 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2941 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector, insert %val at the dynamic index %idx, and
; store the result to the null global pointer.
2942 %vec = load <16 x i8>, ptr addrspace(1) %ptr
2943 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
2944 store <16 x i8> %insert, ptr addrspace(1) null
2948 define amdgpu_ps void @insertelement_s_v16i8_v_s(ptr addrspace(4) inreg %ptr, i8 %val, i32 inreg %idx) {
2949 ; GFX9-LABEL: insertelement_s_v16i8_v_s:
2951 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
2952 ; GFX9-NEXT: s_lshr_b32 s5, s4, 2
2953 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1
2954 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
2955 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
2956 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2957 ; GFX9-NEXT: s_cselect_b32 s6, s1, s0
2958 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2
2959 ; GFX9-NEXT: s_cselect_b32 s6, s2, s6
2960 ; GFX9-NEXT: s_cmp_eq_u32 s5, 3
2961 ; GFX9-NEXT: s_cselect_b32 s6, s3, s6
2962 ; GFX9-NEXT: s_and_b32 s4, s4, 3
2963 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3
2964 ; GFX9-NEXT: s_lshl_b32 s7, 0xff, s4
2965 ; GFX9-NEXT: s_andn2_b32 s6, s6, s7
2966 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2967 ; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1
2968 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2969 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2970 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
2971 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
2972 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2973 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
2974 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2
2975 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2976 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
2977 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2978 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3
2979 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
2980 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
2981 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
2982 ; GFX9-NEXT: s_endpgm
2984 ; GFX8-LABEL: insertelement_s_v16i8_v_s:
2986 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
2987 ; GFX8-NEXT: s_lshr_b32 s5, s4, 2
2988 ; GFX8-NEXT: s_cmp_eq_u32 s5, 1
2989 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
2990 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
2991 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2992 ; GFX8-NEXT: s_cselect_b32 s6, s1, s0
2993 ; GFX8-NEXT: s_cmp_eq_u32 s5, 2
2994 ; GFX8-NEXT: s_cselect_b32 s6, s2, s6
2995 ; GFX8-NEXT: s_cmp_eq_u32 s5, 3
2996 ; GFX8-NEXT: s_cselect_b32 s6, s3, s6
2997 ; GFX8-NEXT: s_and_b32 s4, s4, 3
2998 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3
2999 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
3000 ; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4
3001 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3002 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4
3003 ; GFX8-NEXT: v_or_b32_e32 v6, s4, v0
3004 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
3005 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
3006 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
3007 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
3008 ; GFX8-NEXT: v_mov_b32_e32 v2, s2
3009 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
3010 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2
3011 ; GFX8-NEXT: v_mov_b32_e32 v3, s3
3012 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
3013 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3
3014 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
3015 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
3016 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3017 ; GFX8-NEXT: s_endpgm
3019 ; GFX7-LABEL: insertelement_s_v16i8_v_s:
3021 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
3022 ; GFX7-NEXT: s_lshr_b32 s5, s4, 2
3023 ; GFX7-NEXT: s_cmp_eq_u32 s5, 1
3024 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
3025 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
3026 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3027 ; GFX7-NEXT: s_cselect_b32 s6, s1, s0
3028 ; GFX7-NEXT: s_cmp_eq_u32 s5, 2
3029 ; GFX7-NEXT: s_cselect_b32 s6, s2, s6
3030 ; GFX7-NEXT: s_cmp_eq_u32 s5, 3
3031 ; GFX7-NEXT: s_cselect_b32 s6, s3, s6
3032 ; GFX7-NEXT: s_and_b32 s4, s4, 3
3033 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3
3034 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0
3035 ; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4
3036 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4
3037 ; GFX7-NEXT: v_or_b32_e32 v4, s4, v0
3038 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
3039 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
3040 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
3041 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
3042 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
3043 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
3044 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2
3045 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
3046 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
3047 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3
3048 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
3049 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
3050 ; GFX7-NEXT: s_mov_b32 s2, -1
3051 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
3052 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3053 ; GFX7-NEXT: s_endpgm
3055 ; GFX10-LABEL: insertelement_s_v16i8_v_s:
3057 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
3058 ; GFX10-NEXT: s_lshr_b32 s5, s4, 2
3059 ; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v0
3060 ; GFX10-NEXT: s_cmp_eq_u32 s5, 1
3061 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0
3062 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3063 ; GFX10-NEXT: s_cselect_b32 s6, s1, s0
3064 ; GFX10-NEXT: s_cmp_eq_u32 s5, 2
3065 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
3066 ; GFX10-NEXT: s_cselect_b32 s6, s2, s6
3067 ; GFX10-NEXT: s_cmp_eq_u32 s5, 3
3068 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
3069 ; GFX10-NEXT: s_cselect_b32 s6, s3, s6
3070 ; GFX10-NEXT: s_and_b32 s4, s4, 3
3071 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
3072 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3
3073 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
3074 ; GFX10-NEXT: s_lshl_b32 s7, 0xff, s4
3075 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7
3076 ; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6
3077 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
3078 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
3079 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
3080 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1
3081 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
3082 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2
3083 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
3084 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3
3085 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
3086 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3087 ; GFX10-NEXT: s_endpgm
3089 ; GFX11-LABEL: insertelement_s_v16i8_v_s:
3091 ; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x0
3092 ; GFX11-NEXT: s_lshr_b32 s5, s4, 2
3093 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0
3094 ; GFX11-NEXT: s_cmp_eq_u32 s5, 1
3095 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0
3096 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3097 ; GFX11-NEXT: s_cselect_b32 s6, s1, s0
3098 ; GFX11-NEXT: s_cmp_eq_u32 s5, 2
3099 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
3100 ; GFX11-NEXT: s_cselect_b32 s6, s2, s6
3101 ; GFX11-NEXT: s_cmp_eq_u32 s5, 3
3102 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3103 ; GFX11-NEXT: s_cselect_b32 s6, s3, s6
3104 ; GFX11-NEXT: s_and_b32 s4, s4, 3
3105 ; GFX11-NEXT: v_mov_b32_e32 v2, s2
3106 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3
3107 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3108 ; GFX11-NEXT: s_lshl_b32 s7, 0xff, s4
3109 ; GFX11-NEXT: s_and_not1_b32 s6, s6, s7
3110 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3111 ; GFX11-NEXT: v_lshl_or_b32 v6, v4, s4, s6
3112 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
3113 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1
3114 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
3115 ; GFX11-NEXT: v_mov_b32_e32 v5, 0
3116 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo
3117 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 2
3118 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo
3119 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 3
3120 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
3121 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
3122 ; GFX11-NEXT: s_nop 0
3123 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3124 ; GFX11-NEXT: s_endpgm
3125 %vec = load <16 x i8>, ptr addrspace(4) %ptr
3126 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
3127 store <16 x i8> %insert, ptr addrspace(1) null
; Test: insert an i8 passed inreg (%val, seen in s4 below) at an index passed
; as a normal argument (%idx, seen in v0 below) into a <16 x i8> that is loaded
; through an inreg addrspace(4) pointer (s[2:3] below). The result is stored
; to the null addrspace(1) pointer.
; The expected code on every target selects the 32-bit word idx >> 2 of the
; loaded vector, clears the byte at bit offset (idx & 3) << 3 with an inverted
; shifted 0xff mask, ORs in the shifted value, and then writes the merged word
; back into the matching dword with v_cndmask selects.
; The expectations below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
3131 define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8 inreg %val, i32 %idx) {
3132 ; GFX9-LABEL: insertelement_s_v16i8_s_v:
3134 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
3135 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v0
3136 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
3137 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0
3138 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
3139 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3140 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
3141 ; GFX9-NEXT: v_mov_b32_e32 v2, s9
3142 ; GFX9-NEXT: v_mov_b32_e32 v3, s10
3143 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
3144 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
3145 ; GFX9-NEXT: s_movk_i32 s5, 0xff
3146 ; GFX9-NEXT: s_and_b32 s4, s4, 0xff
3147 ; GFX9-NEXT: v_mov_b32_e32 v5, s11
3148 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3149 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
3150 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4
3151 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5
3152 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
3153 ; GFX9-NEXT: v_not_b32_e32 v0, v0
3154 ; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2
3155 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
3156 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
3157 ; GFX9-NEXT: v_mov_b32_e32 v2, s10
3158 ; GFX9-NEXT: v_mov_b32_e32 v3, s11
3159 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
3160 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3161 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
3162 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
3163 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
3164 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
3165 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
3166 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3167 ; GFX9-NEXT: s_endpgm
3169 ; GFX8-LABEL: insertelement_s_v16i8_s_v:
3171 ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
3172 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v0
3173 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
3174 ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0
3175 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
3176 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3177 ; GFX8-NEXT: v_mov_b32_e32 v1, s8
3178 ; GFX8-NEXT: v_mov_b32_e32 v2, s9
3179 ; GFX8-NEXT: v_mov_b32_e32 v3, s10
3180 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
3181 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
3182 ; GFX8-NEXT: s_movk_i32 s5, 0xff
3183 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff
3184 ; GFX8-NEXT: v_mov_b32_e32 v5, s11
3185 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3186 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
3187 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4
3188 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5
3189 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
3190 ; GFX8-NEXT: v_not_b32_e32 v0, v0
3191 ; GFX8-NEXT: v_and_b32_e32 v0, v1, v0
3192 ; GFX8-NEXT: v_or_b32_e32 v6, v0, v2
3193 ; GFX8-NEXT: v_mov_b32_e32 v0, s8
3194 ; GFX8-NEXT: v_mov_b32_e32 v1, s9
3195 ; GFX8-NEXT: v_mov_b32_e32 v2, s10
3196 ; GFX8-NEXT: v_mov_b32_e32 v3, s11
3197 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
3198 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
3199 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
3200 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
3201 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
3202 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
3203 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
3204 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3205 ; GFX8-NEXT: s_endpgm
3207 ; GFX7-LABEL: insertelement_s_v16i8_s_v:
3209 ; GFX7-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
3210 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0
3211 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
3212 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0
3213 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
3214 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3215 ; GFX7-NEXT: v_mov_b32_e32 v1, s8
3216 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
3217 ; GFX7-NEXT: v_mov_b32_e32 v3, s10
3218 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
3219 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
3220 ; GFX7-NEXT: s_and_b32 s4, s4, 0xff
3221 ; GFX7-NEXT: v_mov_b32_e32 v5, s11
3222 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3223 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
3224 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0
3225 ; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0
3226 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
3227 ; GFX7-NEXT: v_not_b32_e32 v0, v0
3228 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0
3229 ; GFX7-NEXT: v_or_b32_e32 v5, v0, v2
3230 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
3231 ; GFX7-NEXT: v_mov_b32_e32 v1, s9
3232 ; GFX7-NEXT: v_mov_b32_e32 v2, s10
3233 ; GFX7-NEXT: v_mov_b32_e32 v3, s11
3234 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
3235 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
3236 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
3237 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
3238 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3]
3239 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
3240 ; GFX7-NEXT: s_mov_b32 s2, -1
3241 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
3242 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3243 ; GFX7-NEXT: s_endpgm
3245 ; GFX10-LABEL: insertelement_s_v16i8_s_v:
3247 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
3248 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 2, v0
3249 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v0
3250 ; GFX10-NEXT: s_and_b32 s1, s4, 0xff
3251 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
3252 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3253 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
3254 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
3255 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
3256 ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1
3257 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
3258 ; GFX10-NEXT: v_not_b32_e32 v5, v2
3259 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3260 ; GFX10-NEXT: v_mov_b32_e32 v0, s9
3261 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
3262 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
3263 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
3264 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
3265 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
3266 ; GFX10-NEXT: v_mov_b32_e32 v2, s10
3267 ; GFX10-NEXT: v_mov_b32_e32 v3, s11
3268 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
3269 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
3270 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
3271 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
3272 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
3273 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
3274 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
3275 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3276 ; GFX10-NEXT: s_endpgm
3278 ; GFX11-LABEL: insertelement_s_v16i8_s_v:
3280 ; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
3281 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 2, v0
3282 ; GFX11-NEXT: v_and_b32_e32 v1, 3, v0
3283 ; GFX11-NEXT: s_and_b32 s1, s4, 0xff
3284 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
3285 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
3286 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
3287 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
3288 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3289 ; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_lshlrev_b32 v1, 3, v1
3290 ; GFX11-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo
3291 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
3292 ; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
3293 ; GFX11-NEXT: v_lshlrev_b32_e64 v4, v1, s1
3294 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
3295 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0
3296 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
3297 ; GFX11-NEXT: v_not_b32_e32 v5, v2
3298 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1
3299 ; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
3300 ; GFX11-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
3301 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3302 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4
3303 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
3304 ; GFX11-NEXT: v_mov_b32_e32 v5, 0
3305 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
3306 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
3307 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
3308 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
3309 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
3310 ; GFX11-NEXT: s_nop 0
3311 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3312 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector, insert %val at the dynamic %idx, store it.
3313 %vec = load <16 x i8>, ptr addrspace(4) %ptr
3314 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
3315 store <16 x i8> %insert, ptr addrspace(1) null
; Test: insert an i8 passed as a normal argument (%val, seen in v0 below) at an
; index passed as a normal argument (%idx, seen in v1 below) into a <16 x i8>
; that is loaded through an inreg addrspace(4) pointer (s[2:3] below). The
; result is stored to the null addrspace(1) pointer.
; The expected code selects the 32-bit word idx >> 2 of the loaded vector,
; replaces the byte at bit offset (idx & 3) << 3 using an inverted shifted
; 0xff mask plus the shifted low byte of %val, and writes the merged word back
; into the matching dword with v_cndmask selects. The low byte of %val is
; taken with an SDWA BYTE_0 source select where the expectations show one, and
; with an explicit v_and 0xff otherwise.
; The expectations below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
3319 define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8 %val, i32 %idx) {
3320 ; GFX9-LABEL: insertelement_s_v16i8_v_v:
3322 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
3323 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v1
3324 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
3325 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1
3326 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
3327 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3328 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
3329 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
3330 ; GFX9-NEXT: v_mov_b32_e32 v5, s6
3331 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
3332 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3333 ; GFX9-NEXT: s_movk_i32 s8, 0xff
3334 ; GFX9-NEXT: v_mov_b32_e32 v6, s7
3335 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
3336 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
3337 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3338 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8
3339 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
3340 ; GFX9-NEXT: v_not_b32_e32 v1, v1
3341 ; GFX9-NEXT: v_and_or_b32 v6, v2, v1, v0
3342 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3343 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3344 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
3345 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
3346 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
3347 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3348 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
3349 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
3350 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
3351 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
3352 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
3353 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3354 ; GFX9-NEXT: s_endpgm
3356 ; GFX8-LABEL: insertelement_s_v16i8_v_v:
3358 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
3359 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1
3360 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
3361 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1
3362 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
3363 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
3364 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
3365 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
3366 ; GFX8-NEXT: v_mov_b32_e32 v5, s6
3367 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
3368 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3369 ; GFX8-NEXT: s_movk_i32 s8, 0xff
3370 ; GFX8-NEXT: v_mov_b32_e32 v6, s7
3371 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
3372 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
3373 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3374 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8
3375 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
3376 ; GFX8-NEXT: v_not_b32_e32 v1, v1
3377 ; GFX8-NEXT: v_and_b32_e32 v1, v2, v1
3378 ; GFX8-NEXT: v_or_b32_e32 v6, v1, v0
3379 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
3380 ; GFX8-NEXT: v_mov_b32_e32 v1, s5
3381 ; GFX8-NEXT: v_mov_b32_e32 v2, s6
3382 ; GFX8-NEXT: v_mov_b32_e32 v3, s7
3383 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
3384 ; GFX8-NEXT: v_mov_b32_e32 v4, 0
3385 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5]
3386 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
3387 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
3388 ; GFX8-NEXT: v_mov_b32_e32 v5, 0
3389 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3]
3390 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3391 ; GFX8-NEXT: s_endpgm
3393 ; GFX7-LABEL: insertelement_s_v16i8_v_v:
3395 ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
3396 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1
3397 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
3398 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1
3399 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4
3400 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3401 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
3402 ; GFX7-NEXT: v_mov_b32_e32 v3, s5
3403 ; GFX7-NEXT: v_mov_b32_e32 v5, s6
3404 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
3405 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3406 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
3407 ; GFX7-NEXT: v_mov_b32_e32 v6, s7
3408 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
3409 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4
3410 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0
3411 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
3412 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3]
3413 ; GFX7-NEXT: v_not_b32_e32 v1, v1
3414 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
3415 ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0
3416 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
3417 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
3418 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
3419 ; GFX7-NEXT: v_mov_b32_e32 v3, s7
3420 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
3421 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
3422 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
3423 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1]
3424 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3]
3425 ; GFX7-NEXT: s_mov_b64 s[0:1], 0
3426 ; GFX7-NEXT: s_mov_b32 s2, -1
3427 ; GFX7-NEXT: s_mov_b32 s3, 0xf000
3428 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3429 ; GFX7-NEXT: s_endpgm
3431 ; GFX10-LABEL: insertelement_s_v16i8_v_v:
3433 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
3434 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 2, v1
3435 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v1
3436 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
3437 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2
3438 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
3439 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
3440 ; GFX10-NEXT: s_mov_b32 null, 0
3441 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
3442 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
3443 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3444 ; GFX10-NEXT: v_not_b32_e32 v5, v3
3445 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3446 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
3447 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
3448 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
3449 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
3450 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
3451 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
3452 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
3453 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
3454 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4
3455 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
3456 ; GFX10-NEXT: v_mov_b32_e32 v5, 0
3457 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
3458 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
3459 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
3460 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
3461 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
3462 ; GFX10-NEXT: s_endpgm
3464 ; GFX11-LABEL: insertelement_s_v16i8_v_v:
3466 ; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
3467 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 2, v1
3468 ; GFX11-NEXT: v_and_b32_e32 v2, 3, v1
3469 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
3470 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
3471 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
3472 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v6
3473 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v6
3474 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v6
3475 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3476 ; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_lshlrev_b32 v2, 3, v2
3477 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
3478 ; GFX11-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo
3479 ; GFX11-NEXT: v_lshlrev_b32_e64 v3, v2, 0xff
3480 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, v2, v0
3481 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3482 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0
3483 ; GFX11-NEXT: v_not_b32_e32 v5, v3
3484 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3485 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1
3486 ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
3487 ; GFX11-NEXT: v_mov_b32_e32 v3, s7
3488 ; GFX11-NEXT: v_and_or_b32 v7, v7, v5, v4
3489 ; GFX11-NEXT: v_mov_b32_e32 v2, s6
3490 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
3491 ; GFX11-NEXT: v_mov_b32_e32 v5, 0
3492 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
3493 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
3494 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v7, s2
3495 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v7, s0
3496 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
3497 ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
3498 ; GFX11-NEXT: s_nop 0
3499 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3500 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector, insert %val at the dynamic %idx, store it.
3501 %vec = load <16 x i8>, ptr addrspace(4) %ptr
3502 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
3503 store <16 x i8> %insert, ptr addrspace(1) null
; Test: insert an i8 passed inreg (%val, seen in s2 below) at an index passed
; as a normal argument (%idx, seen in v2 below) into a <16 x i8> that is loaded
; through a non-inreg addrspace(1) pointer (v[0:1] below). The result is
; stored to the null addrspace(1) pointer.
; Because the vector arrives through a vector-memory load, the expected code
; computes the word index (idx >> 2), the bit offset ((idx & 3) << 3), the
; inverted shifted 0xff mask and the shifted value before the vmcnt wait. It
; then selects the target word, merges the byte with and/or, and writes the
; merged word back into the matching dword with v_cndmask selects.
; The expectations below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
3507 define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg %val, i32 %idx) {
3508 ; GFX9-LABEL: insertelement_v_v16i8_s_v:
3510 ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
3511 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 2, v2
3512 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v2
3513 ; GFX9-NEXT: s_movk_i32 s0, 0xff
3514 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff
3515 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3516 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3517 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1
3518 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
3519 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
3520 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
3521 ; GFX9-NEXT: v_not_b32_e32 v1, v1
3522 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
3523 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
3524 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
3525 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3526 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
3527 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
3528 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
3529 ; GFX9-NEXT: v_and_or_b32 v9, v9, v1, v2
3530 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
3531 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
3532 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
3533 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
3534 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
3535 ; GFX9-NEXT: s_endpgm
3537 ; GFX8-LABEL: insertelement_v_v16i8_s_v:
3539 ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
3540 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 2, v2
3541 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v2
3542 ; GFX8-NEXT: s_movk_i32 s0, 0xff
3543 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff
3544 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3545 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3546 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1
3547 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
3548 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
3549 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
3550 ; GFX8-NEXT: v_not_b32_e32 v1, v1
3551 ; GFX8-NEXT: v_mov_b32_e32 v7, 0
3552 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
3553 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
3554 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3555 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc
3556 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1]
3557 ; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3]
3558 ; GFX8-NEXT: v_and_b32_e32 v1, v9, v1
3559 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v2
3560 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
3561 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
3562 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
3563 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
3564 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
3565 ; GFX8-NEXT: s_endpgm
3567 ; GFX7-LABEL: insertelement_v_v16i8_s_v:
3569 ; GFX7-NEXT: s_mov_b32 s10, 0
3570 ; GFX7-NEXT: s_mov_b32 s11, 0xf000
3571 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
3572 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
3573 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v2
3574 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v2
3575 ; GFX7-NEXT: s_and_b32 s0, s2, 0xff
3576 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3577 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3578 ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1
3579 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
3580 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
3581 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
3582 ; GFX7-NEXT: v_not_b32_e32 v1, v1
3583 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
3584 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
3585 ; GFX7-NEXT: s_mov_b32 s10, -1
3586 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3587 ; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc
3588 ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
3589 ; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3]
3590 ; GFX7-NEXT: v_and_b32_e32 v1, v7, v1
3591 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v2
3592 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
3593 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
3594 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1]
3595 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3]
3596 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
3597 ; GFX7-NEXT: s_endpgm
3599 ; GFX10-LABEL: insertelement_v_v16i8_s_v:
3601 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
3602 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v2
3603 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2
3604 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff
3605 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
3606 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
3607 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
3608 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3609 ; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xff
3610 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1
3611 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
3612 ; GFX10-NEXT: v_not_b32_e32 v7, v7
3613 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3614 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
3615 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
3616 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1
3617 ; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0
3618 ; GFX10-NEXT: v_mov_b32_e32 v7, 0
3619 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
3620 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2
3621 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
3622 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0
3623 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1
3624 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
3625 ; GFX10-NEXT: s_endpgm
3627 ; GFX11-LABEL: insertelement_v_v16i8_s_v:
3629 ; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
3630 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v2
3631 ; GFX11-NEXT: v_and_b32_e32 v0, 3, v2
3632 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff
3633 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
3634 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
3635 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
3636 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
3637 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3638 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3639 ; GFX11-NEXT: v_lshlrev_b32_e64 v7, v0, 0xff
3640 ; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, s1
3641 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
3642 ; GFX11-NEXT: v_not_b32_e32 v7, v7
3643 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3644 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo
3645 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3646 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0
3647 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1
3648 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3649 ; GFX11-NEXT: v_and_or_b32 v9, v2, v7, v0
3650 ; GFX11-NEXT: v_mov_b32_e32 v7, 0
3651 ; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_cndmask_b32 v1, v4, v9
3652 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2
3653 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0
3654 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1
3655 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off
3656 ; GFX11-NEXT: s_nop 0
3657 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3658 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector, insert %val at the dynamic %idx, store it.
3659 %vec = load <16 x i8>, ptr addrspace(1) %ptr
3660 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
3661 store <16 x i8> %insert, ptr addrspace(1) null
; Test: insert an i8 passed as a normal argument (%val, seen in v2 below) at an
; index passed inreg (%idx, seen in s2 below) into a <16 x i8> that is loaded
; through a non-inreg addrspace(1) pointer (v[0:1] below). The result is
; stored to the null addrspace(1) pointer.
; With the index in a scalar register, the expected code derives the word
; index (idx >> 2), the bit offset ((idx & 3) << 3) and the inverted shifted
; 0xff mask with scalar s_lshr/s_and/s_lshl/s_not instructions, and the word
; compares take the scalar index directly as a v_cmp operand. The target word
; is still selected, merged and written back with v_cndmask selects because
; the loaded vector is held in vector registers.
; The expectations below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
3665 define amdgpu_ps void @insertelement_v_v16i8_v_s(ptr addrspace(1) %ptr, i8 %val, i32 inreg %idx) {
3666 ; GFX9-LABEL: insertelement_v_v16i8_v_s:
3668 ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
3669 ; GFX9-NEXT: s_and_b32 s0, s2, 3
3670 ; GFX9-NEXT: s_lshr_b32 s4, s2, 2
3671 ; GFX9-NEXT: s_lshl_b32 s0, s0, 3
3672 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3673 ; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0
3674 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
3675 ; GFX9-NEXT: s_not_b32 s5, s0
3676 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
3677 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
3678 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
3679 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
3680 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3681 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
3682 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
3683 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
3684 ; GFX9-NEXT: v_and_or_b32 v9, v1, s5, v0
3685 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
3686 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
3687 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
3688 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
3689 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
3690 ; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
3691 ; GFX9-NEXT: s_endpgm
3693 ; GFX8-LABEL: insertelement_v_v16i8_v_s:
3695 ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
3696 ; GFX8-NEXT: s_and_b32 s0, s2, 3
3697 ; GFX8-NEXT: s_lshr_b32 s4, s2, 2
3698 ; GFX8-NEXT: s_lshl_b32 s0, s0, 3
3699 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
3700 ; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0
3701 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
3702 ; GFX8-NEXT: s_not_b32 s5, s0
3703 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
3704 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
3705 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3706 ; GFX8-NEXT: v_mov_b32_e32 v7, 0
3707 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
3708 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3709 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
3710 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
3711 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
3712 ; GFX8-NEXT: v_and_b32_e32 v1, s5, v1
3713 ; GFX8-NEXT: v_or_b32_e32 v9, v1, v0
3714 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
3715 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v9, s[4:5]
3716 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
3717 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v9, s[0:1]
3718 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v9, s[2:3]
3719 ; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
3720 ; GFX8-NEXT: s_endpgm
3722 ; GFX7-LABEL: insertelement_v_v16i8_v_s:
3724 ; GFX7-NEXT: s_mov_b32 s10, 0
3725 ; GFX7-NEXT: s_mov_b32 s11, 0xf000
3726 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
3727 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
3728 ; GFX7-NEXT: s_and_b32 s0, s2, 3
3729 ; GFX7-NEXT: s_lshr_b32 s4, s2, 2
3730 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v2
3731 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3
3732 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0
3733 ; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0
3734 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
3735 ; GFX7-NEXT: s_not_b32 s5, s0
3736 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2
3737 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3
3738 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
3739 ; GFX7-NEXT: s_mov_b32 s10, -1
3740 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3741 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
3742 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
3743 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
3744 ; GFX7-NEXT: v_and_b32_e32 v1, s5, v1
3745 ; GFX7-NEXT: v_or_b32_e32 v7, v1, v0
3746 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0
3747 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5]
3748 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
3749 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1]
3750 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3]
3751 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
3752 ; GFX7-NEXT: s_endpgm
3754 ; GFX10-LABEL: insertelement_v_v16i8_v_s:
3756 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
3757 ; GFX10-NEXT: s_lshr_b32 s3, s2, 2
3758 ; GFX10-NEXT: s_and_b32 s1, s2, 3
3759 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1
3760 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2
3761 ; GFX10-NEXT: s_lshl_b32 s2, s1, 3
3762 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3
3763 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3764 ; GFX10-NEXT: s_lshl_b32 s2, 0xff, s2
3765 ; GFX10-NEXT: v_mov_b32_e32 v7, 0
3766 ; GFX10-NEXT: s_not_b32 s2, s2
3767 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
3768 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3769 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
3770 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
3771 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1
3772 ; GFX10-NEXT: v_and_or_b32 v9, v0, s2, v1
3773 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0
3774 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
3775 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2
3776 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0
3777 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1
3778 ; GFX10-NEXT: global_store_dwordx4 v[7:8], v[0:3], off
3779 ; GFX10-NEXT: s_endpgm
3781 ; GFX11-LABEL: insertelement_v_v16i8_v_s:
3783 ; GFX11-NEXT: global_load_b128 v[3:6], v[0:1], off
3784 ; GFX11-NEXT: s_lshr_b32 s3, s2, 2
3785 ; GFX11-NEXT: s_and_b32 s1, s2, 3
3786 ; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1
3787 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, s3, 2
3788 ; GFX11-NEXT: s_lshl_b32 s2, s1, 3
3789 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, s3, 3
3790 ; GFX11-NEXT: v_mov_b32_e32 v7, 0
3791 ; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v1, 0xff, v2
3792 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3793 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, s2, v1
3794 ; GFX11-NEXT: s_lshl_b32 s2, 0xff, s2
3795 ; GFX11-NEXT: s_not_b32 s2, s2
3796 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3797 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
3798 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3799 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v5, s0
3800 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s1
3801 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3802 ; GFX11-NEXT: v_and_or_b32 v9, v0, s2, v1
3803 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, s3, 0
3804 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo
3805 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
3806 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2
3807 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0
3808 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v6, v9, s1
3809 ; GFX11-NEXT: global_store_b128 v[7:8], v[0:3], off
3810 ; GFX11-NEXT: s_nop 0
3811 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3812 ; GFX11-NEXT: s_endpgm
; IR under test: load the vector, insert %val at the dynamic %idx, store it.
3813 %vec = load <16 x i8>, ptr addrspace(1) %ptr
3814 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
3815 store <16 x i8> %insert, ptr addrspace(1) null
; insertelement_v_v16i8_v_v: load a <16 x i8> from the addrspace(1) pointer
; %ptr, insert the i8 %val at the dynamic index %idx, and store the result to
; the null addrspace(1) pointer. None of the arguments is marked inreg; in the
; expected output below the pointer is read from v[0:1], the value from v2 and
; the index from v3.
;
; Shape of the expected code on every target checked here:
;   - idx >> 2 selects one of the four loaded dwords (v_cmp_eq against 1, 2
;     and 3 feeding a chain of v_cndmask);
;   - (idx & 3) << 3 is the bit offset of the byte inside that dword;
;   - the selected dword is ANDed with ~(0xff << offset) and ORed with the low
;     byte of %val shifted left by the same offset (one v_and_or_b32 on the
;     gfx900, gfx1010 and gfx1100 runs, separate v_and / v_or on fiji and
;     hawaii);
;   - a second group of v_cndmask (compares against 0, 1, 2 and 3) puts the
;     merged dword back into the matching dword of the 128-bit store.
;
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3819 define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val, i32 %idx) {
3820 ; GFX9-LABEL: insertelement_v_v16i8_v_v:
3822 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
3823 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 2, v3
3824 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v3
3825 ; GFX9-NEXT: s_movk_i32 s0, 0xff
3826 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3827 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3828 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3829 ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
3830 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
3831 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
3832 ; GFX9-NEXT: v_not_b32_e32 v1, v1
3833 ; GFX9-NEXT: v_mov_b32_e32 v8, 0
3834 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
3835 ; GFX9-NEXT: v_mov_b32_e32 v9, 0
3836 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3837 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
3838 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
3839 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
3840 ; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2
3841 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
3842 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
3843 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
3844 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3]
3845 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
3846 ; GFX9-NEXT: s_endpgm
3848 ; GFX8-LABEL: insertelement_v_v16i8_v_v:
3850 ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
3851 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 2, v3
3852 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v3
3853 ; GFX8-NEXT: s_movk_i32 s0, 0xff
3854 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3855 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3856 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3857 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
3858 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
3859 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
3860 ; GFX8-NEXT: v_not_b32_e32 v1, v1
3861 ; GFX8-NEXT: v_mov_b32_e32 v8, 0
3862 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
3863 ; GFX8-NEXT: v_mov_b32_e32 v9, 0
3864 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3865 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
3866 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
3867 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
3868 ; GFX8-NEXT: v_and_b32_e32 v1, v3, v1
3869 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v2
3870 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
3871 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
3872 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
3873 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3]
3874 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
3875 ; GFX8-NEXT: s_endpgm
3877 ; GFX7-LABEL: insertelement_v_v16i8_v_v:
3879 ; GFX7-NEXT: s_mov_b32 s10, 0
3880 ; GFX7-NEXT: s_mov_b32 s11, 0xf000
3881 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
3882 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
3883 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 2, v3
3884 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v3
3885 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3886 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
3887 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1
3888 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
3889 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2
3890 ; GFX7-NEXT: v_lshl_b32_e32 v1, 0xff, v1
3891 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
3892 ; GFX7-NEXT: v_not_b32_e32 v1, v1
3893 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
3894 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
3895 ; GFX7-NEXT: s_mov_b32 s10, -1
3896 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3897 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
3898 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
3899 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
3900 ; GFX7-NEXT: v_and_b32_e32 v1, v3, v1
3901 ; GFX7-NEXT: v_or_b32_e32 v3, v1, v2
3902 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5]
3903 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
3904 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1]
3905 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3]
3906 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
3907 ; GFX7-NEXT: s_endpgm
3909 ; GFX10-LABEL: insertelement_v_v16i8_v_v:
3911 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
3912 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v3
3913 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v3
3914 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
3915 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
3916 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
3917 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
3918 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3919 ; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xff
3920 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
3921 ; GFX10-NEXT: v_not_b32_e32 v2, v8
3922 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
3923 ; GFX10-NEXT: v_mov_b32_e32 v9, 0
3924 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3925 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
3926 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0
3927 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
3928 ; GFX10-NEXT: v_and_or_b32 v3, v3, v2, v0
3929 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2
3930 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
3931 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0
3932 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1
3933 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
3934 ; GFX10-NEXT: s_endpgm
3936 ; GFX11-LABEL: insertelement_v_v16i8_v_v:
3938 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off
3939 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v3
3940 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3941 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
3942 ; GFX11-NEXT: v_and_b32_e32 v0, 3, v3
3943 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
3944 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
3945 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 3, v1
3946 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v1
3947 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3948 ; GFX11-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_lshlrev_b32 v0, 3, v0
3949 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
3950 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0
3951 ; GFX11-NEXT: v_lshlrev_b32_e64 v8, v0, 0xff
3952 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, v0, v2
3953 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
3954 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s1
3955 ; GFX11-NEXT: v_not_b32_e32 v2, v8
3956 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
3957 ; GFX11-NEXT: v_mov_b32_e32 v9, 0
3958 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3959 ; GFX11-NEXT: v_and_or_b32 v3, v3, v2, v0
3960 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v3, s2
3961 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc_lo
3962 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v3, s0
3963 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s1
3964 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off
3965 ; GFX11-NEXT: s_nop 0
3966 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3967 ; GFX11-NEXT: s_endpgm
3968 %vec = load <16 x i8>, ptr addrspace(1) %ptr
3969 %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx
3970 store <16 x i8> %insert, ptr addrspace(1) null