1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
5 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; Test: extract one i128 element from a <4 x i128> vector loaded through an
; addrspace(4) pointer, where both the pointer and the i32 index are passed
; `inreg` to an amdgpu_ps function.
; The gfx900, fiji and hawaii runs share the GCN check prefix for this function;
; gfx1010 and gfx1100 are checked under their own GFX10 / GFX11 prefixes.
; In every expected sequence the whole vector is fetched with a single 16-dword
; scalar load, the index is shifted left by one into m0, and the result is
; produced by two s_movrels_b64 instructions writing s[0:1] and s[2:3].
8 define amdgpu_ps i128 @extractelement_sgpr_v4i128_sgpr_idx(ptr addrspace(4) inreg %ptr, i32 inreg %idx) {
9 ; GCN-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
11 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
12 ; GCN-NEXT: s_lshl_b32 m0, s4, 1
13 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
14 ; GCN-NEXT: s_movrels_b64 s[0:1], s[8:9]
15 ; GCN-NEXT: s_movrels_b64 s[2:3], s[10:11]
16 ; GCN-NEXT: ; return to shader part epilog
18 ; GFX10-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
20 ; GFX10-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0
21 ; GFX10-NEXT: s_lshl_b32 m0, s4, 1
22 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
23 ; GFX10-NEXT: s_movrels_b64 s[0:1], s[8:9]
24 ; GFX10-NEXT: s_movrels_b64 s[2:3], s[10:11]
25 ; GFX10-NEXT: ; return to shader part epilog
27 ; GFX11-LABEL: extractelement_sgpr_v4i128_sgpr_idx:
29 ; GFX11-NEXT: s_load_b512 s[8:23], s[2:3], 0x0
30 ; GFX11-NEXT: s_lshl_b32 m0, s4, 1
31 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
32 ; GFX11-NEXT: s_movrels_b64 s[0:1], s[8:9]
33 ; GFX11-NEXT: s_movrels_b64 s[2:3], s[10:11]
34 ; GFX11-NEXT: ; return to shader part epilog
; IR under test: load the full vector, then extract the element selected by %idx.
35 %vector = load <4 x i128>, ptr addrspace(4) %ptr
36 %element = extractelement <4 x i128> %vector, i32 %idx
; Test: extract one i128 element from a <4 x i128> vector loaded through an
; addrspace(1) pointer that is NOT marked `inreg`, while the i32 index is
; passed `inreg` to an amdgpu_ps function.
; Each run line has its own expected sequence here (GFX9, GFX8, GFX7, GFX10,
; GFX11 prefixes); the shared GCN prefix is not used for this function.
; All expected sequences issue four 16-byte vector loads at offsets 0, 16, 32
; and 48, shift the index left by one twice, and finish by copying the result
; into s0..s3 with v_readfirstlane_b32. GFX9 selects via s_set_gpr_idx_on/off;
; the other subtargets use m0 with v_movrels_b32.
40 define amdgpu_ps i128 @extractelement_vgpr_v4i128_sgpr_idx(ptr addrspace(1) %ptr, i32 inreg %idx) {
41 ; GFX9-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
43 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
44 ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
45 ; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
46 ; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
47 ; GFX9-NEXT: s_lshl_b32 s0, s2, 1
48 ; GFX9-NEXT: s_lshl_b32 s2, s0, 1
49 ; GFX9-NEXT: s_waitcnt vmcnt(0)
50 ; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
51 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
52 ; GFX9-NEXT: v_mov_b32_e32 v1, v3
53 ; GFX9-NEXT: v_mov_b32_e32 v18, v2
54 ; GFX9-NEXT: s_set_gpr_idx_off
55 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
56 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
57 ; GFX9-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
58 ; GFX9-NEXT: v_mov_b32_e32 v3, v3
59 ; GFX9-NEXT: s_set_gpr_idx_off
60 ; GFX9-NEXT: v_readfirstlane_b32 s2, v18
61 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
62 ; GFX9-NEXT: ; return to shader part epilog
64 ; GFX8-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
66 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 16, v0
67 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
68 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 32, v0
69 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
70 ; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
71 ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[6:7]
72 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
73 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
74 ; GFX8-NEXT: flat_load_dwordx4 v[10:13], v[10:11]
75 ; GFX8-NEXT: flat_load_dwordx4 v[14:17], v[0:1]
76 ; GFX8-NEXT: s_lshl_b32 s0, s2, 1
77 ; GFX8-NEXT: s_lshl_b32 m0, s0, 1
78 ; GFX8-NEXT: s_waitcnt vmcnt(0)
79 ; GFX8-NEXT: v_movrels_b32_e32 v1, v3
80 ; GFX8-NEXT: v_movrels_b32_e32 v0, v2
81 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
82 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
83 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
84 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
85 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
86 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
87 ; GFX8-NEXT: ; return to shader part epilog
89 ; GFX7-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
91 ; GFX7-NEXT: s_mov_b32 s6, 0
92 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
93 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
94 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64
95 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16
96 ; GFX7-NEXT: buffer_load_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:32
97 ; GFX7-NEXT: buffer_load_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48
98 ; GFX7-NEXT: s_lshl_b32 s0, s2, 1
99 ; GFX7-NEXT: s_lshl_b32 m0, s0, 1
100 ; GFX7-NEXT: s_waitcnt vmcnt(0)
101 ; GFX7-NEXT: v_movrels_b32_e32 v1, v3
102 ; GFX7-NEXT: v_movrels_b32_e32 v0, v2
103 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
104 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
105 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
106 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
107 ; GFX7-NEXT: v_readfirstlane_b32 s2, v2
108 ; GFX7-NEXT: v_readfirstlane_b32 s3, v3
109 ; GFX7-NEXT: ; return to shader part epilog
111 ; GFX10-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
113 ; GFX10-NEXT: s_clause 0x3
114 ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
115 ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
116 ; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
117 ; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
118 ; GFX10-NEXT: s_lshl_b32 s0, s2, 1
119 ; GFX10-NEXT: s_lshl_b32 m0, s0, 1
120 ; GFX10-NEXT: s_waitcnt vmcnt(0)
121 ; GFX10-NEXT: v_movrels_b32_e32 v1, v3
122 ; GFX10-NEXT: v_movrels_b32_e32 v0, v2
123 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
124 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
125 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
126 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
127 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
128 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
129 ; GFX10-NEXT: ; return to shader part epilog
131 ; GFX11-LABEL: extractelement_vgpr_v4i128_sgpr_idx:
133 ; GFX11-NEXT: s_clause 0x3
134 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off
135 ; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off offset:16
136 ; GFX11-NEXT: global_load_b128 v[10:13], v[0:1], off offset:32
137 ; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off offset:48
138 ; GFX11-NEXT: s_lshl_b32 s0, s2, 1
139 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
140 ; GFX11-NEXT: s_lshl_b32 m0, s0, 1
141 ; GFX11-NEXT: s_waitcnt vmcnt(0)
142 ; GFX11-NEXT: v_movrels_b32_e32 v0, v2
143 ; GFX11-NEXT: v_movrels_b32_e32 v1, v3
144 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
145 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
146 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
147 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
148 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
149 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
150 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2
151 ; GFX11-NEXT: ; return to shader part epilog
; IR under test: load the full vector, then extract the element selected by %idx.
152 %vector = load <4 x i128>, ptr addrspace(1) %ptr
153 %element = extractelement <4 x i128> %vector, i32 %idx
; Test: extract one i128 element from a <4 x i128> vector loaded through an
; addrspace(1) pointer, where neither the pointer nor the i32 index is marked
; `inreg`. Unlike the amdgpu_ps tests in this file, this is a plain `define`,
; and every expected sequence ends with s_setpc_b64 s[30:31].
; Each run line has its own expected sequence (GFX9, GFX8, GFX7, GFX10, GFX11
; prefixes). In all of them the index is shifted left by one, a second value
; "shifted index + 1" is formed, and both values are compared against the
; constants 1 through 7 to drive a chain of v_cndmask_b32 selects over the
; loaded dwords; no movrels / gpr-index instructions appear.
157 define i128 @extractelement_vgpr_v4i128_vgpr_idx(ptr addrspace(1) %ptr, i32 %idx) {
158 ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
160 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161 ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
162 ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
163 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
164 ; GFX9-NEXT: v_add_u32_e32 v16, 1, v2
165 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
166 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
167 ; GFX9-NEXT: s_waitcnt vmcnt(1)
168 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
169 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
170 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
171 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
172 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
173 ; GFX9-NEXT: s_waitcnt vmcnt(0)
174 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc
175 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc
176 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
177 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
178 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
179 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
180 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
181 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
182 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
183 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
184 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
185 ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
186 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
187 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
188 ; GFX9-NEXT: s_waitcnt vmcnt(1)
189 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
190 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
191 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
192 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
193 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
194 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
195 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
196 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
197 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
198 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
199 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
200 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
201 ; GFX9-NEXT: s_waitcnt vmcnt(0)
202 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
203 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
204 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
205 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
206 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
207 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
208 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
209 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
210 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
211 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc
212 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc
213 ; GFX9-NEXT: s_setpc_b64 s[30:31]
215 ; GFX8-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
217 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218 ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1]
219 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0
220 ; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
221 ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8]
222 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 1, v2
223 ; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v16
224 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
225 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
226 ; GFX8-NEXT: s_waitcnt vmcnt(1)
227 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
228 ; GFX8-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
229 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc
230 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
231 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0
232 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
233 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
234 ; GFX8-NEXT: s_waitcnt vmcnt(0)
235 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc
236 ; GFX8-NEXT: v_cndmask_b32_e32 v11, v12, v8, vcc
237 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17
238 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
239 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
240 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
241 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
242 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc
243 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17
244 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
245 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
246 ; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[2:3]
247 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
248 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
249 ; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
250 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
251 ; GFX8-NEXT: s_waitcnt vmcnt(1)
252 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
253 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc
254 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17
255 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc
256 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc
257 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
258 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
259 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
260 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17
261 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
262 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
263 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
264 ; GFX8-NEXT: s_waitcnt vmcnt(0)
265 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
266 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
267 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17
268 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
269 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
270 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
271 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
272 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
273 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v17
274 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v14, vcc
275 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v15, vcc
276 ; GFX8-NEXT: s_setpc_b64 s[30:31]
278 ; GFX7-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
280 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
281 ; GFX7-NEXT: s_mov_b32 s10, 0
282 ; GFX7-NEXT: s_mov_b32 s11, 0xf000
283 ; GFX7-NEXT: s_mov_b64 s[8:9], 0
284 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
285 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[8:11], 0 addr64 offset:16
286 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 1, v2
287 ; GFX7-NEXT: v_add_i32_e32 v16, vcc, 1, v2
288 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
289 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
290 ; GFX7-NEXT: s_waitcnt vmcnt(1)
291 ; GFX7-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
292 ; GFX7-NEXT: v_cndmask_b32_e64 v12, v4, v6, s[4:5]
293 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
294 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
295 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
296 ; GFX7-NEXT: s_waitcnt vmcnt(0)
297 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc
298 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc
299 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
300 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
301 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
302 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
303 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
304 ; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc
305 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
306 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
307 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
308 ; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32
309 ; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[8:11], 0 addr64 offset:48
310 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
311 ; GFX7-NEXT: s_waitcnt vmcnt(1)
312 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc
313 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
314 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
315 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
316 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
317 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
318 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
319 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
320 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
321 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
322 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc
323 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v2
324 ; GFX7-NEXT: s_waitcnt vmcnt(0)
325 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc
326 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
327 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v16
328 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
329 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v13, vcc
330 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v2
331 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
332 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
333 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v16
334 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v14, vcc
335 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v15, vcc
336 ; GFX7-NEXT: s_setpc_b64 s[30:31]
338 ; GFX10-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
340 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341 ; GFX10-NEXT: s_clause 0x1
342 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off
343 ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
344 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v2
345 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
346 ; GFX10-NEXT: v_add_nc_u32_e32 v3, 1, v2
347 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
348 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v3
349 ; GFX10-NEXT: s_waitcnt vmcnt(2)
350 ; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v14, vcc_lo
351 ; GFX10-NEXT: v_cndmask_b32_e32 v17, v13, v15, vcc_lo
352 ; GFX10-NEXT: v_cndmask_b32_e64 v18, v12, v14, s4
353 ; GFX10-NEXT: v_cndmask_b32_e64 v19, v13, v15, s4
354 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
355 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
356 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 2, v3
357 ; GFX10-NEXT: s_waitcnt vmcnt(2)
358 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v4, vcc_lo
359 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v5, vcc_lo
360 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2
361 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s4
362 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v19, v5, s4
363 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 3, v3
364 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
365 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
366 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
367 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v6, s4
368 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v7, s4
369 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 4, v3
370 ; GFX10-NEXT: s_waitcnt vmcnt(1)
371 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
372 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
373 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v2
374 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v8, s4
375 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s4
376 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v3
377 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
378 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
379 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v2
380 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4
381 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4
382 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v3
383 ; GFX10-NEXT: s_waitcnt vmcnt(0)
384 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v12, vcc_lo
385 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc_lo
386 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s4
387 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v2
388 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s4
389 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 7, v3
390 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
391 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
392 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v14, s4
393 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v15, s4
394 ; GFX10-NEXT: s_setpc_b64 s[30:31]
396 ; GFX11-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
398 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
399 ; GFX11-NEXT: s_clause 0x3
400 ; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off
401 ; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
402 ; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off offset:32
403 ; GFX11-NEXT: global_load_b128 v[12:15], v[0:1], off offset:48
404 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v2
405 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
406 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
407 ; GFX11-NEXT: s_waitcnt vmcnt(3)
408 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v19, vcc_lo
409 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v18 :: v_dual_add_nc_u32 v1, 1, v0
410 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
411 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
412 ; GFX11-NEXT: s_waitcnt vmcnt(2)
413 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
414 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5
415 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
416 ; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v18, s0
417 ; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v19, s0
418 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
419 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v6 :: v_dual_cndmask_b32 v3, v3, v7
420 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
421 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
422 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s0
423 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s0
424 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
425 ; GFX11-NEXT: s_waitcnt vmcnt(1)
426 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v8 :: v_dual_cndmask_b32 v3, v3, v9
427 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
428 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
429 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v6, s0
430 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v7, s0
431 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
432 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v10 :: v_dual_cndmask_b32 v3, v3, v11
433 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
434 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v8, s0
435 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
436 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
437 ; GFX11-NEXT: s_waitcnt vmcnt(0)
438 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v2, v12 :: v_dual_cndmask_b32 v3, v3, v13
439 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
440 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
441 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s0
442 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s0
443 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
444 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v14, vcc_lo
445 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s0
446 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
447 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s0
448 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
449 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc_lo
450 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v14, s0
451 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
452 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, v15, s0
453 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; IR under test: load the full vector, then extract the element selected by %idx.
454 %vector = load <4 x i128>, ptr addrspace(1) %ptr
455 %element = extractelement <4 x i128> %vector, i32 %idx
459 define amdgpu_ps i128 @extractelement_sgpr_v4i128_vgpr_idx(ptr addrspace(4) inreg %ptr, i32 %idx) {
460 ; GFX9-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
462 ; GFX9-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
463 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
464 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
465 ; GFX9-NEXT: v_add_u32_e32 v19, 1, v0
466 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
467 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
468 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
469 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
470 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
471 ; GFX9-NEXT: v_mov_b32_e32 v5, s4
472 ; GFX9-NEXT: v_mov_b32_e32 v6, s5
473 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
474 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
475 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
476 ; GFX9-NEXT: v_mov_b32_e32 v7, s6
477 ; GFX9-NEXT: v_mov_b32_e32 v8, s7
478 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
479 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
480 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
481 ; GFX9-NEXT: v_mov_b32_e32 v9, s8
482 ; GFX9-NEXT: v_mov_b32_e32 v10, s9
483 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
484 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
485 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
486 ; GFX9-NEXT: v_mov_b32_e32 v11, s10
487 ; GFX9-NEXT: v_mov_b32_e32 v12, s11
488 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
489 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
490 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
491 ; GFX9-NEXT: v_mov_b32_e32 v13, s12
492 ; GFX9-NEXT: v_mov_b32_e32 v14, s13
493 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
494 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
495 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
496 ; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
497 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
498 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
499 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
500 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
501 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
502 ; GFX9-NEXT: v_mov_b32_e32 v15, s14
503 ; GFX9-NEXT: v_mov_b32_e32 v16, s15
504 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
505 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
506 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
507 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
508 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
509 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
510 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
511 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
512 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
513 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
514 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
515 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
516 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
517 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
518 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
519 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
520 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
521 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
522 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
523 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
524 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0
525 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1
526 ; GFX9-NEXT: v_readfirstlane_b32 s2, v2
527 ; GFX9-NEXT: v_readfirstlane_b32 s3, v3
528 ; GFX9-NEXT: ; return to shader part epilog
530 ; GFX8-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
532 ; GFX8-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
533 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
534 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
535 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
536 ; GFX8-NEXT: v_mov_b32_e32 v1, s0
537 ; GFX8-NEXT: v_mov_b32_e32 v2, s1
538 ; GFX8-NEXT: v_mov_b32_e32 v3, s2
539 ; GFX8-NEXT: v_mov_b32_e32 v4, s3
540 ; GFX8-NEXT: v_mov_b32_e32 v5, s4
541 ; GFX8-NEXT: v_mov_b32_e32 v6, s5
542 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
543 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
544 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
545 ; GFX8-NEXT: v_mov_b32_e32 v7, s6
546 ; GFX8-NEXT: v_mov_b32_e32 v8, s7
547 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
548 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
549 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
550 ; GFX8-NEXT: v_mov_b32_e32 v9, s8
551 ; GFX8-NEXT: v_mov_b32_e32 v10, s9
552 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
553 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
554 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
555 ; GFX8-NEXT: v_mov_b32_e32 v11, s10
556 ; GFX8-NEXT: v_mov_b32_e32 v12, s11
557 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
558 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
559 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
560 ; GFX8-NEXT: v_mov_b32_e32 v13, s12
561 ; GFX8-NEXT: v_mov_b32_e32 v14, s13
562 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
563 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
564 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
565 ; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
566 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
567 ; GFX8-NEXT: v_add_u32_e32 v19, vcc, 1, v0
568 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
569 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
570 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
571 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
572 ; GFX8-NEXT: v_mov_b32_e32 v15, s14
573 ; GFX8-NEXT: v_mov_b32_e32 v16, s15
574 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
575 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
576 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
577 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
578 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
579 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
580 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
581 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
582 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
583 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
584 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
585 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
586 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
587 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
588 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
589 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
590 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
591 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
592 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
593 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
594 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
595 ; GFX8-NEXT: v_readfirstlane_b32 s1, v1
596 ; GFX8-NEXT: v_readfirstlane_b32 s2, v2
597 ; GFX8-NEXT: v_readfirstlane_b32 s3, v3
598 ; GFX8-NEXT: ; return to shader part epilog
600 ; GFX7-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
602 ; GFX7-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
603 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
604 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
605 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
606 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
607 ; GFX7-NEXT: v_mov_b32_e32 v2, s1
608 ; GFX7-NEXT: v_mov_b32_e32 v3, s2
609 ; GFX7-NEXT: v_mov_b32_e32 v4, s3
610 ; GFX7-NEXT: v_mov_b32_e32 v5, s4
611 ; GFX7-NEXT: v_mov_b32_e32 v6, s5
612 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v1, v3, vcc
613 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc
614 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
615 ; GFX7-NEXT: v_mov_b32_e32 v7, s6
616 ; GFX7-NEXT: v_mov_b32_e32 v8, s7
617 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v5, vcc
618 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v6, vcc
619 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
620 ; GFX7-NEXT: v_mov_b32_e32 v9, s8
621 ; GFX7-NEXT: v_mov_b32_e32 v10, s9
622 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v7, vcc
623 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v8, vcc
624 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
625 ; GFX7-NEXT: v_mov_b32_e32 v11, s10
626 ; GFX7-NEXT: v_mov_b32_e32 v12, s11
627 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v9, vcc
628 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v10, vcc
629 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
630 ; GFX7-NEXT: v_mov_b32_e32 v13, s12
631 ; GFX7-NEXT: v_mov_b32_e32 v14, s13
632 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v11, vcc
633 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v12, vcc
634 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
635 ; GFX7-NEXT: v_cndmask_b32_e32 v17, v17, v13, vcc
636 ; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v14, vcc
637 ; GFX7-NEXT: v_add_i32_e32 v19, vcc, 1, v0
638 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
639 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
640 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
641 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19
642 ; GFX7-NEXT: v_mov_b32_e32 v15, s14
643 ; GFX7-NEXT: v_mov_b32_e32 v16, s15
644 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
645 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
646 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1]
647 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 3, v19
648 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v15, vcc
649 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v1, v7, s[0:1]
650 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v18, v16, vcc
651 ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
652 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 4, v19
653 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
654 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
655 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 5, v19
656 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
657 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
658 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 6, v19
659 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
660 ; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v14, vcc
661 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 7, v19
662 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v15, vcc
663 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v16, vcc
664 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0
665 ; GFX7-NEXT: v_readfirstlane_b32 s1, v1
666 ; GFX7-NEXT: v_readfirstlane_b32 s2, v2
667 ; GFX7-NEXT: v_readfirstlane_b32 s3, v3
668 ; GFX7-NEXT: ; return to shader part epilog
670 ; GFX10-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
672 ; GFX10-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0
673 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
674 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v0
675 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
676 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
677 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
678 ; GFX10-NEXT: v_mov_b32_e32 v2, s6
679 ; GFX10-NEXT: v_mov_b32_e32 v3, s7
680 ; GFX10-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo
681 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo
682 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
683 ; GFX10-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0
684 ; GFX10-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0
685 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
686 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
687 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo
688 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
689 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0
690 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0
691 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
692 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
693 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo
694 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
695 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
696 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0
697 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
698 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo
699 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo
700 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
701 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0
702 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0
703 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
704 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo
705 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo
706 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
707 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0
708 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0
709 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
710 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo
711 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo
712 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
713 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0
714 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0
715 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
716 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo
717 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo
718 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0
719 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0
720 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0
721 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1
722 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2
723 ; GFX10-NEXT: v_readfirstlane_b32 s3, v3
724 ; GFX10-NEXT: ; return to shader part epilog
726 ; GFX11-LABEL: extractelement_sgpr_v4i128_vgpr_idx:
728 ; GFX11-NEXT: s_load_b512 s[4:19], s[2:3], 0x0
729 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
730 ; GFX11-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_lshlrev_b32 v0, 1, v0
731 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
732 ; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_add_nc_u32 v1, 1, v0
733 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
734 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1
735 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
736 ; GFX11-NEXT: v_cndmask_b32_e32 v4, s4, v2, vcc_lo
737 ; GFX11-NEXT: v_cndmask_b32_e32 v5, s5, v3, vcc_lo
738 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
739 ; GFX11-NEXT: v_cndmask_b32_e64 v2, s4, v2, s0
740 ; GFX11-NEXT: v_cndmask_b32_e64 v3, s5, v3, s0
741 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 2, v1
742 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s8, vcc_lo
743 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s9, vcc_lo
744 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
745 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
746 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s8, s0
747 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s9, s0
748 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 3, v1
749 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
750 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s11, vcc_lo
751 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
752 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
753 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0
754 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s11, s0
755 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 4, v1
756 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s12, vcc_lo
757 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s13, vcc_lo
758 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
759 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
760 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s12, s0
761 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s13, s0
762 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 5, v1
763 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s14, vcc_lo
764 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s15, vcc_lo
765 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
766 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
767 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s14, s0
768 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s15, s0
769 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 6, v1
770 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s16, vcc_lo
771 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s17, vcc_lo
772 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
773 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
774 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s16, s0
775 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s17, s0
776 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 7, v1
777 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, s18, vcc_lo
778 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, s19, vcc_lo
779 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s18, s0
780 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s19, s0
781 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
782 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0
783 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1
784 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
785 ; GFX11-NEXT: v_readfirstlane_b32 s2, v2
786 ; GFX11-NEXT: v_readfirstlane_b32 s3, v3
787 ; GFX11-NEXT: ; return to shader part epilog
788 %vector = load <4 x i128>, ptr addrspace(4) %ptr
789 %element = extractelement <4 x i128> %vector, i32 %idx
; Constant index 0 into a <4 x i128> loaded through an inreg addrspace(4)
; pointer. For all check prefixes below, the expected output is only the
; 512-bit scalar load into s[0:15] followed by a wait on lgkmcnt; no register
; copies are expected after the load.
793 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx0(ptr addrspace(4) inreg %ptr) {
794 ; GCN-LABEL: extractelement_sgpr_v4i128_idx0:
796 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
797 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
798 ; GCN-NEXT: ; return to shader part epilog
800 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx0:
802 ; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
803 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
804 ; GFX10-NEXT: ; return to shader part epilog
806 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx0:
808 ; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
809 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
810 ; GFX11-NEXT: ; return to shader part epilog
811 %vector = load <4 x i128>, ptr addrspace(4) %ptr
812 %element = extractelement <4 x i128> %vector, i32 0
; Constant index 1 into a <4 x i128> loaded through an inreg addrspace(4)
; pointer. After the 512-bit scalar load into s[0:15], the checks expect
; s[4:7] to be copied into s[0:3] with four s_mov_b32 instructions.
816 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx1(ptr addrspace(4) inreg %ptr) {
817 ; GCN-LABEL: extractelement_sgpr_v4i128_idx1:
819 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
820 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
821 ; GCN-NEXT: s_mov_b32 s0, s4
822 ; GCN-NEXT: s_mov_b32 s1, s5
823 ; GCN-NEXT: s_mov_b32 s2, s6
824 ; GCN-NEXT: s_mov_b32 s3, s7
825 ; GCN-NEXT: ; return to shader part epilog
827 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx1:
829 ; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
830 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
831 ; GFX10-NEXT: s_mov_b32 s0, s4
832 ; GFX10-NEXT: s_mov_b32 s1, s5
833 ; GFX10-NEXT: s_mov_b32 s2, s6
834 ; GFX10-NEXT: s_mov_b32 s3, s7
835 ; GFX10-NEXT: ; return to shader part epilog
837 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx1:
839 ; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
840 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
841 ; GFX11-NEXT: s_mov_b32 s0, s4
842 ; GFX11-NEXT: s_mov_b32 s1, s5
843 ; GFX11-NEXT: s_mov_b32 s2, s6
844 ; GFX11-NEXT: s_mov_b32 s3, s7
845 ; GFX11-NEXT: ; return to shader part epilog
846 %vector = load <4 x i128>, ptr addrspace(4) %ptr
847 %element = extractelement <4 x i128> %vector, i32 1
; Constant index 2 into a <4 x i128> loaded through an inreg addrspace(4)
; pointer. After the 512-bit scalar load into s[0:15], the checks expect
; s[8:11] to be copied into s[0:3] with four s_mov_b32 instructions.
851 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx2(ptr addrspace(4) inreg %ptr) {
852 ; GCN-LABEL: extractelement_sgpr_v4i128_idx2:
854 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
855 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
856 ; GCN-NEXT: s_mov_b32 s0, s8
857 ; GCN-NEXT: s_mov_b32 s1, s9
858 ; GCN-NEXT: s_mov_b32 s2, s10
859 ; GCN-NEXT: s_mov_b32 s3, s11
860 ; GCN-NEXT: ; return to shader part epilog
862 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx2:
864 ; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
865 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
866 ; GFX10-NEXT: s_mov_b32 s0, s8
867 ; GFX10-NEXT: s_mov_b32 s1, s9
868 ; GFX10-NEXT: s_mov_b32 s2, s10
869 ; GFX10-NEXT: s_mov_b32 s3, s11
870 ; GFX10-NEXT: ; return to shader part epilog
872 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx2:
874 ; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
875 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
876 ; GFX11-NEXT: s_mov_b32 s0, s8
877 ; GFX11-NEXT: s_mov_b32 s1, s9
878 ; GFX11-NEXT: s_mov_b32 s2, s10
879 ; GFX11-NEXT: s_mov_b32 s3, s11
880 ; GFX11-NEXT: ; return to shader part epilog
881 %vector = load <4 x i128>, ptr addrspace(4) %ptr
882 %element = extractelement <4 x i128> %vector, i32 2
; Constant index 3 (the last element) into a <4 x i128> loaded through an
; inreg addrspace(4) pointer. After the 512-bit scalar load into s[0:15], the
; checks expect s[12:15] to be copied into s[0:3] with four s_mov_b32
; instructions.
886 define amdgpu_ps i128 @extractelement_sgpr_v4i128_idx3(ptr addrspace(4) inreg %ptr) {
887 ; GCN-LABEL: extractelement_sgpr_v4i128_idx3:
889 ; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
890 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
891 ; GCN-NEXT: s_mov_b32 s0, s12
892 ; GCN-NEXT: s_mov_b32 s1, s13
893 ; GCN-NEXT: s_mov_b32 s2, s14
894 ; GCN-NEXT: s_mov_b32 s3, s15
895 ; GCN-NEXT: ; return to shader part epilog
897 ; GFX10-LABEL: extractelement_sgpr_v4i128_idx3:
899 ; GFX10-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0
900 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
901 ; GFX10-NEXT: s_mov_b32 s0, s12
902 ; GFX10-NEXT: s_mov_b32 s1, s13
903 ; GFX10-NEXT: s_mov_b32 s2, s14
904 ; GFX10-NEXT: s_mov_b32 s3, s15
905 ; GFX10-NEXT: ; return to shader part epilog
907 ; GFX11-LABEL: extractelement_sgpr_v4i128_idx3:
909 ; GFX11-NEXT: s_load_b512 s[0:15], s[2:3], 0x0
910 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
911 ; GFX11-NEXT: s_mov_b32 s0, s12
912 ; GFX11-NEXT: s_mov_b32 s1, s13
913 ; GFX11-NEXT: s_mov_b32 s2, s14
914 ; GFX11-NEXT: s_mov_b32 s3, s15
915 ; GFX11-NEXT: ; return to shader part epilog
916 %vector = load <4 x i128>, ptr addrspace(4) %ptr
917 %element = extractelement <4 x i128> %vector, i32 3
; Constant index 0 into a <4 x i128> loaded through an addrspace(1) pointer
; that the checks address via v[0:1]. Each target is expected to emit a single
; 128-bit load with no offset into v[0:3]: global_load on GFX9/GFX10/GFX11,
; flat_load on GFX8, and an addr64 buffer_load on GFX7 whose s[4:7] operand is
; set up by the preceding s_mov instructions.
921 define i128 @extractelement_vgpr_v4i128_idx0(ptr addrspace(1) %ptr) {
922 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx0:
924 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
925 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
926 ; GFX9-NEXT: s_waitcnt vmcnt(0)
927 ; GFX9-NEXT: s_setpc_b64 s[30:31]
929 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx0:
931 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
932 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
933 ; GFX8-NEXT: s_waitcnt vmcnt(0)
934 ; GFX8-NEXT: s_setpc_b64 s[30:31]
936 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx0:
938 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
939 ; GFX7-NEXT: s_mov_b32 s6, 0
940 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
941 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
942 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
943 ; GFX7-NEXT: s_waitcnt vmcnt(0)
944 ; GFX7-NEXT: s_setpc_b64 s[30:31]
946 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx0:
948 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
949 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
950 ; GFX10-NEXT: s_waitcnt vmcnt(0)
951 ; GFX10-NEXT: s_setpc_b64 s[30:31]
953 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx0:
955 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
956 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
957 ; GFX11-NEXT: s_waitcnt vmcnt(0)
958 ; GFX11-NEXT: s_setpc_b64 s[30:31]
959 %vector = load <4 x i128>, ptr addrspace(1) %ptr
960 %element = extractelement <4 x i128> %vector, i32 0
; Constant index 1 into a <4 x i128> loaded through an addrspace(1) pointer
; that the checks address via v[0:1]. Each target is expected to emit a single
; 128-bit load of the bytes at offset 16 into v[0:3]. GFX9/GFX10/GFX11 and
; GFX7 encode the offset on the load itself (offset:16); GFX8 instead adds 16
; to the 64-bit address with v_add_u32/v_addc_u32 before the flat_load.
964 define i128 @extractelement_vgpr_v4i128_idx1(ptr addrspace(1) %ptr) {
965 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx1:
967 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
968 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:16
969 ; GFX9-NEXT: s_waitcnt vmcnt(0)
970 ; GFX9-NEXT: s_setpc_b64 s[30:31]
972 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx1:
974 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
975 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0
976 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
977 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
978 ; GFX8-NEXT: s_waitcnt vmcnt(0)
979 ; GFX8-NEXT: s_setpc_b64 s[30:31]
981 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx1:
983 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
984 ; GFX7-NEXT: s_mov_b32 s6, 0
985 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
986 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
987 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:16
988 ; GFX7-NEXT: s_waitcnt vmcnt(0)
989 ; GFX7-NEXT: s_setpc_b64 s[30:31]
991 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx1:
993 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:16
995 ; GFX10-NEXT: s_waitcnt vmcnt(0)
996 ; GFX10-NEXT: s_setpc_b64 s[30:31]
998 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx1:
1000 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1001 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16
1002 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1003 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1004 %vector = load <4 x i128>, ptr addrspace(1) %ptr
1005 %element = extractelement <4 x i128> %vector, i32 1
; Constant index 2 into a <4 x i128> loaded through an addrspace(1) pointer
; that the checks address via v[0:1]. Each target is expected to emit a single
; 128-bit load of the bytes at offset 32 into v[0:3]. GFX9/GFX10/GFX11 and
; GFX7 encode the offset on the load itself (offset:32); GFX8 instead adds 32
; to the 64-bit address with v_add_u32/v_addc_u32 before the flat_load.
1009 define i128 @extractelement_vgpr_v4i128_idx2(ptr addrspace(1) %ptr) {
1010 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx2:
1012 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1013 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:32
1014 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1015 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1017 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx2:
1019 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1020 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
1021 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1022 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1023 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1024 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1026 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx2:
1028 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1029 ; GFX7-NEXT: s_mov_b32 s6, 0
1030 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1031 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1032 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:32
1033 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1034 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1036 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx2:
1038 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1039 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:32
1040 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1041 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1043 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx2:
1045 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1046 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:32
1047 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1048 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1049 %vector = load <4 x i128>, ptr addrspace(1) %ptr
1050 %element = extractelement <4 x i128> %vector, i32 2
; Constant index 3 (the last element) into a <4 x i128> loaded through an
; addrspace(1) pointer that the checks address via v[0:1]. Each target is
; expected to emit a single 128-bit load of the bytes at offset 48 into
; v[0:3]. GFX9/GFX10/GFX11 and GFX7 encode the offset on the load itself
; (offset:48); GFX8 instead adds 48 to the 64-bit address with
; v_add_u32/v_addc_u32 before the flat_load.
1054 define i128 @extractelement_vgpr_v4i128_idx3(ptr addrspace(1) %ptr) {
1055 ; GFX9-LABEL: extractelement_vgpr_v4i128_idx3:
1057 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1058 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:48
1059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1062 ; GFX8-LABEL: extractelement_vgpr_v4i128_idx3:
1064 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1065 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0
1066 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1067 ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
1068 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1069 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1071 ; GFX7-LABEL: extractelement_vgpr_v4i128_idx3:
1073 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1074 ; GFX7-NEXT: s_mov_b32 s6, 0
1075 ; GFX7-NEXT: s_mov_b32 s7, 0xf000
1076 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1077 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 offset:48
1078 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1079 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1081 ; GFX10-LABEL: extractelement_vgpr_v4i128_idx3:
1083 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:48
1085 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1086 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1088 ; GFX11-LABEL: extractelement_vgpr_v4i128_idx3:
1090 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1091 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:48
1092 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1094 %vector = load <4 x i128>, ptr addrspace(1) %ptr
1095 %element = extractelement <4 x i128> %vector, i32 3