1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
; Test: take a <4 x i16> slice (elements 0, 1, 2, 2) out of an <8 x i16> that
; arrives through a phi of two volatile global loads, then run a signed
; compare-and-select on the slice. The expected machine code for each RUN
; configuration follows the define line and is autogenerated; regenerate it
; with the update script instead of editing it by hand.
; NOTE(review): the block labels and the ret are not visible in this excerpt;
; presumably %r2 is the returned value - confirm against the full file.
6 define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
7 ; SI-LABEL: vec_8xi16_extract_4xi16:
9 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10 ; SI-NEXT: s_cbranch_scc0 .LBB0_2
11 ; SI-NEXT: ; %bb.1: ; %F
12 ; SI-NEXT: s_mov_b32 s6, 0
13 ; SI-NEXT: s_mov_b32 s7, 0xf000
14 ; SI-NEXT: s_mov_b32 s4, s6
15 ; SI-NEXT: s_mov_b32 s5, s6
16 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
17 ; SI-NEXT: s_waitcnt vmcnt(0)
18 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
19 ; SI-NEXT: s_waitcnt vmcnt(0)
20 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
21 ; SI-NEXT: s_waitcnt vmcnt(0)
22 ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
23 ; SI-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
25 ; SI-NEXT: s_waitcnt vmcnt(0)
26 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
27 ; SI-NEXT: s_waitcnt vmcnt(0)
28 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
29 ; SI-NEXT: s_waitcnt vmcnt(0)
30 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
31 ; SI-NEXT: s_waitcnt vmcnt(0)
32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
33 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
34 ; SI-NEXT: v_or_b32_e32 v3, v6, v2
35 ; SI-NEXT: v_or_b32_e32 v2, v4, v5
36 ; SI-NEXT: s_mov_b64 vcc, exec
37 ; SI-NEXT: s_cbranch_execz .LBB0_3
38 ; SI-NEXT: s_branch .LBB0_4
40 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
41 ; SI-NEXT: s_mov_b64 vcc, 0
42 ; SI-NEXT: .LBB0_3: ; %T
43 ; SI-NEXT: s_mov_b32 s6, 0
44 ; SI-NEXT: s_mov_b32 s7, 0xf000
45 ; SI-NEXT: s_mov_b32 s4, s6
46 ; SI-NEXT: s_mov_b32 s5, s6
47 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
48 ; SI-NEXT: s_waitcnt vmcnt(0)
49 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
50 ; SI-NEXT: s_waitcnt vmcnt(0)
51 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
52 ; SI-NEXT: s_waitcnt vmcnt(0)
53 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
54 ; SI-NEXT: s_waitcnt vmcnt(0)
55 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
56 ; SI-NEXT: s_waitcnt vmcnt(0)
57 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
58 ; SI-NEXT: s_waitcnt vmcnt(0)
59 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
60 ; SI-NEXT: s_waitcnt vmcnt(0)
61 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
62 ; SI-NEXT: s_waitcnt vmcnt(0)
63 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
64 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
65 ; SI-NEXT: v_or_b32_e32 v3, v4, v0
66 ; SI-NEXT: v_or_b32_e32 v2, v2, v1
67 ; SI-NEXT: .LBB0_4: ; %exit
68 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
69 ; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
70 ; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
71 ; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
72 ; SI-NEXT: v_bfrev_b32_e32 v4, 1
73 ; SI-NEXT: v_mov_b32_e32 v5, 0xffff
74 ; SI-NEXT: v_mov_b32_e32 v6, 0x8000
75 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
76 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
77 ; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
78 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
79 ; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
80 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
81 ; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
82 ; SI-NEXT: v_or_b32_e32 v0, v0, v4
83 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
84 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
85 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
86 ; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
87 ; SI-NEXT: s_setpc_b64 s[30:31]
89 ; GFX9-LABEL: vec_8xi16_extract_4xi16:
91 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92 ; GFX9-NEXT: s_cbranch_scc0 .LBB0_2
93 ; GFX9-NEXT: ; %bb.1: ; %F
94 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
95 ; GFX9-NEXT: s_waitcnt vmcnt(0)
96 ; GFX9-NEXT: s_cbranch_execz .LBB0_3
97 ; GFX9-NEXT: s_branch .LBB0_4
99 ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
100 ; GFX9-NEXT: .LBB0_3: ; %T
101 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
102 ; GFX9-NEXT: s_waitcnt vmcnt(0)
103 ; GFX9-NEXT: .LBB0_4: ; %exit
104 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v3 op_sel_hi:[0,0]
105 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
106 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
107 ; GFX9-NEXT: v_or_b32_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
108 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
109 ; GFX9-NEXT: v_or_b32_e32 v2, 0xffff8000, v0
110 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
111 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
112 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
113 ; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
114 ; GFX9-NEXT: s_setpc_b64 s[30:31]
116 ; GFX11-LABEL: vec_8xi16_extract_4xi16:
118 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119 ; GFX11-NEXT: s_cbranch_scc0 .LBB0_2
120 ; GFX11-NEXT: ; %bb.1: ; %F
121 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
122 ; GFX11-NEXT: s_waitcnt vmcnt(0)
123 ; GFX11-NEXT: s_cbranch_execz .LBB0_3
124 ; GFX11-NEXT: s_branch .LBB0_4
125 ; GFX11-NEXT: .LBB0_2:
126 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
127 ; GFX11-NEXT: .LBB0_3: ; %T
128 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
129 ; GFX11-NEXT: s_waitcnt vmcnt(0)
130 ; GFX11-NEXT: .LBB0_4: ; %exit
131 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
132 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
133 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
134 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
135 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
136 ; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1
137 ; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
138 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
139 ; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2
140 ; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3
141 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
142 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
143 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
144 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Branch on an undef condition to either %T or %F.
145 br i1 undef, label %T, label %F
; Each path performs one volatile load of the full 8-element vector.
148 %t = load volatile <8 x i16>, ptr addrspace(1) %p0
152 %f = load volatile <8 x i16>, ptr addrspace(1) %p1
; %m is the vector loaded from %p0 when coming from %T, from %p1 when coming from %F.
156 %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
; Keep elements 0, 1 and 2, and repeat element 2 in the last lane.
157 %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; Lanes that are greater than -1 (signed) become -32768; all other lanes become -1.
158 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
159 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
; Test: same shape as the low-half <8 x i16> case, but the <4 x i16> slice is
; the upper half of the vector (elements 4, 5, 6, 7). The input again comes
; from a phi of two volatile global loads and feeds a signed compare-and-select.
; The expected machine code after the define line is autogenerated; regenerate
; it with the update script instead of editing it by hand.
; NOTE(review): the block labels and the ret are not visible in this excerpt;
; presumably %r2 is the returned value - confirm against the full file.
163 define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
164 ; SI-LABEL: vec_8xi16_extract_4xi16_2:
166 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167 ; SI-NEXT: s_cbranch_scc0 .LBB1_2
168 ; SI-NEXT: ; %bb.1: ; %F
169 ; SI-NEXT: s_mov_b32 s6, 0
170 ; SI-NEXT: s_mov_b32 s7, 0xf000
171 ; SI-NEXT: s_mov_b32 s4, s6
172 ; SI-NEXT: s_mov_b32 s5, s6
173 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
174 ; SI-NEXT: s_waitcnt vmcnt(0)
175 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
176 ; SI-NEXT: s_waitcnt vmcnt(0)
177 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
178 ; SI-NEXT: s_waitcnt vmcnt(0)
179 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
180 ; SI-NEXT: s_waitcnt vmcnt(0)
181 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
182 ; SI-NEXT: s_waitcnt vmcnt(0)
183 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
184 ; SI-NEXT: s_waitcnt vmcnt(0)
185 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
186 ; SI-NEXT: s_waitcnt vmcnt(0)
187 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
188 ; SI-NEXT: s_waitcnt vmcnt(0)
189 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
190 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
191 ; SI-NEXT: v_or_b32_e32 v5, v6, v2
192 ; SI-NEXT: v_or_b32_e32 v4, v4, v3
193 ; SI-NEXT: s_mov_b64 vcc, exec
194 ; SI-NEXT: s_cbranch_execz .LBB1_3
195 ; SI-NEXT: s_branch .LBB1_4
197 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
198 ; SI-NEXT: s_mov_b64 vcc, 0
199 ; SI-NEXT: .LBB1_3: ; %T
200 ; SI-NEXT: s_mov_b32 s6, 0
201 ; SI-NEXT: s_mov_b32 s7, 0xf000
202 ; SI-NEXT: s_mov_b32 s4, s6
203 ; SI-NEXT: s_mov_b32 s5, s6
204 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
205 ; SI-NEXT: s_waitcnt vmcnt(0)
206 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
207 ; SI-NEXT: s_waitcnt vmcnt(0)
208 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
209 ; SI-NEXT: s_waitcnt vmcnt(0)
210 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
211 ; SI-NEXT: s_waitcnt vmcnt(0)
212 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
213 ; SI-NEXT: s_waitcnt vmcnt(0)
214 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
215 ; SI-NEXT: s_waitcnt vmcnt(0)
216 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
217 ; SI-NEXT: s_waitcnt vmcnt(0)
218 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
219 ; SI-NEXT: s_waitcnt vmcnt(0)
220 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
221 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
222 ; SI-NEXT: v_or_b32_e32 v5, v4, v0
223 ; SI-NEXT: v_or_b32_e32 v4, v2, v1
224 ; SI-NEXT: .LBB1_4: ; %exit
225 ; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
226 ; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
227 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
228 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
229 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
230 ; SI-NEXT: v_bfrev_b32_e32 v5, 1
231 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff
232 ; SI-NEXT: v_mov_b32_e32 v7, 0x8000
233 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
234 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
235 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
236 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
237 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
238 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
239 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
240 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
241 ; SI-NEXT: v_or_b32_e32 v0, v1, v8
242 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
243 ; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
244 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
245 ; SI-NEXT: s_setpc_b64 s[30:31]
247 ; GFX9-LABEL: vec_8xi16_extract_4xi16_2:
249 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
250 ; GFX9-NEXT: s_cbranch_scc0 .LBB1_2
251 ; GFX9-NEXT: ; %bb.1: ; %F
252 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
253 ; GFX9-NEXT: s_waitcnt vmcnt(0)
254 ; GFX9-NEXT: s_cbranch_execz .LBB1_3
255 ; GFX9-NEXT: s_branch .LBB1_4
256 ; GFX9-NEXT: .LBB1_2:
257 ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
258 ; GFX9-NEXT: .LBB1_3: ; %T
259 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
261 ; GFX9-NEXT: .LBB1_4: ; %exit
262 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,1]
263 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
264 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
265 ; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
266 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
267 ; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0
268 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
269 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
270 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
271 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
272 ; GFX9-NEXT: s_setpc_b64 s[30:31]
274 ; GFX11-LABEL: vec_8xi16_extract_4xi16_2:
276 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277 ; GFX11-NEXT: s_cbranch_scc0 .LBB1_2
278 ; GFX11-NEXT: ; %bb.1: ; %F
279 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
280 ; GFX11-NEXT: s_waitcnt vmcnt(0)
281 ; GFX11-NEXT: s_cbranch_execz .LBB1_3
282 ; GFX11-NEXT: s_branch .LBB1_4
283 ; GFX11-NEXT: .LBB1_2:
284 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
285 ; GFX11-NEXT: .LBB1_3: ; %T
286 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
287 ; GFX11-NEXT: s_waitcnt vmcnt(0)
288 ; GFX11-NEXT: .LBB1_4: ; %exit
289 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
290 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
291 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
292 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
293 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
294 ; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1
295 ; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
296 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
297 ; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2
298 ; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3
299 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
300 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
301 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
302 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Branch on an undef condition to either %T or %F.
303 br i1 undef, label %T, label %F
; Each path performs one volatile load of the full 8-element vector.
306 %t = load volatile <8 x i16>, ptr addrspace(1) %p0
310 %f = load volatile <8 x i16>, ptr addrspace(1) %p1
; %m is the vector loaded from %p0 when coming from %T, from %p1 when coming from %F.
314 %m = phi <8 x i16> [ %t, %T ], [ %f, %F ]
; Keep the upper half of the vector: elements 4, 5, 6 and 7.
315 %v2 = shufflevector <8 x i16> %m, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Lanes that are greater than -1 (signed) become -32768; all other lanes become -1.
316 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
317 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
; Test: half-precision variant. A <4 x half> slice (elements 0, 1, 2, 2) is
; taken from an <8 x half> that arrives through a phi of two volatile global
; loads, then compared with fcmp ugt against 0xH3800 and used to select
; between two half constants. The expected machine code after the define line
; is autogenerated; regenerate it with the update script instead of editing it
; by hand.
; NOTE(review): the block labels and the ret are not visible in this excerpt;
; presumably %r2 is the returned value - confirm against the full file.
321 define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
322 ; SI-LABEL: vec_8xf16_extract_4xf16:
324 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; SI-NEXT: s_cbranch_scc0 .LBB2_2
326 ; SI-NEXT: ; %bb.1: ; %F
327 ; SI-NEXT: s_mov_b32 s6, 0
328 ; SI-NEXT: s_mov_b32 s7, 0xf000
329 ; SI-NEXT: s_mov_b32 s4, s6
330 ; SI-NEXT: s_mov_b32 s5, s6
331 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
332 ; SI-NEXT: s_waitcnt vmcnt(0)
333 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
334 ; SI-NEXT: s_waitcnt vmcnt(0)
335 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
336 ; SI-NEXT: s_waitcnt vmcnt(0)
337 ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
338 ; SI-NEXT: s_waitcnt vmcnt(0)
339 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
340 ; SI-NEXT: s_waitcnt vmcnt(0)
341 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
342 ; SI-NEXT: s_waitcnt vmcnt(0)
343 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
344 ; SI-NEXT: s_waitcnt vmcnt(0)
345 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:14 glc
346 ; SI-NEXT: s_waitcnt vmcnt(0)
347 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
348 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
349 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
350 ; SI-NEXT: v_or_b32_e32 v2, v6, v2
351 ; SI-NEXT: v_or_b32_e32 v4, v4, v7
352 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
353 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
354 ; SI-NEXT: s_mov_b64 vcc, exec
355 ; SI-NEXT: s_cbranch_execz .LBB2_3
356 ; SI-NEXT: s_branch .LBB2_4
358 ; SI-NEXT: ; implicit-def: $vgpr4
359 ; SI-NEXT: ; implicit-def: $vgpr3
360 ; SI-NEXT: ; implicit-def: $vgpr2
361 ; SI-NEXT: s_mov_b64 vcc, 0
362 ; SI-NEXT: .LBB2_3: ; %T
363 ; SI-NEXT: s_mov_b32 s6, 0
364 ; SI-NEXT: s_mov_b32 s7, 0xf000
365 ; SI-NEXT: s_mov_b32 s4, s6
366 ; SI-NEXT: s_mov_b32 s5, s6
367 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
368 ; SI-NEXT: s_waitcnt vmcnt(0)
369 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
370 ; SI-NEXT: s_waitcnt vmcnt(0)
371 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
372 ; SI-NEXT: s_waitcnt vmcnt(0)
373 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
374 ; SI-NEXT: s_waitcnt vmcnt(0)
375 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
376 ; SI-NEXT: s_waitcnt vmcnt(0)
377 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
378 ; SI-NEXT: s_waitcnt vmcnt(0)
379 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
380 ; SI-NEXT: s_waitcnt vmcnt(0)
381 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:14 glc
382 ; SI-NEXT: s_waitcnt vmcnt(0)
383 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
384 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
385 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
386 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
387 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
388 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
389 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
390 ; SI-NEXT: .LBB2_4: ; %exit
391 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
392 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
393 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
394 ; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000
395 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000
396 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
397 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
398 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
399 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0
400 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
401 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1
402 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
403 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2
404 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
405 ; SI-NEXT: v_mov_b32_e32 v3, v2
406 ; SI-NEXT: s_setpc_b64 s[30:31]
408 ; GFX9-LABEL: vec_8xf16_extract_4xf16:
410 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
411 ; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
412 ; GFX9-NEXT: ; %bb.1: ; %F
413 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off glc
414 ; GFX9-NEXT: s_waitcnt vmcnt(0)
415 ; GFX9-NEXT: s_cbranch_execz .LBB2_3
416 ; GFX9-NEXT: s_branch .LBB2_4
417 ; GFX9-NEXT: .LBB2_2:
418 ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
419 ; GFX9-NEXT: .LBB2_3: ; %T
420 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
422 ; GFX9-NEXT: .LBB2_4: ; %exit
423 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
424 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
425 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
426 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
427 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
428 ; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD
429 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
430 ; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v3
431 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc
432 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v3
433 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
434 ; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5
435 ; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2
436 ; GFX9-NEXT: s_setpc_b64 s[30:31]
438 ; GFX11-LABEL: vec_8xf16_extract_4xf16:
440 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
441 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_2
442 ; GFX11-NEXT: ; %bb.1: ; %F
443 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
444 ; GFX11-NEXT: s_waitcnt vmcnt(0)
445 ; GFX11-NEXT: s_cbranch_execz .LBB2_3
446 ; GFX11-NEXT: s_branch .LBB2_4
447 ; GFX11-NEXT: .LBB2_2:
448 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
449 ; GFX11-NEXT: .LBB2_3: ; %T
450 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
451 ; GFX11-NEXT: s_waitcnt vmcnt(0)
452 ; GFX11-NEXT: .LBB2_4: ; %exit
453 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
454 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
455 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
456 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
457 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
458 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
459 ; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
460 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
461 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
462 ; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
463 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
464 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
465 ; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1
466 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4
467 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Branch on an undef condition to either %T or %F.
468 br i1 undef, label %T, label %F
; Each path performs one volatile load of the full 8-element vector.
471 %t = load volatile <8 x half>, ptr addrspace(1) %p0
475 %f = load volatile <8 x half>, ptr addrspace(1) %p1
; %m is the vector loaded from %p0 when coming from %T, from %p1 when coming from %F.
479 %m = phi <8 x half> [ %t, %T ], [ %f, %F ]
; Keep elements 0, 1 and 2, and repeat element 2 in the last lane.
480 %v2 = shufflevector <8 x half> %m, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; Unordered-or-greater-than compare against 0xH3800 (the expected output above
; compares against 0.5); true lanes select 0xH3900, false lanes select 0xH3D00.
481 %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
482 %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
; Test: wider-source variant. A <4 x i16> slice (elements 0, 1, 2, 2) is taken
; from a <16 x i16> that arrives through a phi of two volatile global loads,
; then fed to a signed compare-and-select. The expected machine code after the
; define line is autogenerated; regenerate it with the update script instead
; of editing it by hand.
; NOTE(review): the block labels and the ret are not visible in this excerpt;
; presumably %r2 is the returned value - confirm against the full file.
486 define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
488 ; SI-LABEL: vec_16xi16_extract_4xi16:
490 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491 ; SI-NEXT: s_cbranch_scc0 .LBB3_2
492 ; SI-NEXT: ; %bb.1: ; %F
493 ; SI-NEXT: s_mov_b32 s6, 0
494 ; SI-NEXT: s_mov_b32 s7, 0xf000
495 ; SI-NEXT: s_mov_b32 s4, s6
496 ; SI-NEXT: s_mov_b32 s5, s6
497 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
498 ; SI-NEXT: s_waitcnt vmcnt(0)
499 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
500 ; SI-NEXT: s_waitcnt vmcnt(0)
501 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
502 ; SI-NEXT: s_waitcnt vmcnt(0)
503 ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
504 ; SI-NEXT: s_waitcnt vmcnt(0)
505 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
506 ; SI-NEXT: s_waitcnt vmcnt(0)
507 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
508 ; SI-NEXT: s_waitcnt vmcnt(0)
509 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
510 ; SI-NEXT: s_waitcnt vmcnt(0)
511 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
512 ; SI-NEXT: s_waitcnt vmcnt(0)
513 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
514 ; SI-NEXT: s_waitcnt vmcnt(0)
515 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
516 ; SI-NEXT: s_waitcnt vmcnt(0)
517 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
518 ; SI-NEXT: s_waitcnt vmcnt(0)
519 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
520 ; SI-NEXT: s_waitcnt vmcnt(0)
521 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
522 ; SI-NEXT: s_waitcnt vmcnt(0)
523 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
524 ; SI-NEXT: s_waitcnt vmcnt(0)
525 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
526 ; SI-NEXT: s_waitcnt vmcnt(0)
527 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
528 ; SI-NEXT: s_waitcnt vmcnt(0)
529 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
530 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
531 ; SI-NEXT: v_or_b32_e32 v3, v6, v2
532 ; SI-NEXT: v_or_b32_e32 v2, v4, v5
533 ; SI-NEXT: s_mov_b64 vcc, exec
534 ; SI-NEXT: s_cbranch_execz .LBB3_3
535 ; SI-NEXT: s_branch .LBB3_4
537 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
538 ; SI-NEXT: s_mov_b64 vcc, 0
539 ; SI-NEXT: .LBB3_3: ; %T
540 ; SI-NEXT: s_mov_b32 s6, 0
541 ; SI-NEXT: s_mov_b32 s7, 0xf000
542 ; SI-NEXT: s_mov_b32 s4, s6
543 ; SI-NEXT: s_mov_b32 s5, s6
544 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
545 ; SI-NEXT: s_waitcnt vmcnt(0)
546 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
547 ; SI-NEXT: s_waitcnt vmcnt(0)
548 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
549 ; SI-NEXT: s_waitcnt vmcnt(0)
550 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
551 ; SI-NEXT: s_waitcnt vmcnt(0)
552 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
553 ; SI-NEXT: s_waitcnt vmcnt(0)
554 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
555 ; SI-NEXT: s_waitcnt vmcnt(0)
556 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
557 ; SI-NEXT: s_waitcnt vmcnt(0)
558 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
559 ; SI-NEXT: s_waitcnt vmcnt(0)
560 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
561 ; SI-NEXT: s_waitcnt vmcnt(0)
562 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
563 ; SI-NEXT: s_waitcnt vmcnt(0)
564 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
565 ; SI-NEXT: s_waitcnt vmcnt(0)
566 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
567 ; SI-NEXT: s_waitcnt vmcnt(0)
568 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
569 ; SI-NEXT: s_waitcnt vmcnt(0)
570 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
571 ; SI-NEXT: s_waitcnt vmcnt(0)
572 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
573 ; SI-NEXT: s_waitcnt vmcnt(0)
574 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
575 ; SI-NEXT: s_waitcnt vmcnt(0)
576 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
577 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
578 ; SI-NEXT: v_or_b32_e32 v3, v4, v0
579 ; SI-NEXT: v_or_b32_e32 v2, v2, v1
580 ; SI-NEXT: .LBB3_4: ; %exit
581 ; SI-NEXT: v_ashrrev_i32_e32 v0, 16, v2
582 ; SI-NEXT: v_bfe_i32 v1, v2, 0, 16
583 ; SI-NEXT: v_bfe_i32 v2, v3, 0, 16
584 ; SI-NEXT: v_mov_b32_e32 v3, 0xffff0000
585 ; SI-NEXT: v_bfrev_b32_e32 v4, 1
586 ; SI-NEXT: v_mov_b32_e32 v5, 0xffff
587 ; SI-NEXT: v_mov_b32_e32 v6, 0x8000
588 ; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
589 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
590 ; SI-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
591 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
592 ; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
593 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
594 ; SI-NEXT: v_cndmask_b32_e32 v1, -1, v7, vcc
595 ; SI-NEXT: v_or_b32_e32 v0, v0, v4
596 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
597 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1
598 ; SI-NEXT: v_or_b32_e32 v2, v3, v2
599 ; SI-NEXT: v_alignbit_b32 v1, v2, v4, 16
600 ; SI-NEXT: s_setpc_b64 s[30:31]
602 ; GFX9-LABEL: vec_16xi16_extract_4xi16:
604 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605 ; GFX9-NEXT: s_cbranch_scc0 .LBB3_2
606 ; GFX9-NEXT: ; %bb.1: ; %F
607 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
608 ; GFX9-NEXT: s_waitcnt vmcnt(0)
609 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
610 ; GFX9-NEXT: s_waitcnt vmcnt(0)
611 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
612 ; GFX9-NEXT: s_cbranch_execz .LBB3_3
613 ; GFX9-NEXT: s_branch .LBB3_4
614 ; GFX9-NEXT: .LBB3_2:
615 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
616 ; GFX9-NEXT: .LBB3_3: ; %T
617 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
618 ; GFX9-NEXT: s_waitcnt vmcnt(0)
619 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
620 ; GFX9-NEXT: s_waitcnt vmcnt(0)
621 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
622 ; GFX9-NEXT: .LBB3_4: ; %exit
623 ; GFX9-NEXT: s_waitcnt vmcnt(0)
624 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v5 op_sel_hi:[0,0]
625 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
626 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
627 ; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
628 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
629 ; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0
630 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
631 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
632 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
633 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
634 ; GFX9-NEXT: s_setpc_b64 s[30:31]
636 ; GFX11-LABEL: vec_16xi16_extract_4xi16:
638 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639 ; GFX11-NEXT: s_cbranch_scc0 .LBB3_2
640 ; GFX11-NEXT: ; %bb.1: ; %F
641 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
642 ; GFX11-NEXT: s_waitcnt vmcnt(0)
643 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
644 ; GFX11-NEXT: s_waitcnt vmcnt(0)
645 ; GFX11-NEXT: s_cbranch_execz .LBB3_3
646 ; GFX11-NEXT: s_branch .LBB3_4
647 ; GFX11-NEXT: .LBB3_2:
648 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
649 ; GFX11-NEXT: .LBB3_3: ; %T
650 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
651 ; GFX11-NEXT: s_waitcnt vmcnt(0)
652 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
653 ; GFX11-NEXT: s_waitcnt vmcnt(0)
654 ; GFX11-NEXT: .LBB3_4: ; %exit
655 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
656 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
657 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
658 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
659 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
660 ; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1
661 ; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
662 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
663 ; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2
664 ; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3
665 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
666 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
667 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
668 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Branch on an undef condition to either %T or %F.
669 br i1 undef, label %T, label %F
; Each path performs one volatile load of the full 16-element vector.
672 %t = load volatile <16 x i16>, ptr addrspace(1) %p0
676 %f = load volatile <16 x i16>, ptr addrspace(1) %p1
; %m is the vector loaded from %p0 when coming from %T, from %p1 when coming from %F.
680 %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
; Keep elements 0, 1 and 2, and repeat element 2 in the last lane.
681 %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; Lanes that are greater than -1 (signed) become -32768; all other lanes become -1.
682 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
683 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
; Test: extract a <4 x i16> subvector from the middle of a <16 x i16> value.
; A <16 x i16> is loaded (volatile) from %p0 on the %T path or from %p1 on the
; %F path, the two are merged by a phi, and lanes 4..7 are selected by the
; shufflevector near the end of the function. Each selected lane is then
; mapped to -32768 when it is greater than -1 (signed compare), else to -1.
;
; The assertions below were autogenerated by utils/update_llc_test_checks.py
; for the three llc configurations of this file (default amdgcn, gfx900 and
; gfx1100). Regenerate them with that script rather than editing them by hand.
687 define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
; Expected output for the default amdgcn target: every branch reads the vector
; with sixteen 16-bit buffer loads (byte offsets 0..30) and only the halves
; at offsets 8..14 feed the v_or results that reach the exit block.
689 ; SI-LABEL: vec_16xi16_extract_4xi16_2:
691 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
692 ; SI-NEXT: s_cbranch_scc0 .LBB4_2
693 ; SI-NEXT: ; %bb.1: ; %F
694 ; SI-NEXT: s_mov_b32 s6, 0
695 ; SI-NEXT: s_mov_b32 s7, 0xf000
696 ; SI-NEXT: s_mov_b32 s4, s6
697 ; SI-NEXT: s_mov_b32 s5, s6
698 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
699 ; SI-NEXT: s_waitcnt vmcnt(0)
700 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:2 glc
701 ; SI-NEXT: s_waitcnt vmcnt(0)
702 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:4 glc
703 ; SI-NEXT: s_waitcnt vmcnt(0)
704 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:6 glc
705 ; SI-NEXT: s_waitcnt vmcnt(0)
706 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 offset:8 glc
707 ; SI-NEXT: s_waitcnt vmcnt(0)
708 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:10 glc
709 ; SI-NEXT: s_waitcnt vmcnt(0)
710 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:12 glc
711 ; SI-NEXT: s_waitcnt vmcnt(0)
712 ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:14 glc
713 ; SI-NEXT: s_waitcnt vmcnt(0)
714 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
715 ; SI-NEXT: s_waitcnt vmcnt(0)
716 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
717 ; SI-NEXT: s_waitcnt vmcnt(0)
718 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
719 ; SI-NEXT: s_waitcnt vmcnt(0)
720 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
721 ; SI-NEXT: s_waitcnt vmcnt(0)
722 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
723 ; SI-NEXT: s_waitcnt vmcnt(0)
724 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
725 ; SI-NEXT: s_waitcnt vmcnt(0)
726 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
727 ; SI-NEXT: s_waitcnt vmcnt(0)
728 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
729 ; SI-NEXT: s_waitcnt vmcnt(0)
730 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
731 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
732 ; SI-NEXT: v_or_b32_e32 v5, v6, v2
733 ; SI-NEXT: v_or_b32_e32 v4, v4, v3
734 ; SI-NEXT: s_mov_b64 vcc, exec
735 ; SI-NEXT: s_cbranch_execz .LBB4_3
736 ; SI-NEXT: s_branch .LBB4_4
738 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
739 ; SI-NEXT: s_mov_b64 vcc, 0
740 ; SI-NEXT: .LBB4_3: ; %T
741 ; SI-NEXT: s_mov_b32 s6, 0
742 ; SI-NEXT: s_mov_b32 s7, 0xf000
743 ; SI-NEXT: s_mov_b32 s4, s6
744 ; SI-NEXT: s_mov_b32 s5, s6
745 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
746 ; SI-NEXT: s_waitcnt vmcnt(0)
747 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:2 glc
748 ; SI-NEXT: s_waitcnt vmcnt(0)
749 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:4 glc
750 ; SI-NEXT: s_waitcnt vmcnt(0)
751 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:6 glc
752 ; SI-NEXT: s_waitcnt vmcnt(0)
753 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 glc
754 ; SI-NEXT: s_waitcnt vmcnt(0)
755 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:10 glc
756 ; SI-NEXT: s_waitcnt vmcnt(0)
757 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:12 glc
758 ; SI-NEXT: s_waitcnt vmcnt(0)
759 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:14 glc
760 ; SI-NEXT: s_waitcnt vmcnt(0)
761 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
762 ; SI-NEXT: s_waitcnt vmcnt(0)
763 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
764 ; SI-NEXT: s_waitcnt vmcnt(0)
765 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
766 ; SI-NEXT: s_waitcnt vmcnt(0)
767 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
768 ; SI-NEXT: s_waitcnt vmcnt(0)
769 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
770 ; SI-NEXT: s_waitcnt vmcnt(0)
771 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
772 ; SI-NEXT: s_waitcnt vmcnt(0)
773 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
774 ; SI-NEXT: s_waitcnt vmcnt(0)
775 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
776 ; SI-NEXT: s_waitcnt vmcnt(0)
777 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
778 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
779 ; SI-NEXT: v_or_b32_e32 v5, v4, v0
780 ; SI-NEXT: v_or_b32_e32 v4, v2, v1
781 ; SI-NEXT: .LBB4_4: ; %exit
782 ; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v4
783 ; SI-NEXT: v_ashr_i64 v[0:1], v[4:5], 48
784 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
785 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
786 ; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
787 ; SI-NEXT: v_bfrev_b32_e32 v5, 1
788 ; SI-NEXT: v_mov_b32_e32 v6, 0xffff
789 ; SI-NEXT: v_mov_b32_e32 v7, 0x8000
790 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
791 ; SI-NEXT: v_cndmask_b32_e32 v8, v4, v5, vcc
792 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
793 ; SI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
794 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
795 ; SI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
796 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
797 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
798 ; SI-NEXT: v_or_b32_e32 v0, v1, v8
799 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
800 ; SI-NEXT: v_alignbit_b32 v1, v2, v8, 16
801 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
802 ; SI-NEXT: s_setpc_b64 s[30:31]
; Expected gfx900 output: each branch reads the vector with two 128-bit global
; loads (offset 16, then offset 0); the exit block consumes v6 and v7 only.
804 ; GFX9-LABEL: vec_16xi16_extract_4xi16_2:
806 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
807 ; GFX9-NEXT: s_cbranch_scc0 .LBB4_2
808 ; GFX9-NEXT: ; %bb.1: ; %F
809 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
810 ; GFX9-NEXT: s_waitcnt vmcnt(0)
811 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
812 ; GFX9-NEXT: s_waitcnt vmcnt(0)
813 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
814 ; GFX9-NEXT: s_cbranch_execz .LBB4_3
815 ; GFX9-NEXT: s_branch .LBB4_4
816 ; GFX9-NEXT: .LBB4_2:
817 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
818 ; GFX9-NEXT: .LBB4_3: ; %T
819 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
820 ; GFX9-NEXT: s_waitcnt vmcnt(0)
821 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
822 ; GFX9-NEXT: s_waitcnt vmcnt(0)
823 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
824 ; GFX9-NEXT: .LBB4_4: ; %exit
825 ; GFX9-NEXT: s_waitcnt vmcnt(0)
826 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v7 op_sel_hi:[0,1]
827 ; GFX9-NEXT: s_movk_i32 s4, 0x8000
828 ; GFX9-NEXT: v_or_b32_e32 v1, 0xffff8000, v0
829 ; GFX9-NEXT: v_or_b32_sdwa v2, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
830 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v6 op_sel_hi:[0,1]
831 ; GFX9-NEXT: v_or_b32_e32 v3, 0xffff8000, v0
832 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
833 ; GFX9-NEXT: s_mov_b32 s4, 0x5040100
834 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
835 ; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
836 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Expected gfx1100 output: each branch reads the vector with two 128-bit global
; loads; the exit block consumes v4 and v5 only.
838 ; GFX11-LABEL: vec_16xi16_extract_4xi16_2:
840 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
841 ; GFX11-NEXT: s_cbranch_scc0 .LBB4_2
842 ; GFX11-NEXT: ; %bb.1: ; %F
843 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
844 ; GFX11-NEXT: s_waitcnt vmcnt(0)
845 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
846 ; GFX11-NEXT: s_waitcnt vmcnt(0)
847 ; GFX11-NEXT: s_cbranch_execz .LBB4_3
848 ; GFX11-NEXT: s_branch .LBB4_4
849 ; GFX11-NEXT: .LBB4_2:
850 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
851 ; GFX11-NEXT: .LBB4_3: ; %T
852 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
853 ; GFX11-NEXT: s_waitcnt vmcnt(0)
854 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
855 ; GFX11-NEXT: s_waitcnt vmcnt(0)
856 ; GFX11-NEXT: .LBB4_4: ; %exit
857 ; GFX11-NEXT: v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
858 ; GFX11-NEXT: v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
859 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
860 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
861 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
862 ; GFX11-NEXT: v_or_b32_e32 v1, 0xffff8000, v1
863 ; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
864 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
865 ; GFX11-NEXT: v_or_b32_e32 v2, 0xffff8000, v2
866 ; GFX11-NEXT: v_or_b32_e32 v3, 0xffff8000, v3
867 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
868 ; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
869 ; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
870 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Two-way branch on an undef condition; the expected output above contains
; code for both the %T and the %F block.
871 br i1 undef, label %T, label %F
; %T path: volatile load of the whole <16 x i16> from %p0.
874 %t = load volatile <16 x i16>, ptr addrspace(1) %p0
; %F path: volatile load of the whole <16 x i16> from %p1.
878 %f = load volatile <16 x i16>, ptr addrspace(1) %p1
; Merge the vector loaded on whichever path was taken.
882 %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
; Keep lanes 4..7 of the merged vector; the mask never indexes the undef operand.
883 %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; Per-lane signed test "lane > -1".
884 %b2 = icmp sgt <4 x i16> %v2, <i16 -1, i16 -1, i16 -1, i16 -1>
; -32768 in the lanes where the test holds, -1 in the others.
885 %r2 = select <4 x i1> %b2, <4 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>
; Test: extract a <4 x half> from the low lanes of a <16 x half> value.
; A <16 x half> is loaded (volatile) from %p0 on the %T path or from %p1 on
; the %F path, the two are merged by a phi, and the shufflevector near the end
; of the function selects lanes 0, 1, 2 and 2 (lane 2 is used twice). Each
; selected lane is compared "unordered or greater than" against 0xH3800 (0.5)
; and mapped to 0xH3900 when the compare holds, else to 0xH3D00.
;
; The assertions below were autogenerated by utils/update_llc_test_checks.py
; for the three llc configurations of this file (default amdgcn, gfx900 and
; gfx1100). Regenerate them with that script rather than editing them by hand.
889 define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
; Expected output for the default amdgcn target: sixteen 16-bit buffer loads
; per branch, with the halves converted to f32 (v_cvt_f32_f16) so that the
; compare against 0.5 is performed in f32.
891 ; SI-LABEL: vec_16xf16_extract_4xf16:
893 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894 ; SI-NEXT: s_cbranch_scc0 .LBB5_2
895 ; SI-NEXT: ; %bb.1: ; %F
896 ; SI-NEXT: s_mov_b32 s6, 0
897 ; SI-NEXT: s_mov_b32 s7, 0xf000
898 ; SI-NEXT: s_mov_b32 s4, s6
899 ; SI-NEXT: s_mov_b32 s5, s6
900 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[4:7], 0 addr64 glc
901 ; SI-NEXT: s_waitcnt vmcnt(0)
902 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[4:7], 0 addr64 offset:2 glc
903 ; SI-NEXT: s_waitcnt vmcnt(0)
904 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[4:7], 0 addr64 offset:4 glc
905 ; SI-NEXT: s_waitcnt vmcnt(0)
906 ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[4:7], 0 addr64 offset:6 glc
907 ; SI-NEXT: s_waitcnt vmcnt(0)
908 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:8 glc
909 ; SI-NEXT: s_waitcnt vmcnt(0)
910 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:10 glc
911 ; SI-NEXT: s_waitcnt vmcnt(0)
912 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:12 glc
913 ; SI-NEXT: s_waitcnt vmcnt(0)
914 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:14 glc
915 ; SI-NEXT: s_waitcnt vmcnt(0)
916 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:16 glc
917 ; SI-NEXT: s_waitcnt vmcnt(0)
918 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:18 glc
919 ; SI-NEXT: s_waitcnt vmcnt(0)
920 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:20 glc
921 ; SI-NEXT: s_waitcnt vmcnt(0)
922 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:22 glc
923 ; SI-NEXT: s_waitcnt vmcnt(0)
924 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:24 glc
925 ; SI-NEXT: s_waitcnt vmcnt(0)
926 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:26 glc
927 ; SI-NEXT: s_waitcnt vmcnt(0)
928 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[4:7], 0 addr64 offset:28 glc
929 ; SI-NEXT: s_waitcnt vmcnt(0)
930 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:30 glc
931 ; SI-NEXT: s_waitcnt vmcnt(0)
932 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
933 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5
934 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
935 ; SI-NEXT: v_or_b32_e32 v2, v6, v2
936 ; SI-NEXT: v_or_b32_e32 v4, v4, v7
937 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
938 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
939 ; SI-NEXT: s_mov_b64 vcc, exec
940 ; SI-NEXT: s_cbranch_execz .LBB5_3
941 ; SI-NEXT: s_branch .LBB5_4
943 ; SI-NEXT: ; implicit-def: $vgpr4
944 ; SI-NEXT: ; implicit-def: $vgpr3
945 ; SI-NEXT: ; implicit-def: $vgpr2
946 ; SI-NEXT: s_mov_b64 vcc, 0
947 ; SI-NEXT: .LBB5_3: ; %T
948 ; SI-NEXT: s_mov_b32 s6, 0
949 ; SI-NEXT: s_mov_b32 s7, 0xf000
950 ; SI-NEXT: s_mov_b32 s4, s6
951 ; SI-NEXT: s_mov_b32 s5, s6
952 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
953 ; SI-NEXT: s_waitcnt vmcnt(0)
954 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 offset:2 glc
955 ; SI-NEXT: s_waitcnt vmcnt(0)
956 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:4 glc
957 ; SI-NEXT: s_waitcnt vmcnt(0)
958 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:6 glc
959 ; SI-NEXT: s_waitcnt vmcnt(0)
960 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:8 glc
961 ; SI-NEXT: s_waitcnt vmcnt(0)
962 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:10 glc
963 ; SI-NEXT: s_waitcnt vmcnt(0)
964 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:12 glc
965 ; SI-NEXT: s_waitcnt vmcnt(0)
966 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:14 glc
967 ; SI-NEXT: s_waitcnt vmcnt(0)
968 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:16 glc
969 ; SI-NEXT: s_waitcnt vmcnt(0)
970 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:18 glc
971 ; SI-NEXT: s_waitcnt vmcnt(0)
972 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:20 glc
973 ; SI-NEXT: s_waitcnt vmcnt(0)
974 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:22 glc
975 ; SI-NEXT: s_waitcnt vmcnt(0)
976 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:24 glc
977 ; SI-NEXT: s_waitcnt vmcnt(0)
978 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:26 glc
979 ; SI-NEXT: s_waitcnt vmcnt(0)
980 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:28 glc
981 ; SI-NEXT: s_waitcnt vmcnt(0)
982 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:30 glc
983 ; SI-NEXT: s_waitcnt vmcnt(0)
984 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5
985 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
986 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
987 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
988 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
989 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1
990 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
991 ; SI-NEXT: .LBB5_4: ; %exit
992 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
993 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
994 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
995 ; SI-NEXT: v_mov_b32_e32 v3, 0x3fa00000
996 ; SI-NEXT: v_mov_b32_e32 v4, 0x3f200000
997 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
998 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
999 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1000 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0
1001 ; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
1002 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1
1003 ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
1004 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v2
1005 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
1006 ; SI-NEXT: v_mov_b32_e32 v3, v2
1007 ; SI-NEXT: s_setpc_b64 s[30:31]
; Expected gfx900 output: two 128-bit global loads per branch and native f16
; compares on v4 and v5 in the exit block.
1009 ; GFX9-LABEL: vec_16xf16_extract_4xf16:
1011 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1012 ; GFX9-NEXT: s_cbranch_scc0 .LBB5_2
1013 ; GFX9-NEXT: ; %bb.1: ; %F
1014 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
1015 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1016 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
1017 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1018 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
1019 ; GFX9-NEXT: s_cbranch_execz .LBB5_3
1020 ; GFX9-NEXT: s_branch .LBB5_4
1021 ; GFX9-NEXT: .LBB5_2:
1022 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
1023 ; GFX9-NEXT: .LBB5_3: ; %T
1024 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
1025 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1026 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
1027 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1028 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
1029 ; GFX9-NEXT: .LBB5_4: ; %exit
1030 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
1031 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1032 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1033 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
1034 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
1035 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
1036 ; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD
1037 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1038 ; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v5
1039 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
1040 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5
1041 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1042 ; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4
1043 ; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3
1044 ; GFX9-NEXT: s_setpc_b64 s[30:31]
; Expected gfx1100 output: two 128-bit global loads per branch and native f16
; compares on v2 and v3 in the exit block.
1046 ; GFX11-LABEL: vec_16xf16_extract_4xf16:
1048 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1049 ; GFX11-NEXT: s_cbranch_scc0 .LBB5_2
1050 ; GFX11-NEXT: ; %bb.1: ; %F
1051 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
1052 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1053 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
1054 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1055 ; GFX11-NEXT: s_cbranch_execz .LBB5_3
1056 ; GFX11-NEXT: s_branch .LBB5_4
1057 ; GFX11-NEXT: .LBB5_2:
1058 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1059 ; GFX11-NEXT: .LBB5_3: ; %T
1060 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
1061 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1062 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
1063 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1064 ; GFX11-NEXT: .LBB5_4: ; %exit
1065 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x3d00
1066 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
1067 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
1068 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1069 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
1070 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
1071 ; GFX11-NEXT: v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
1072 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
1073 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1074 ; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
1075 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
1076 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
1077 ; GFX11-NEXT: v_pack_b32_f16 v0, v2, v1
1078 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v4
1079 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Two-way branch on an undef condition; the expected output above contains
; code for both the %T and the %F block.
1080 br i1 undef, label %T, label %F
; %T path: volatile load of the whole <16 x half> from %p0.
1083 %t = load volatile <16 x half>, ptr addrspace(1) %p0
; %F path: volatile load of the whole <16 x half> from %p1.
1087 %f = load volatile <16 x half>, ptr addrspace(1) %p1
; Merge the vector loaded on whichever path was taken.
1091 %m = phi <16 x half> [ %t, %T ], [ %f, %F ]
; Select lanes 0, 1, 2 and 2 again; the mask never indexes the undef operand.
1092 %v2 = shufflevector <16 x half> %m, <16 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; Per-lane "unordered or greater than 0.5" test (0xH3800 is 0.5).
1093 %b2 = fcmp ugt <4 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
; 0xH3900 in the lanes where the test holds, 0xH3D00 in the others.
1094 %r2 = select <4 x i1> %b2, <4 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <4 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>
; Test: assemble an <8 x i16> from eight scalar i16 loads out of addrspace(3).
; With %idx = %idxp << 4, element i (0..7) is loaded through a
; "getelementptr half" at index (%idx | i). Even elements are loaded with
; align 4 and odd elements with align 2. Each even/odd pair is inserted into
; lanes 0 and 1 of its own temporary vector (%v0..%v3), and three
; shufflevectors then concatenate the four pairs into %z.3.
;
; The expected output reads each adjacent pair as one 32-bit value: four
; ds_read_b32 on the default amdgcn target, two ds_read2_b32 on gfx900 and two
; ds_load_2addr_b32 on gfx1100.
;
; The assertions below were autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
1098 define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
1099 ; SI-LABEL: large_vector:
1101 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102 ; SI-NEXT: v_lshlrev_b32_e32 v1, 5, v1
1103 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1104 ; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0
1105 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0
1106 ; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0
1107 ; SI-NEXT: s_mov_b32 m0, -1
1108 ; SI-NEXT: ds_read_b32 v0, v0
1109 ; SI-NEXT: ds_read_b32 v2, v1
1110 ; SI-NEXT: ds_read_b32 v4, v3
1111 ; SI-NEXT: ds_read_b32 v6, v5
1112 ; SI-NEXT: s_waitcnt lgkmcnt(3)
1113 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1114 ; SI-NEXT: s_waitcnt lgkmcnt(2)
1115 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
1116 ; SI-NEXT: s_waitcnt lgkmcnt(1)
1117 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
1118 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1119 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
1120 ; SI-NEXT: s_setpc_b64 s[30:31]
1122 ; GFX9-LABEL: large_vector:
1124 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1125 ; GFX9-NEXT: v_lshl_add_u32 v2, v1, 5, v0
1126 ; GFX9-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
1127 ; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
1128 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1129 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1131 ; GFX11-LABEL: large_vector:
1133 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1134 ; GFX11-NEXT: v_lshl_add_u32 v2, v1, 5, v0
1135 ; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
1136 ; GFX11-NEXT: ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
1137 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1138 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Base element index: %idxp scaled by 16.
1139 %idx = shl i32 %idxp, 4
; Pair 0: elements %idx|0 and %idx|1 go into lanes 0 and 1 of %v0.
1141 %i.0 = or disjoint i32 %idx, 0
1142 %p.0 = getelementptr half, ptr addrspace(3) %p, i32 %i.0
1143 %x.0 = load i16, ptr addrspace(3) %p.0, align 4
1144 %v0p = insertelement <8 x i16> poison, i16 %x.0, i32 0
1145 %i.1 = or disjoint i32 %idx, 1
1146 %p.1 = getelementptr half, ptr addrspace(3) %p, i32 %i.1
1147 %x.1 = load i16, ptr addrspace(3) %p.1, align 2
1148 %v0 = insertelement <8 x i16> %v0p, i16 %x.1, i32 1
; Pair 1: elements %idx|2 and %idx|3 go into lanes 0 and 1 of %v1.
1150 %i.2 = or disjoint i32 %idx, 2
1151 %p.2 = getelementptr half, ptr addrspace(3) %p, i32 %i.2
1152 %x.2 = load i16, ptr addrspace(3) %p.2, align 4
1153 %v1p = insertelement <8 x i16> poison, i16 %x.2, i32 0
1154 %i.3 = or disjoint i32 %idx, 3
1155 %p.3 = getelementptr half, ptr addrspace(3) %p, i32 %i.3
1156 %x.3 = load i16, ptr addrspace(3) %p.3, align 2
1157 %v1 = insertelement <8 x i16> %v1p, i16 %x.3, i32 1
; Pair 2: elements %idx|4 and %idx|5 go into lanes 0 and 1 of %v2.
1159 %i.4 = or disjoint i32 %idx, 4
1160 %p.4 = getelementptr half, ptr addrspace(3) %p, i32 %i.4
1161 %x.4 = load i16, ptr addrspace(3) %p.4, align 4
1162 %v2p = insertelement <8 x i16> poison, i16 %x.4, i32 0
1163 %i.5 = or disjoint i32 %idx, 5
1164 %p.5 = getelementptr half, ptr addrspace(3) %p, i32 %i.5
1165 %x.5 = load i16, ptr addrspace(3) %p.5, align 2
1166 %v2 = insertelement <8 x i16> %v2p, i16 %x.5, i32 1
; Pair 3: elements %idx|6 and %idx|7 go into lanes 0 and 1 of %v3.
1168 %i.6 = or disjoint i32 %idx, 6
1169 %p.6 = getelementptr half, ptr addrspace(3) %p, i32 %i.6
1170 %x.6 = load i16, ptr addrspace(3) %p.6, align 4
1171 %v3p = insertelement <8 x i16> poison, i16 %x.6, i32 0
1172 %i.7 = or disjoint i32 %idx, 7
1173 %p.7 = getelementptr half, ptr addrspace(3) %p, i32 %i.7
1174 %x.7 = load i16, ptr addrspace(3) %p.7, align 2
1175 %v3 = insertelement <8 x i16> %v3p, i16 %x.7, i32 1
; Concatenate the pairs: %z.1 holds pairs 0-1 in lanes 0..3, %z.2 adds pair 2
; in lanes 4..5, and %z.3 adds pair 3 in lanes 6..7.
1177 %z.1 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
1178 %z.2 = shufflevector <8 x i16> %z.1, <8 x i16> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 undef, i32 undef>
1179 %z.3 = shufflevector <8 x i16> %z.2, <8 x i16> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
1183 define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
1184 ; SI-LABEL: vec_16xi16_extract_8xi16_0:
1186 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1187 ; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32
1188 ; SI-NEXT: s_waitcnt vmcnt(0)
1189 ; SI-NEXT: v_and_b32_e32 v4, 1, v4
1190 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
1191 ; SI-NEXT: s_and_b64 s[34:35], vcc, exec
1192 ; SI-NEXT: s_mov_b32 s38, 0
1193 ; SI-NEXT: s_cbranch_scc0 .LBB7_2
1194 ; SI-NEXT: ; %bb.1: ; %F
1195 ; SI-NEXT: s_mov_b32 s39, 0xf000
1196 ; SI-NEXT: s_mov_b32 s36, s38
1197 ; SI-NEXT: s_mov_b32 s37, s38
1198 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 glc
1199 ; SI-NEXT: s_waitcnt vmcnt(0)
1200 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:2 glc
1201 ; SI-NEXT: s_waitcnt vmcnt(0)
1202 ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
1203 ; SI-NEXT: s_waitcnt vmcnt(0)
1204 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 offset:6 glc
1205 ; SI-NEXT: s_waitcnt vmcnt(0)
1206 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
1207 ; SI-NEXT: s_waitcnt vmcnt(0)
1208 ; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
1209 ; SI-NEXT: s_waitcnt vmcnt(0)
1210 ; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
1211 ; SI-NEXT: s_waitcnt vmcnt(0)
1212 ; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
1213 ; SI-NEXT: s_waitcnt vmcnt(0)
1214 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
1215 ; SI-NEXT: s_waitcnt vmcnt(0)
1216 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:18 glc
1217 ; SI-NEXT: s_waitcnt vmcnt(0)
1218 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:20 glc
1219 ; SI-NEXT: s_waitcnt vmcnt(0)
1220 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:22 glc
1221 ; SI-NEXT: s_waitcnt vmcnt(0)
1222 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:24 glc
1223 ; SI-NEXT: s_waitcnt vmcnt(0)
1224 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:26 glc
1225 ; SI-NEXT: s_waitcnt vmcnt(0)
1226 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:28 glc
1227 ; SI-NEXT: s_waitcnt vmcnt(0)
1228 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
1229 ; SI-NEXT: s_waitcnt vmcnt(0)
1230 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11
1231 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9
1232 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v5
1233 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4
1234 ; SI-NEXT: v_or_b32_e32 v5, v10, v2
1235 ; SI-NEXT: v_or_b32_e32 v4, v8, v3
1236 ; SI-NEXT: v_or_b32_e32 v3, v7, v9
1237 ; SI-NEXT: v_or_b32_e32 v2, v6, v11
1238 ; SI-NEXT: s_mov_b64 vcc, exec
1239 ; SI-NEXT: s_cbranch_execz .LBB7_3
1240 ; SI-NEXT: s_branch .LBB7_4
1242 ; SI-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1243 ; SI-NEXT: s_mov_b64 vcc, 0
1244 ; SI-NEXT: .LBB7_3: ; %T
1245 ; SI-NEXT: s_mov_b32 s39, 0xf000
1246 ; SI-NEXT: s_mov_b32 s36, s38
1247 ; SI-NEXT: s_mov_b32 s37, s38
1248 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 glc
1249 ; SI-NEXT: s_waitcnt vmcnt(0)
1250 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:2 glc
1251 ; SI-NEXT: s_waitcnt vmcnt(0)
1252 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:4 glc
1253 ; SI-NEXT: s_waitcnt vmcnt(0)
1254 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
1255 ; SI-NEXT: s_waitcnt vmcnt(0)
1256 ; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:8 glc
1257 ; SI-NEXT: s_waitcnt vmcnt(0)
1258 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 offset:10 glc
1259 ; SI-NEXT: s_waitcnt vmcnt(0)
1260 ; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:12 glc
1261 ; SI-NEXT: s_waitcnt vmcnt(0)
1262 ; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:14 glc
1263 ; SI-NEXT: s_waitcnt vmcnt(0)
1264 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
1265 ; SI-NEXT: s_waitcnt vmcnt(0)
1266 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc
1267 ; SI-NEXT: s_waitcnt vmcnt(0)
1268 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc
1269 ; SI-NEXT: s_waitcnt vmcnt(0)
1270 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc
1271 ; SI-NEXT: s_waitcnt vmcnt(0)
1272 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc
1273 ; SI-NEXT: s_waitcnt vmcnt(0)
1274 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc
1275 ; SI-NEXT: s_waitcnt vmcnt(0)
1276 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc
1277 ; SI-NEXT: s_waitcnt vmcnt(0)
1278 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
1279 ; SI-NEXT: s_waitcnt vmcnt(0)
1280 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v9
1281 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
1282 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v4
1283 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3
1284 ; SI-NEXT: v_or_b32_e32 v5, v8, v0
1285 ; SI-NEXT: v_or_b32_e32 v4, v7, v1
1286 ; SI-NEXT: v_or_b32_e32 v3, v6, v9
1287 ; SI-NEXT: v_or_b32_e32 v2, v2, v10
1288 ; SI-NEXT: .LBB7_4: ; %exit
1289 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
1290 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
1291 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
1292 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
1293 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
1294 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5
1295 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
1296 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3
1297 ; SI-NEXT: s_movk_i32 s34, 0x3800
1298 ; SI-NEXT: v_mov_b32_e32 v8, 0x3d000000
1299 ; SI-NEXT: v_mov_b32_e32 v9, 0x39000000
1300 ; SI-NEXT: v_mov_b32_e32 v10, 0x3d00
1301 ; SI-NEXT: v_mov_b32_e32 v11, 0x3900
1302 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v0
1303 ; SI-NEXT: v_cndmask_b32_e32 v12, v8, v9, vcc
1304 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v1
1305 ; SI-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc
1306 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v2
1307 ; SI-NEXT: v_cndmask_b32_e32 v13, v8, v9, vcc
1308 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v4
1309 ; SI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc
1310 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v6
1311 ; SI-NEXT: v_cndmask_b32_e32 v14, v8, v9, vcc
1312 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v5
1313 ; SI-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc
1314 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v7
1315 ; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
1316 ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s34, v3
1317 ; SI-NEXT: v_cndmask_b32_e32 v3, v10, v11, vcc
1318 ; SI-NEXT: v_or_b32_e32 v0, v0, v12
1319 ; SI-NEXT: v_or_b32_e32 v4, v1, v13
1320 ; SI-NEXT: v_or_b32_e32 v6, v2, v14
1321 ; SI-NEXT: v_or_b32_e32 v2, v3, v5
1322 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
1323 ; SI-NEXT: v_alignbit_b32 v1, v2, v12, 16
1324 ; SI-NEXT: v_alignbit_b32 v5, v6, v13, 16
1325 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14
1326 ; SI-NEXT: s_setpc_b64 s[30:31]
1328 ; GFX9-LABEL: vec_16xi16_extract_8xi16_0:
1330 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1331 ; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32
1332 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1333 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
1334 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
1335 ; GFX9-NEXT: s_and_b64 s[34:35], vcc, exec
1336 ; GFX9-NEXT: s_cbranch_scc0 .LBB7_2
1337 ; GFX9-NEXT: ; %bb.1: ; %F
1338 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
1339 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1340 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
1341 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1342 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
1343 ; GFX9-NEXT: s_cbranch_execz .LBB7_3
1344 ; GFX9-NEXT: s_branch .LBB7_4
1345 ; GFX9-NEXT: .LBB7_2:
1346 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
1347 ; GFX9-NEXT: .LBB7_3: ; %T
1348 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
1349 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1350 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
1351 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1352 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
1353 ; GFX9-NEXT: .LBB7_4: ; %exit
1354 ; GFX9-NEXT: s_movk_i32 s35, 0x3801
1355 ; GFX9-NEXT: s_movk_i32 s34, 0x3800
1356 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
1357 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
1358 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1359 ; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v7
1360 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1361 ; GFX9-NEXT: v_cmp_gt_u16_sdwa vcc, v7, s34 src0_sel:WORD_1 src1_sel:DWORD
1362 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v0, vcc
1363 ; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v6
1364 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
1365 ; GFX9-NEXT: v_cmp_lt_u16_sdwa vcc, v6, s35 src0_sel:WORD_1 src1_sel:DWORD
1366 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc
1367 ; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v5
1368 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc
1369 ; GFX9-NEXT: v_cmp_lt_u16_sdwa vcc, v5, s35 src0_sel:WORD_1 src1_sel:DWORD
1370 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
1371 ; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, s35, v4
1372 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc
1373 ; GFX9-NEXT: v_cmp_lt_u16_sdwa vcc, v4, s35 src0_sel:WORD_1 src1_sel:DWORD
1374 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1375 ; GFX9-NEXT: s_mov_b32 s34, 0x5040100
1376 ; GFX9-NEXT: v_perm_b32 v0, v0, v9, s34
1377 ; GFX9-NEXT: v_perm_b32 v1, v5, v8, s34
1378 ; GFX9-NEXT: v_perm_b32 v2, v6, v2, s34
1379 ; GFX9-NEXT: v_perm_b32 v3, v7, v3, s34
1380 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1382 ; GFX11-LABEL: vec_16xi16_extract_8xi16_0:
1384 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1385 ; GFX11-NEXT: scratch_load_u8 v4, off, s32
1386 ; GFX11-NEXT: s_mov_b32 s0, 0
1387 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1388 ; GFX11-NEXT: v_and_b32_e32 v4, 1, v4
1389 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1390 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
1391 ; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
1392 ; GFX11-NEXT: s_cbranch_scc0 .LBB7_2
1393 ; GFX11-NEXT: ; %bb.1: ; %F
1394 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
1395 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1396 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
1397 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1398 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
1399 ; GFX11-NEXT: s_cbranch_vccz .LBB7_3
1400 ; GFX11-NEXT: s_branch .LBB7_4
1401 ; GFX11-NEXT: .LBB7_2:
1402 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1403 ; GFX11-NEXT: .LBB7_3: ; %T
1404 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
1405 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1406 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
1407 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1408 ; GFX11-NEXT: .LBB7_4: ; %exit
1409 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
1410 ; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
1411 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
1412 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
1413 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5
1414 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
1415 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1416 ; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
1417 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4
1418 ; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
1419 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7
1420 ; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
1421 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3
1422 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
1423 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2
1424 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
1425 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0
1426 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
1427 ; GFX11-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6
1428 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1429 ; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
1430 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
1431 ; GFX11-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8
1432 ; GFX11-NEXT: v_perm_b32 v2, v7, v4, 0x5040100
1433 ; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
1434 ; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
1435 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1436 ; GFX11-NEXT: v_perm_b32 v3, v6, v5, 0x5040100
1437 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1438 br i1 %cond, label %T, label %F
1441 %t = load volatile <16 x i16>, ptr addrspace(1) %p0
1445 %f = load volatile <16 x i16>, ptr addrspace(1) %p1
1449 %m = phi <16 x i16> [ %t, %T ], [ %f, %F ]
1450 %v2 = shufflevector <16 x i16> %m, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1451 %b2 = icmp ugt <8 x i16> %v2, <i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800, i16 u0x3800>
1452 %r2 = select <8 x i1> %b2, <8 x i16> <i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900, i16 u0x3900>, <8 x i16> <i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00, i16 u0x3D00>
1456 define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
1457 ; SI-LABEL: vec_16xf16_extract_8xf16_0:
1459 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1460 ; SI-NEXT: buffer_load_ubyte v4, off, s[0:3], s32
1461 ; SI-NEXT: s_waitcnt vmcnt(0)
1462 ; SI-NEXT: v_and_b32_e32 v4, 1, v4
1463 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
1464 ; SI-NEXT: s_and_b64 s[34:35], vcc, exec
1465 ; SI-NEXT: s_mov_b32 s38, 0
1466 ; SI-NEXT: s_cbranch_scc0 .LBB8_2
1467 ; SI-NEXT: ; %bb.1: ; %F
1468 ; SI-NEXT: s_mov_b32 s39, 0xf000
1469 ; SI-NEXT: s_mov_b32 s36, s38
1470 ; SI-NEXT: s_mov_b32 s37, s38
1471 ; SI-NEXT: buffer_load_ushort v5, v[2:3], s[36:39], 0 addr64 glc
1472 ; SI-NEXT: s_waitcnt vmcnt(0)
1473 ; SI-NEXT: buffer_load_ushort v6, v[2:3], s[36:39], 0 addr64 offset:2 glc
1474 ; SI-NEXT: s_waitcnt vmcnt(0)
1475 ; SI-NEXT: buffer_load_ushort v7, v[2:3], s[36:39], 0 addr64 offset:4 glc
1476 ; SI-NEXT: s_waitcnt vmcnt(0)
1477 ; SI-NEXT: buffer_load_ushort v4, v[2:3], s[36:39], 0 addr64 offset:6 glc
1478 ; SI-NEXT: s_waitcnt vmcnt(0)
1479 ; SI-NEXT: buffer_load_ushort v8, v[2:3], s[36:39], 0 addr64 offset:8 glc
1480 ; SI-NEXT: s_waitcnt vmcnt(0)
1481 ; SI-NEXT: buffer_load_ushort v9, v[2:3], s[36:39], 0 addr64 offset:10 glc
1482 ; SI-NEXT: s_waitcnt vmcnt(0)
1483 ; SI-NEXT: buffer_load_ushort v10, v[2:3], s[36:39], 0 addr64 offset:12 glc
1484 ; SI-NEXT: s_waitcnt vmcnt(0)
1485 ; SI-NEXT: buffer_load_ushort v11, v[2:3], s[36:39], 0 addr64 offset:14 glc
1486 ; SI-NEXT: s_waitcnt vmcnt(0)
1487 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:16 glc
1488 ; SI-NEXT: s_waitcnt vmcnt(0)
1489 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:18 glc
1490 ; SI-NEXT: s_waitcnt vmcnt(0)
1491 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:20 glc
1492 ; SI-NEXT: s_waitcnt vmcnt(0)
1493 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:22 glc
1494 ; SI-NEXT: s_waitcnt vmcnt(0)
1495 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:24 glc
1496 ; SI-NEXT: s_waitcnt vmcnt(0)
1497 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:26 glc
1498 ; SI-NEXT: s_waitcnt vmcnt(0)
1499 ; SI-NEXT: buffer_load_ushort v12, v[2:3], s[36:39], 0 addr64 offset:28 glc
1500 ; SI-NEXT: s_waitcnt vmcnt(0)
1501 ; SI-NEXT: buffer_load_ushort v2, v[2:3], s[36:39], 0 addr64 offset:30 glc
1502 ; SI-NEXT: s_waitcnt vmcnt(0)
1503 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11
1504 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9
1505 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4
1506 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v6
1507 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v11
1508 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v9
1509 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
1510 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
1511 ; SI-NEXT: v_or_b32_e32 v9, v10, v12
1512 ; SI-NEXT: v_or_b32_e32 v8, v8, v13
1513 ; SI-NEXT: v_or_b32_e32 v10, v7, v14
1514 ; SI-NEXT: v_or_b32_e32 v11, v5, v15
1515 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v9
1516 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v8
1517 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v10
1518 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v11
1519 ; SI-NEXT: s_mov_b64 vcc, exec
1520 ; SI-NEXT: s_cbranch_execz .LBB8_3
1521 ; SI-NEXT: s_branch .LBB8_4
1523 ; SI-NEXT: ; implicit-def: $vgpr9
1524 ; SI-NEXT: ; implicit-def: $vgpr6
1525 ; SI-NEXT: ; implicit-def: $vgpr8
1526 ; SI-NEXT: ; implicit-def: $vgpr4
1527 ; SI-NEXT: ; implicit-def: $vgpr7
1528 ; SI-NEXT: ; implicit-def: $vgpr3
1529 ; SI-NEXT: ; implicit-def: $vgpr5
1530 ; SI-NEXT: ; implicit-def: $vgpr2
1531 ; SI-NEXT: s_mov_b64 vcc, 0
1532 ; SI-NEXT: .LBB8_3: ; %T
1533 ; SI-NEXT: s_mov_b32 s39, 0xf000
1534 ; SI-NEXT: s_mov_b32 s36, s38
1535 ; SI-NEXT: s_mov_b32 s37, s38
1536 ; SI-NEXT: buffer_load_ushort v5, v[0:1], s[36:39], 0 addr64 glc
1537 ; SI-NEXT: s_waitcnt vmcnt(0)
1538 ; SI-NEXT: buffer_load_ushort v6, v[0:1], s[36:39], 0 addr64 offset:2 glc
1539 ; SI-NEXT: s_waitcnt vmcnt(0)
1540 ; SI-NEXT: buffer_load_ushort v7, v[0:1], s[36:39], 0 addr64 offset:4 glc
1541 ; SI-NEXT: s_waitcnt vmcnt(0)
1542 ; SI-NEXT: buffer_load_ushort v4, v[0:1], s[36:39], 0 addr64 offset:6 glc
1543 ; SI-NEXT: s_waitcnt vmcnt(0)
1544 ; SI-NEXT: buffer_load_ushort v8, v[0:1], s[36:39], 0 addr64 offset:8 glc
1545 ; SI-NEXT: s_waitcnt vmcnt(0)
1546 ; SI-NEXT: buffer_load_ushort v3, v[0:1], s[36:39], 0 addr64 offset:10 glc
1547 ; SI-NEXT: s_waitcnt vmcnt(0)
1548 ; SI-NEXT: buffer_load_ushort v9, v[0:1], s[36:39], 0 addr64 offset:12 glc
1549 ; SI-NEXT: s_waitcnt vmcnt(0)
1550 ; SI-NEXT: buffer_load_ushort v2, v[0:1], s[36:39], 0 addr64 offset:14 glc
1551 ; SI-NEXT: s_waitcnt vmcnt(0)
1552 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:16 glc
1553 ; SI-NEXT: s_waitcnt vmcnt(0)
1554 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:18 glc
1555 ; SI-NEXT: s_waitcnt vmcnt(0)
1556 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:20 glc
1557 ; SI-NEXT: s_waitcnt vmcnt(0)
1558 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:22 glc
1559 ; SI-NEXT: s_waitcnt vmcnt(0)
1560 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:24 glc
1561 ; SI-NEXT: s_waitcnt vmcnt(0)
1562 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:26 glc
1563 ; SI-NEXT: s_waitcnt vmcnt(0)
1564 ; SI-NEXT: buffer_load_ushort v10, v[0:1], s[36:39], 0 addr64 offset:28 glc
1565 ; SI-NEXT: s_waitcnt vmcnt(0)
1566 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[36:39], 0 addr64 offset:30 glc
1567 ; SI-NEXT: s_waitcnt vmcnt(0)
1568 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
1569 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
1570 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4
1571 ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6
1572 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
1573 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
1574 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
1575 ; SI-NEXT: v_or_b32_e32 v0, v9, v0
1576 ; SI-NEXT: v_or_b32_e32 v1, v8, v1
1577 ; SI-NEXT: v_or_b32_e32 v8, v7, v10
1578 ; SI-NEXT: v_or_b32_e32 v9, v5, v11
1579 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0
1580 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
1581 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
1582 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
1583 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
1584 ; SI-NEXT: .LBB8_4: ; %exit
1585 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v9
1586 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6
1587 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v8
1588 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
1589 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
1590 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
1591 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
1592 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
1593 ; SI-NEXT: v_mov_b32_e32 v8, 0x3fa00000
1594 ; SI-NEXT: v_mov_b32_e32 v9, 0x3f200000
1595 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
1596 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
1597 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
1598 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
1599 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
1600 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v3
1601 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v5
1602 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v2
1603 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v0
1604 ; SI-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc
1605 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v1
1606 ; SI-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
1607 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v6
1608 ; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
1609 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v4
1610 ; SI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
1611 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v7
1612 ; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
1613 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v10
1614 ; SI-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
1615 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v11
1616 ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc
1617 ; SI-NEXT: v_cmp_nge_f32_e32 vcc, 0.5, v12
1618 ; SI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc
1619 ; SI-NEXT: s_setpc_b64 s[30:31]
1621 ; GFX9-LABEL: vec_16xf16_extract_8xf16_0:
1623 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1624 ; GFX9-NEXT: buffer_load_ubyte v4, off, s[0:3], s32
1625 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1626 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
1627 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
1628 ; GFX9-NEXT: s_and_b64 s[34:35], vcc, exec
1629 ; GFX9-NEXT: s_cbranch_scc0 .LBB8_2
1630 ; GFX9-NEXT: ; %bb.1: ; %F
1631 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 glc
1632 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1633 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[2:3], off glc
1634 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1635 ; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
1636 ; GFX9-NEXT: s_cbranch_execz .LBB8_3
1637 ; GFX9-NEXT: s_branch .LBB8_4
1638 ; GFX9-NEXT: .LBB8_2:
1639 ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
1640 ; GFX9-NEXT: .LBB8_3: ; %T
1641 ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
1642 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1643 ; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off glc
1644 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1645 ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
1646 ; GFX9-NEXT: .LBB8_4: ; %exit
1647 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3800
1648 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x3900
1649 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3d00
1650 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1651 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v7
1652 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
1653 ; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD
1654 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc
1655 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v6
1656 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v2, vcc
1657 ; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v6, v0 src0_sel:WORD_1 src1_sel:DWORD
1658 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v2, vcc
1659 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5
1660 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v2, vcc
1661 ; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v5, v0 src0_sel:WORD_1 src1_sel:DWORD
1662 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc
1663 ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
1664 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v2, vcc
1665 ; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
1666 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
1667 ; GFX9-NEXT: v_pack_b32_f16 v0, v10, v0
1668 ; GFX9-NEXT: v_pack_b32_f16 v1, v9, v5
1669 ; GFX9-NEXT: v_pack_b32_f16 v2, v8, v6
1670 ; GFX9-NEXT: v_pack_b32_f16 v3, v3, v7
1671 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1673 ; GFX11-LABEL: vec_16xf16_extract_8xf16_0:
1675 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1676 ; GFX11-NEXT: scratch_load_u8 v4, off, s32
1677 ; GFX11-NEXT: s_mov_b32 s0, 0
1678 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1679 ; GFX11-NEXT: v_and_b32_e32 v4, 1, v4
1680 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1681 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4
1682 ; GFX11-NEXT: s_and_b32 s1, vcc_lo, exec_lo
1683 ; GFX11-NEXT: s_cbranch_scc0 .LBB8_2
1684 ; GFX11-NEXT: ; %bb.1: ; %F
1685 ; GFX11-NEXT: global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
1686 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1687 ; GFX11-NEXT: global_load_b128 v[2:5], v[2:3], off glc dlc
1688 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1689 ; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
1690 ; GFX11-NEXT: s_cbranch_vccz .LBB8_3
1691 ; GFX11-NEXT: s_branch .LBB8_4
1692 ; GFX11-NEXT: .LBB8_2:
1693 ; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
1694 ; GFX11-NEXT: .LBB8_3: ; %T
1695 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
1696 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1697 ; GFX11-NEXT: global_load_b128 v[2:5], v[0:1], off glc dlc
1698 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1699 ; GFX11-NEXT: .LBB8_4: ; %exit
1700 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
1701 ; GFX11-NEXT: v_mov_b32_e32 v9, 0x3900
1702 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x3d00
1703 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4
1704 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5
1705 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v2
1706 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3
1707 ; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
1708 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v4
1709 ; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
1710 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v7
1711 ; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
1712 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
1713 ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
1714 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
1715 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
1716 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v0
1717 ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
1718 ; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, 0.5, v6
1719 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1720 ; GFX11-NEXT: v_pack_b32_f16 v0, v2, v0
1721 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
1722 ; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, 0.5, v8
1723 ; GFX11-NEXT: v_pack_b32_f16 v2, v4, v7
1724 ; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1
1725 ; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
1726 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1727 ; GFX11-NEXT: v_pack_b32_f16 v3, v5, v6
1728 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1729 br i1 %cond, label %T, label %F
1732 %t = load volatile <16 x half>, ptr addrspace(1) %p0
1736 %f = load volatile <16 x half>, ptr addrspace(1) %p1
1740 %m = phi <16 x half> [ %t, %T ], [ %f, %F ]
1741 %v2 = shufflevector <16 x half> %m, <16 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1742 %b2 = fcmp ugt <8 x half> %v2, <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>
1743 %r2 = select <8 x i1> %b2, <8 x half> <half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900, half 0xH3900>, <8 x half> <half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00, half 0xH3D00>