1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX900 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX10,GFX10_DEFAULT %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -mattr=-unaligned-access-mode -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GFX10,FLATSCR_GFX10 %s
; Builds a <2 x half> from two scalar loads through addrspace(5) pointers at
; constant addresses: element 0 (low) comes from null + 1 element (byte
; offset 2) and element 1 (high) comes from null + 0.
; The checks below expect the low half to be fetched with a plain 16-bit load
; and the high half to be merged into the same VGPR by a *_d16_hi load that is
; issued after the low load has completed (a vmcnt(0) wait sits in between).
; The gfx900/gfx1010 default runs use buffer_load_*; the flat-scratch runs use
; scratch_load_* with the address materialised in s0.
7 define <2 x half> @chain_hi_to_lo_private() {
8 ; GFX900-LABEL: chain_hi_to_lo_private:
9 ; GFX900: ; %bb.0: ; %bb
10 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
12 ; GFX900-NEXT: s_waitcnt vmcnt(0)
13 ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0
14 ; GFX900-NEXT: s_waitcnt vmcnt(0)
15 ; GFX900-NEXT: s_setpc_b64 s[30:31]
17 ; FLATSCR-LABEL: chain_hi_to_lo_private:
18 ; FLATSCR: ; %bb.0: ; %bb
19 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20 ; FLATSCR-NEXT: s_mov_b32 s0, 2
21 ; FLATSCR-NEXT: scratch_load_ushort v0, off, s0
22 ; FLATSCR-NEXT: s_mov_b32 s0, 0
23 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
24 ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0
25 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
26 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
28 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private:
29 ; GFX10_DEFAULT: ; %bb.0: ; %bb
30 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
32 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2
33 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
34 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0
35 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
36 ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31]
38 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private:
39 ; FLATSCR_GFX10: ; %bb.0: ; %bb
40 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
42 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 2
43 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0
44 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
45 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0
46 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
47 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0
48 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
49 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
; Low element: half at element index 1 (the higher address).
51 %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
52 %load_lo = load half, half addrspace(5)* %gep_lo
; High element: half at element index 0 (the lower address).
53 %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
54 %load_hi = load half, half addrspace(5)* %gep_hi
; Pack {lo, hi} into the result vector; lane 0 is written first, lane 1 second.
56 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
57 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
59 ret <2 x half> %result
; Same packing pattern as the constant-address private test, but the low and
; high halves are loaded through two unrelated addrspace(5) pointer arguments
; (%base_lo and %base_hi), so the two addresses share no common base.
; Expected output in every configuration: a 16-bit load addressed by v0 into
; v0, a vmcnt(0) wait, then a *_d16_hi load addressed by v1 into the same v0.
62 define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
63 ; GFX900-LABEL: chain_hi_to_lo_private_different_bases:
64 ; GFX900: ; %bb.0: ; %bb
65 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66 ; GFX900-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
67 ; GFX900-NEXT: s_waitcnt vmcnt(0)
68 ; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
69 ; GFX900-NEXT: s_waitcnt vmcnt(0)
70 ; GFX900-NEXT: s_setpc_b64 s[30:31]
72 ; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases:
73 ; FLATSCR: ; %bb.0: ; %bb
74 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75 ; FLATSCR-NEXT: scratch_load_ushort v0, v0, off
76 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
77 ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off
78 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
79 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
81 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_different_bases:
82 ; GFX10_DEFAULT: ; %bb.0: ; %bb
83 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
85 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen
86 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
87 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen
88 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
89 ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31]
91 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_different_bases:
92 ; FLATSCR_GFX10: ; %bb.0: ; %bb
93 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
95 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off
96 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
97 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off
98 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
99 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
; One scalar load per argument pointer; no address arithmetic is involved.
101 %load_lo = load half, half addrspace(5)* %base_lo
102 %load_hi = load half, half addrspace(5)* %base_hi
; %load_lo goes to lane 0 (low 16 bits), %load_hi to lane 1 (high 16 bits).
104 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
105 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
107 ret <2 x half> %result
; The low lane is produced by arithmetic (fadd %in, 1.0) instead of a load;
; only the high lane is loaded, through the addrspace(5) pointer %base.
; Expected output: v_add_f16 writes the low half of v1, a *_d16_hi load then
; fills the high half of that same v1, and the result is copied to v0.
; The spelling "arithmatic" is part of the symbol name and of the LABEL checks,
; so it is left as is.
110 define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
111 ; GFX900-LABEL: chain_hi_to_lo_arithmatic:
112 ; GFX900: ; %bb.0: ; %bb
113 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114 ; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1
115 ; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
116 ; GFX900-NEXT: s_waitcnt vmcnt(0)
117 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
118 ; GFX900-NEXT: s_setpc_b64 s[30:31]
120 ; FLATSCR-LABEL: chain_hi_to_lo_arithmatic:
121 ; FLATSCR: ; %bb.0: ; %bb
122 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123 ; FLATSCR-NEXT: v_add_f16_e32 v1, 1.0, v1
124 ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
125 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
126 ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
127 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
129 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_arithmatic:
130 ; GFX10_DEFAULT: ; %bb.0: ; %bb
131 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
133 ; GFX10_DEFAULT-NEXT: v_add_f16_e32 v1, 1.0, v1
134 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
135 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
136 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1
137 ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31]
139 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_arithmatic:
140 ; FLATSCR_GFX10: ; %bb.0: ; %bb
141 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
143 ; FLATSCR_GFX10-NEXT: v_add_f16_e32 v1, 1.0, v1
144 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off
145 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
146 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
147 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
; Low lane value: computed, not loaded.
149 %arith_lo = fadd half %in, 1.0
; High lane value: the only memory access in this function.
150 %load_hi = load half, half addrspace(5)* %base
152 %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
153 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
155 ret <2 x half> %result
; addrspace(3) variant of the constant-address test: the low lane is loaded
; from null + 1 element (byte offset 2), the high lane from null + 0.
; The checks expect DS instructions: ds_read_u16 with offset:2 for the low
; half, an lgkmcnt(0) wait, then ds_read_u16_d16_hi into the same v0, with a
; zero base address held in v1. Both gfx900 runs share one check prefix and
; both gfx1010 runs share another, so flat-scratch makes no difference here.
158 define <2 x half> @chain_hi_to_lo_group() {
159 ; GCN-LABEL: chain_hi_to_lo_group:
160 ; GCN: ; %bb.0: ; %bb
161 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162 ; GCN-NEXT: v_mov_b32_e32 v1, 0
163 ; GCN-NEXT: ds_read_u16 v0, v1 offset:2
164 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
165 ; GCN-NEXT: ds_read_u16_d16_hi v0, v1
166 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
167 ; GCN-NEXT: s_setpc_b64 s[30:31]
169 ; GFX10-LABEL: chain_hi_to_lo_group:
170 ; GFX10: ; %bb.0: ; %bb
171 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
173 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
174 ; GFX10-NEXT: ds_read_u16 v0, v1 offset:2
175 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
176 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1
177 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
178 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Low lane from element index 1, high lane from element index 0.
180 %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
181 %load_lo = load half, half addrspace(3)* %gep_lo
182 %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
183 %load_hi = load half, half addrspace(3)* %gep_hi
185 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
186 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
188 ret <2 x half> %result
; addrspace(3) variant with two unrelated pointer arguments: the low lane is
; loaded through %base_lo and the high lane through %base_hi.
; Expected output: ds_read_u16 addressed by v0 into v0, an lgkmcnt(0) wait,
; then ds_read_u16_d16_hi addressed by v1 into the same v0.
191 define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
192 ; GCN-LABEL: chain_hi_to_lo_group_different_bases:
193 ; GCN: ; %bb.0: ; %bb
194 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195 ; GCN-NEXT: ds_read_u16 v0, v0
196 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
197 ; GCN-NEXT: ds_read_u16_d16_hi v0, v1
198 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
199 ; GCN-NEXT: s_setpc_b64 s[30:31]
201 ; GFX10-LABEL: chain_hi_to_lo_group_different_bases:
202 ; GFX10: ; %bb.0: ; %bb
203 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
205 ; GFX10-NEXT: ds_read_u16 v0, v0
206 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v1
208 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; One scalar load per argument pointer; no address arithmetic is involved.
211 %load_lo = load half, half addrspace(3)* %base_lo
212 %load_hi = load half, half addrspace(3)* %base_hi
214 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
215 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
217 ret <2 x half> %result
; addrspace(1) variant of the constant-address test: the low lane is loaded
; from null + 1 element (byte offset 2), the high lane from null + 0.
; The checks expect each 64-bit address to be materialised in a VGPR pair with
; v_mov (2:0 for the low load, 0:0 for the high load), a global_load_ushort
; into v0, a vmcnt(0) wait, and then global_load_short_d16_hi into v0.
; The gfx900 and gfx1010 outputs differ only in which VGPR pair holds the
; second address and in the extra vscnt wait at function entry.
220 define <2 x half> @chain_hi_to_lo_global() {
221 ; GCN-LABEL: chain_hi_to_lo_global:
222 ; GCN: ; %bb.0: ; %bb
223 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224 ; GCN-NEXT: v_mov_b32_e32 v0, 2
225 ; GCN-NEXT: v_mov_b32_e32 v1, 0
226 ; GCN-NEXT: global_load_ushort v0, v[0:1], off
227 ; GCN-NEXT: v_mov_b32_e32 v2, 0
228 ; GCN-NEXT: v_mov_b32_e32 v3, 0
229 ; GCN-NEXT: s_waitcnt vmcnt(0)
230 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off
231 ; GCN-NEXT: s_waitcnt vmcnt(0)
232 ; GCN-NEXT: s_setpc_b64 s[30:31]
234 ; GFX10-LABEL: chain_hi_to_lo_global:
235 ; GFX10: ; %bb.0: ; %bb
236 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
237 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
238 ; GFX10-NEXT: v_mov_b32_e32 v0, 2
239 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
240 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
241 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
242 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
243 ; GFX10-NEXT: s_waitcnt vmcnt(0)
244 ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off
245 ; GFX10-NEXT: s_waitcnt vmcnt(0)
246 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Low lane from element index 1, high lane from element index 0.
248 %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
249 %load_lo = load half, half addrspace(1)* %gep_lo
250 %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
251 %load_hi = load half, half addrspace(1)* %gep_hi
253 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
254 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
256 ret <2 x half> %result
; addrspace(1) variant with two unrelated pointer arguments: the low lane is
; loaded through %base_lo (arriving in v[0:1]) and the high lane through
; %base_hi (arriving in v[2:3]).
; Expected output: global_load_ushort into v0, a vmcnt(0) wait, then
; global_load_short_d16_hi into the same v0.
259 define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
260 ; GCN-LABEL: chain_hi_to_lo_global_different_bases:
261 ; GCN: ; %bb.0: ; %bb
262 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263 ; GCN-NEXT: global_load_ushort v0, v[0:1], off
264 ; GCN-NEXT: s_waitcnt vmcnt(0)
265 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off
266 ; GCN-NEXT: s_waitcnt vmcnt(0)
267 ; GCN-NEXT: s_setpc_b64 s[30:31]
269 ; GFX10-LABEL: chain_hi_to_lo_global_different_bases:
270 ; GFX10: ; %bb.0: ; %bb
271 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
273 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off
274 ; GFX10-NEXT: s_waitcnt vmcnt(0)
275 ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off
276 ; GFX10-NEXT: s_waitcnt vmcnt(0)
277 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; One scalar load per argument pointer; no address arithmetic is involved.
279 %load_lo = load half, half addrspace(1)* %base_lo
280 %load_hi = load half, half addrspace(1)* %base_hi
282 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
283 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
285 ret <2 x half> %result
; Default-address-space (plain half*) variant of the constant-address test:
; the low lane is loaded from null + 1 element (byte offset 2), the high lane
; from null + 0.
; The checks expect flat_load_ushort into v0 followed by
; flat_load_short_d16_hi into the same v0; the wait between and after the two
; flat loads covers both vmcnt and lgkmcnt.
288 define <2 x half> @chain_hi_to_lo_flat() {
289 ; GCN-LABEL: chain_hi_to_lo_flat:
290 ; GCN: ; %bb.0: ; %bb
291 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
292 ; GCN-NEXT: v_mov_b32_e32 v0, 2
293 ; GCN-NEXT: v_mov_b32_e32 v1, 0
294 ; GCN-NEXT: flat_load_ushort v0, v[0:1]
295 ; GCN-NEXT: v_mov_b32_e32 v2, 0
296 ; GCN-NEXT: v_mov_b32_e32 v3, 0
297 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
298 ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3]
299 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
300 ; GCN-NEXT: s_setpc_b64 s[30:31]
302 ; GFX10-LABEL: chain_hi_to_lo_flat:
303 ; GFX10: ; %bb.0: ; %bb
304 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
306 ; GFX10-NEXT: v_mov_b32_e32 v0, 2
307 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
308 ; GFX10-NEXT: flat_load_ushort v0, v[0:1]
309 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
310 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
311 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
312 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2]
313 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
314 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Low lane from element index 1, high lane from element index 0.
316 %gep_lo = getelementptr inbounds half, half* null, i64 1
317 %load_lo = load half, half* %gep_lo
318 %gep_hi = getelementptr inbounds half, half* null, i64 0
319 %load_hi = load half, half* %gep_hi
321 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
322 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
324 ret <2 x half> %result
; Default-address-space (plain half*) variant with two unrelated pointer
; arguments: the low lane is loaded through %base_lo (arriving in v[0:1]) and
; the high lane through %base_hi (arriving in v[2:3]).
; Expected output: flat_load_ushort into v0, a combined vmcnt/lgkmcnt wait,
; then flat_load_short_d16_hi into the same v0.
327 define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
328 ; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
329 ; GCN: ; %bb.0: ; %bb
330 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331 ; GCN-NEXT: flat_load_ushort v0, v[0:1]
332 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
333 ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3]
334 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
335 ; GCN-NEXT: s_setpc_b64 s[30:31]
337 ; GFX10-LABEL: chain_hi_to_lo_flat_different_bases:
338 ; GFX10: ; %bb.0: ; %bb
339 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
340 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
341 ; GFX10-NEXT: flat_load_ushort v0, v[0:1]
342 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
343 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[2:3]
344 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
345 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; One scalar load per argument pointer; no address arithmetic is involved.
347 %load_lo = load half, half* %base_lo
348 %load_hi = load half, half* %base_hi
350 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
351 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
353 ret <2 x half> %result
356 ; Make sure we don't lose any of the private stores.
; Kernel that copies three i16 values from %in into a private [3 x i16] alloca
; with volatile stores, then reads the alloca back as two overlapping
; <2 x i16> vectors (elements 0..1 and elements 1..2, both align 2) and writes
; them to %out[0] and %out[1].
; In every configuration the checks require all three 16-bit stores to scratch
; (offsets 4, 6 and 8) to be present before any of the reloads.
; Without flat-scratch the reloads are expected as 16-bit buffer loads whose
; last piece is a buffer_load_short_d16_hi at offset 8; with flat-scratch they
; are expected as two scratch_load_dword at offsets 4 and 6.
; The attribute group #0 is defined outside the lines shown here.
357 define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
358 ; GFX900-LABEL: vload2_private:
359 ; GFX900: ; %bb.0: ; %entry
360 ; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
361 ; GFX900-NEXT: v_mov_b32_e32 v2, 0
362 ; GFX900-NEXT: s_add_u32 s0, s0, s9
363 ; GFX900-NEXT: s_addc_u32 s1, s1, 0
364 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5]
366 ; GFX900-NEXT: s_waitcnt vmcnt(0)
367 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
368 ; GFX900-NEXT: s_waitcnt vmcnt(0)
369 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
370 ; GFX900-NEXT: s_waitcnt vmcnt(0)
371 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
372 ; GFX900-NEXT: s_waitcnt vmcnt(0)
373 ; GFX900-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
374 ; GFX900-NEXT: s_waitcnt vmcnt(0)
375 ; GFX900-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
376 ; GFX900-NEXT: s_waitcnt vmcnt(0)
377 ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4
378 ; GFX900-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:6
379 ; GFX900-NEXT: s_waitcnt vmcnt(1)
380 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
381 ; GFX900-NEXT: s_waitcnt vmcnt(0)
382 ; GFX900-NEXT: v_mov_b32_e32 v1, v3
383 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
384 ; GFX900-NEXT: v_lshl_or_b32 v0, v3, 16, v0
385 ; GFX900-NEXT: s_waitcnt vmcnt(0)
386 ; GFX900-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
387 ; GFX900-NEXT: s_endpgm
389 ; FLATSCR-LABEL: vload2_private:
390 ; FLATSCR: ; %bb.0: ; %entry
391 ; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
392 ; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
393 ; FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
394 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
395 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
396 ; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
397 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1]
398 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
399 ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:4
400 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
401 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
402 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
403 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
404 ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:6
405 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
406 ; FLATSCR-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
407 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
408 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
409 ; FLATSCR-NEXT: scratch_store_short off, v0, vcc_hi offset:8
410 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
411 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
412 ; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
413 ; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0
414 ; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_hi offset:6
415 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
416 ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
417 ; FLATSCR-NEXT: s_endpgm
419 ; GFX10_DEFAULT-LABEL: vload2_private:
420 ; GFX10_DEFAULT: ; %bb.0: ; %entry
421 ; GFX10_DEFAULT-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
422 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v2, 0
423 ; GFX10_DEFAULT-NEXT: s_add_u32 s0, s0, s9
424 ; GFX10_DEFAULT-NEXT: s_addc_u32 s1, s1, 0
425 ; GFX10_DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
426 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5]
427 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
428 ; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
429 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
430 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:2
431 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
432 ; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:6
433 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
434 ; GFX10_DEFAULT-NEXT: global_load_ushort v0, v2, s[4:5] offset:4
435 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
436 ; GFX10_DEFAULT-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:8
437 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
438 ; GFX10_DEFAULT-NEXT: s_clause 0x1
439 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:6
440 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
441 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(1)
442 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v1, v0
443 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
444 ; GFX10_DEFAULT-NEXT: v_and_b32_e32 v3, 0xffff, v3
445 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:8
446 ; GFX10_DEFAULT-NEXT: v_lshl_or_b32 v0, v0, 16, v3
447 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
448 ; GFX10_DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
449 ; GFX10_DEFAULT-NEXT: s_endpgm
451 ; FLATSCR_GFX10-LABEL: vload2_private:
452 ; FLATSCR_GFX10: ; %bb.0: ; %entry
453 ; FLATSCR_GFX10-NEXT: s_add_u32 s2, s2, s5
454 ; FLATSCR_GFX10-NEXT: s_addc_u32 s3, s3, 0
455 ; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
456 ; FLATSCR_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
457 ; FLATSCR_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
458 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v2, 0
459 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0
460 ; FLATSCR_GFX10-NEXT: s_waitcnt lgkmcnt(0)
461 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1]
462 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
463 ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:4
464 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
465 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:2
466 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
467 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0
468 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
469 ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:6
470 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
471 ; FLATSCR_GFX10-NEXT: global_load_ushort v0, v2, s[0:1] offset:4
472 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
473 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0
474 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
475 ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, vcc_lo offset:8
476 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
477 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
478 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0
479 ; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, vcc_lo offset:4
480 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3
481 ; FLATSCR_GFX10-NEXT: s_mov_b32 vcc_lo, 0
482 ; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, vcc_lo offset:6
483 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
484 ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
485 ; FLATSCR_GFX10-NEXT: s_endpgm
; Private scratch buffer holding the three copied i16 values.
487 %loc = alloca [3 x i16], align 2, addrspace(5)
; %loc.0.sroa_cast1 has no user in the lines shown here; presumably a leftover
; from the reduced original (assumption to confirm before removing it).
488 %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
; Copy in[0] -> loc[0]; the store is volatile so it cannot be dropped.
489 %tmp = load i16, i16 addrspace(1)* %in, align 2
490 %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
491 store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
; Copy in[1] -> loc[1].
492 %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
493 %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
494 %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
495 store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
; Copy in[2] -> loc[2].
496 %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
497 %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
498 %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
499 store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
; First vector reload: loc[0..1] as <2 x i16>, written to out[0].
500 %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
501 %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
502 store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
; Second vector reload: loc[1..2] as <2 x i16> (overlaps the first reload at
; loc[1]), written to out[1].
503 %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
504 %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
505 %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
506 %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
507 store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
; %loc.0.sroa_cast2 likewise has no user in the lines shown here.
508 %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
512 ; There is another instruction between the misordered instruction and
513 ; the value dependent load, so a simple operand check is insufficient.
; Here the high lane is loaded and placed into the vector first, a vector add
; of <12, 12> is applied, and only then is the separately loaded low lane
; inserted into lane 0, overwriting the low result of the add.
; Expected output: ds_read_u16_d16_hi into v1, an lgkmcnt(0) wait, the add
; (emitted as v_pk_sub_u16 with -12), and then ds_read_u16_d16 with offset:2
; writing the low half of the same v1, i.e. the low load is placed after the
; arithmetic whose result it merges into.
514 define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
515 ; GCN-LABEL: chain_hi_to_lo_group_other_dep:
516 ; GCN: ; %bb.0: ; %bb
517 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
518 ; GCN-NEXT: ds_read_u16_d16_hi v1, v0
519 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
520 ; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
521 ; GCN-NEXT: ds_read_u16_d16 v1, v0 offset:2
522 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
523 ; GCN-NEXT: v_mov_b32_e32 v0, v1
524 ; GCN-NEXT: s_setpc_b64 s[30:31]
526 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep:
527 ; GFX10: ; %bb.0: ; %bb
528 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
530 ; GFX10-NEXT: ds_read_u16_d16_hi v1, v0
531 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
532 ; GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
533 ; GFX10-NEXT: ds_read_u16_d16 v1, v0 offset:2
534 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
535 ; GFX10-NEXT: v_mov_b32_e32 v0, v1
536 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; In IR order the low load (element 1) comes before the high load (element 0).
538 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
539 %load_lo = load i16, i16 addrspace(3)* %gep_lo
540 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
541 %load_hi = load i16, i16 addrspace(3)* %gep_hi
; The add sits between the high-lane insert and the low-lane insert.
542 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
543 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
544 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
545 ret <2 x i16> %result
548 ; The volatile operations aren't put on the same chain
; Same dataflow as chain_hi_to_lo_group_other_dep, but both loads are
; volatile. The checks expect the low value to be read into its own register
; with a plain ds_read_u16 (issued first, matching IR order), the high value
; with ds_read_u16_d16_hi, and the low lane to be merged after the add with
; v_bfi_b32 using mask 0xffff, so no d16 low load is formed here.
549 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
550 ; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
551 ; GCN: ; %bb.0: ; %bb
552 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
553 ; GCN-NEXT: ds_read_u16 v1, v0 offset:2
554 ; GCN-NEXT: ds_read_u16_d16_hi v0, v0
555 ; GCN-NEXT: v_mov_b32_e32 v2, 0xffff
556 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
557 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
558 ; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0
559 ; GCN-NEXT: s_setpc_b64 s[30:31]
561 ; GFX10-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
562 ; GFX10: ; %bb.0: ; %bb
563 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
564 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
565 ; GFX10-NEXT: ds_read_u16 v1, v0 offset:2
566 ; GFX10-NEXT: ds_read_u16_d16_hi v0, v0
567 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
569 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
570 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Both loads are volatile; the low load (element 1) is first in IR order.
572 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
573 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
574 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
575 %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
; The add sits between the high-lane insert and the low-lane insert.
576 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
577 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
578 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
579 ret <2 x i16> %result
; addrspace(5) counterpart of chain_hi_to_lo_group_other_dep: the high lane is
; loaded and inserted, a vector add of <12, 12> is applied, and the low lane
; (loaded from element 1) is inserted afterwards. The loads are not volatile.
; Expected output in every configuration: a *_d16_hi load into v1, a vmcnt(0)
; wait, the add (emitted as v_pk_sub_u16 with -12), then a *_d16 load with
; offset:2 into the low half of the same v1, and a final copy to v0.
582 define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
583 ; GFX900-LABEL: chain_hi_to_lo_private_other_dep:
584 ; GFX900: ; %bb.0: ; %bb
585 ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
586 ; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
587 ; GFX900-NEXT: s_waitcnt vmcnt(0)
588 ; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
589 ; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
590 ; GFX900-NEXT: s_waitcnt vmcnt(0)
591 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
592 ; GFX900-NEXT: s_setpc_b64 s[30:31]
594 ; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
595 ; FLATSCR: ; %bb.0: ; %bb
596 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597 ; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
598 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
599 ; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
600 ; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2
601 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
602 ; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
603 ; FLATSCR-NEXT: s_setpc_b64 s[30:31]
605 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
606 ; GFX10_DEFAULT: ; %bb.0: ; %bb
607 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
608 ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0
609 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen
610 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
611 ; GFX10_DEFAULT-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
612 ; GFX10_DEFAULT-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
613 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0)
614 ; GFX10_DEFAULT-NEXT: v_mov_b32_e32 v0, v1
615 ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31]
617 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep:
618 ; FLATSCR_GFX10: ; %bb.0: ; %bb
619 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0
621 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off
622 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
623 ; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
624 ; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2
625 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
626 ; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
627 ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
; In IR order the low load (element 1) comes before the high load (element 0).
629 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
630 %load_lo = load i16, i16 addrspace(5)* %gep_lo
631 %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
632 %load_hi = load i16, i16 addrspace(5)* %gep_hi
; The add sits between the high-lane insert and the low-lane insert.
633 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
634 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
635 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
636 ret <2 x i16> %result
; addrspace(1) version of the "other dependency" pattern with volatile loads:
; the high lane is inserted, a vector add of <12, 12> is applied, and the low
; lane is inserted afterwards.
; The checks expect both loads to keep IR order (low first, then high) and to
; carry glc (plus dlc on gfx1010). The low value is loaded into its own
; register with global_load_ushort and merged after the add with v_bfi_b32
; using mask 0xffff; only the high load uses the d16_hi form.
639 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
640 ; GCN-LABEL: chain_hi_to_lo_global_other_dep:
641 ; GCN: ; %bb.0: ; %bb
642 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
643 ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc
644 ; GCN-NEXT: s_waitcnt vmcnt(0)
645 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off glc
646 ; GCN-NEXT: s_waitcnt vmcnt(0)
647 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
648 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
649 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
650 ; GCN-NEXT: s_setpc_b64 s[30:31]
652 ; GFX10-LABEL: chain_hi_to_lo_global_other_dep:
653 ; GFX10: ; %bb.0: ; %bb
654 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
655 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
656 ; GFX10-NEXT: global_load_ushort v2, v[0:1], off offset:2 glc dlc
657 ; GFX10-NEXT: s_waitcnt vmcnt(0)
658 ; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off glc dlc
659 ; GFX10-NEXT: s_waitcnt vmcnt(0)
660 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
661 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
662 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Both loads are volatile; the low load (element 1) is first in IR order.
664 %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
665 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
666 %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
667 %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
; The add sits between the high-lane insert and the low-lane insert.
668 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
669 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
670 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
671 ret <2 x i16> %result
; Flat (explicit addrspace(0)) version of the "other dependency" pattern with
; volatile loads: the high lane is inserted, a vector add of <12, 12> is
; applied, and the low lane is inserted afterwards.
; The checks expect both loads to keep IR order (low first, then high) and to
; carry glc (plus dlc on gfx1010). The low value is loaded into v2 with
; flat_load_ushort and merged after the add with v_bfi_b32 using mask 0xffff.
; On gfx900 the +2 byte offset is folded into the flat load (offset:2); on
; gfx1010 the checks show the address being computed with a 64-bit add
; (v_add_co_u32 / v_add_co_ci_u32) and the flat load carrying no offset.
674 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
675 ; GCN-LABEL: chain_hi_to_lo_flat_other_dep:
676 ; GCN: ; %bb.0: ; %bb
677 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
678 ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2 glc
679 ; GCN-NEXT: s_waitcnt vmcnt(0)
680 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1] glc
681 ; GCN-NEXT: s_waitcnt vmcnt(0)
682 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
683 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
684 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
685 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
686 ; GCN-NEXT: s_setpc_b64 s[30:31]
688 ; GFX10-LABEL: chain_hi_to_lo_flat_other_dep:
689 ; GFX10: ; %bb.0: ; %bb
690 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
691 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
692 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 2
693 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
694 ; GFX10-NEXT: flat_load_ushort v2, v[2:3] glc dlc
695 ; GFX10-NEXT: s_waitcnt vmcnt(0)
696 ; GFX10-NEXT: flat_load_short_d16_hi v0, v[0:1] glc dlc
697 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
698 ; GFX10-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
699 ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
700 ; GFX10-NEXT: s_setpc_b64 s[30:31]
; Both loads are volatile; the low load (element 1) is first in IR order.
702 %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
703 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
704 %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
705 %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
; The add sits between the high-lane insert and the low-lane insert.
706 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
707 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
708 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
709 ret <2 x i16> %result
; The high-lane load and the low-lane load are separated by a store of 123
; through a second addrspace(3) pointer (%may.alias) that could refer to the
; same memory as %ptr.
; The checks expect the memory operations to stay in IR order (ds_read_u16 of
; the high value, ds_write_b16 of 0x7b, ds_read_u16 of the low value at
; offset:2) and the two halves to be packed with v_and_b32 + v_lshl_or_b32.
; No d16 load appears in the expected output for this case.
712 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
713 ; GCN-LABEL: chain_hi_to_lo_group_may_alias_store:
714 ; GCN: ; %bb.0: ; %bb
715 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
716 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7b
717 ; GCN-NEXT: ds_read_u16 v2, v0
718 ; GCN-NEXT: ds_write_b16 v1, v3
719 ; GCN-NEXT: ds_read_u16 v0, v0 offset:2
720 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
721 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
722 ; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0
723 ; GCN-NEXT: s_setpc_b64 s[30:31]
725 ; GFX10-LABEL: chain_hi_to_lo_group_may_alias_store:
726 ; GFX10: ; %bb.0: ; %bb
727 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
729 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
730 ; GFX10-NEXT: ds_read_u16 v3, v0
731 ; GFX10-NEXT: ds_write_b16 v1, v2
732 ; GFX10-NEXT: ds_read_u16 v0, v0 offset:2
733 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
734 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
735 ; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
736 ; GFX10-NEXT: s_setpc_b64 s[30:31]
738 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
739 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
; Order matters here: high load, then the possibly aliasing store, then the
; low load.
740 %load_hi = load i16, i16 addrspace(3)* %gep_hi
741 store i16 123, i16 addrspace(3)* %may.alias
742 %load_lo = load i16, i16 addrspace(3)* %gep_lo
; High lane is inserted first, low lane second; no arithmetic in between.
744 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
745 %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
746 ret <2 x i16> %result