1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; Builds a <2 x half> from two addrspace(5) loads (private, per the test name)
; at constant addresses: lane 0 comes from element 1 of a null base, lane 1
; from element 0. The checks expect a plain 16-bit load at offset:2 followed
; by a d16_hi load that writes the same destination register.
3 ; GCN-LABEL: {{^}}chain_hi_to_lo_private:
4 ; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
5 ; GCN-NEXT: s_waitcnt vmcnt(0)
6 ; GCN-NEXT: buffer_load_short_d16_hi [[DST]], off, [[RSRC]], [[SOFF]]
7 define <2 x half> @chain_hi_to_lo_private() {
9 %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 ; element 1 of a 2-byte type: byte offset 2
10 %load_lo = load half, half addrspace(5)* %gep_lo
11 %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0 ; element 0: byte offset 0
12 %load_hi = load half, half addrspace(5)* %gep_hi
14 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
15 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
17 ret <2 x half> %result
; Same lane layout as the constant-address private test, but the two
; addrspace(5) pointers are unrelated function arguments. The checks expect
; both loads to use a VGPR address (offen) and the d16_hi load to target the
; register written by the low-half load.
20 ; GCN-LABEL: {{^}}chain_hi_to_lo_private_different_bases:
21 ; GCN: buffer_load_ushort [[DST:v[0-9]+]], v{{[0-9]+}}, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offen
22 ; GCN-NEXT: s_waitcnt vmcnt(0)
23 ; GCN-NEXT: buffer_load_short_d16_hi [[DST]], v{{[0-9]+}}, [[RSRC]], [[SOFF]] offen
24 define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
26 %load_lo = load half, half addrspace(5)* %base_lo
27 %load_hi = load half, half addrspace(5)* %base_hi
29 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
30 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
32 ret <2 x half> %result
; The low lane is produced by arithmetic (fadd of %in and 1.0) rather than a
; load; only the high lane is loaded, from the addrspace(5) pointer %base.
; The checks expect the add result register to be reused as the destination
; of the d16_hi load on the very next line.
35 ; GCN-LABEL: {{^}}chain_hi_to_lo_arithmatic:
36 ; GCN: v_add_f16_e32 [[DST:v[0-9]+]], 1.0, v{{[0-9]+}}
37 ; GCN-NEXT: buffer_load_short_d16_hi [[DST]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
38 define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
40 %arith_lo = fadd half %in, 1.0
41 %load_hi = load half, half addrspace(5)* %base
43 %temp = insertelement <2 x half> undef, half %arith_lo, i32 0 ; low lane: computed value
44 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane: loaded value
46 ret <2 x half> %result
; addrspace(3) variant (group, per the test name) of the constant-address
; test: lane 0 is loaded from element 1 of a null base, lane 1 from element 0.
; The checks expect a ds_read_u16 at offset:2 and then a d16_hi read into the
; same register from the same address register.
49 ; GCN-LABEL: {{^}}chain_hi_to_lo_group:
50 ; GCN: ds_read_u16 [[DST:v[0-9]+]], [[ADDR:v[0-9]+]] offset:2
51 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
52 ; GCN-NEXT: ds_read_u16_d16_hi [[DST]], [[ADDR]]
53 define <2 x half> @chain_hi_to_lo_group() {
55 %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1 ; element 1: byte offset 2
56 %load_lo = load half, half addrspace(3)* %gep_lo
57 %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0 ; element 0: byte offset 0
58 %load_hi = load half, half addrspace(3)* %gep_hi
60 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
61 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
63 ret <2 x half> %result
; addrspace(3) loads through two unrelated pointer arguments. The checks
; expect the low-half read first, a wait, and then the d16_hi read writing
; the same destination register.
66 ; GCN-LABEL: {{^}}chain_hi_to_lo_group_different_bases:
67 ; GCN: ds_read_u16 [[DST:v[0-9]+]], v{{[0-9]+}}
68 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
69 ; GCN-NEXT: ds_read_u16_d16_hi [[DST]], v{{[0-9]+}}
70 define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
72 %load_lo = load half, half addrspace(3)* %base_lo
73 %load_hi = load half, half addrspace(3)* %base_hi
75 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
76 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
78 ret <2 x half> %result
; addrspace(1) variant (global, per the test name) with constant addresses:
; lane 0 from element 1 of a null base, lane 1 from element 0. The checks only
; require the low-half load to precede a d16_hi load into the same register;
; the lines in between are not constrained.
81 ; GCN-LABEL: {{^}}chain_hi_to_lo_global:
82 ; GCN: global_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off
83 ; GCN: global_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}, off
84 define <2 x half> @chain_hi_to_lo_global() {
86 %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1 ; element 1: byte offset 2
87 %load_lo = load half, half addrspace(1)* %gep_lo
88 %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0 ; element 0: byte offset 0
89 %load_hi = load half, half addrspace(1)* %gep_hi
91 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
92 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
94 ret <2 x half> %result
; addrspace(1) loads through two unrelated pointer arguments. The checks
; expect the low-half load, a vmcnt wait, and then the d16_hi load into the
; same destination register on consecutive lines.
97 ; GCN-LABEL: {{^}}chain_hi_to_lo_global_different_bases:
98 ; GCN: global_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off
99 ; GCN-NEXT: s_waitcnt vmcnt(0)
100 ; GCN-NEXT: global_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}, off
101 define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
103 %load_lo = load half, half addrspace(1)* %base_lo
104 %load_hi = load half, half addrspace(1)* %base_hi
106 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
107 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
109 ret <2 x half> %result
; Default address space variant (flat, per the test name) with constant
; addresses: lane 0 from element 1 of a null base, lane 1 from element 0.
; The checks only require the low-half load to precede a d16_hi load into the
; same register.
112 ; GCN-LABEL: {{^}}chain_hi_to_lo_flat:
113 ; GCN: flat_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
114 ; GCN: flat_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}
115 define <2 x half> @chain_hi_to_lo_flat() {
117 %gep_lo = getelementptr inbounds half, half* null, i64 1 ; element 1: byte offset 2
118 %load_lo = load half, half* %gep_lo
119 %gep_hi = getelementptr inbounds half, half* null, i64 0 ; element 0: byte offset 0
120 %load_hi = load half, half* %gep_hi
122 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
123 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
125 ret <2 x half> %result
; Default address space loads through two unrelated pointer arguments. The
; checks expect the low-half load, a wait on both vmcnt and lgkmcnt, and then
; the d16_hi load into the same destination register on consecutive lines.
128 ; GCN-LABEL: {{^}}chain_hi_to_lo_flat_different_bases:
129 ; GCN: flat_load_ushort [[DST:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
130 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
131 ; GCN-NEXT: flat_load_short_d16_hi [[DST]], v{{\[[0-9]+:[0-9]+\]}}
132 define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
134 %load_lo = load half, half* %base_lo
135 %load_hi = load half, half* %base_hi
137 %temp = insertelement <2 x half> undef, half %load_lo, i32 0 ; low lane
138 %result = insertelement <2 x half> %temp, half %load_hi, i32 1 ; high lane
140 ret <2 x half> %result
143 ; Make sure we don't lose any of the private stores.
; The kernel copies three i16 values from %in into a 3-element addrspace(5)
; alloca with volatile stores, then reloads overlapping <2 x i16> pairs
; (elements 0-1 and elements 1-2) and writes them to %out[0] and %out[1].
; The checks expect three 16-bit stores at offsets 4, 6 and 8, followed by
; loads at the same three offsets, the last one being a d16_hi load.
; NOTE(review): the function terminator and the #0 attribute group are not
; visible in this excerpt.
144 ; GCN-LABEL: {{^}}vload2_private:
145 ; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
146 ; GCN: buffer_store_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
147 ; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
149 ; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
150 ; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
151 ; GCN: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
152 define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
154 %loc = alloca [3 x i16], align 2, addrspace(5)
155 %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* ; not used by any visible instruction
156 %tmp = load i16, i16 addrspace(1)* %in, align 2
157 %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
158 store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx ; loc[0] = in[0]
159 %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
160 %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
161 %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
162 store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3 ; loc[1] = in[1]
163 %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
164 %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
165 %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
166 store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx ; loc[2] = in[2]
167 %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
168 %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2 ; reads loc[0..1]
169 store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
170 %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
171 %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
172 %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2 ; reads loc[1..2], overlapping the previous vector load
173 %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
174 store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
175 %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* ; not used by any visible instruction
179 ; There is another instruction between the misordered instruction and
180 ; the value dependent load, so a simple operand check is insufficient.
; Here the high lane is inserted first, a vector add of 12 is applied to both
; lanes, and only then is lane 0 overwritten with the low-half load. The
; checks expect the d16_hi read, the packed subtract of -12, and then a d16
; (low) read at offset:2 into the same register.
181 ; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep:
182 ; GFX900: ds_read_u16_d16_hi v1, v0
183 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
184 ; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
185 ; GFX900-NEXT: ds_read_u16_d16 v1, v0 offset:2
186 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
187 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
188 ; GFX900-NEXT: s_setpc_b64
189 define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
191 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 ; element 1: byte offset 2
192 %load_lo = load i16, i16 addrspace(3)* %gep_lo
193 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
194 %load_hi = load i16, i16 addrspace(3)* %gep_hi
195 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 ; high lane first
196 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> ; intervening use of the partially built vector
197 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 ; low lane replaces lane 0 of the sum
198 ret <2 x i16> %result
201 ; The volatile operations aren't put on the same chain
; Same dataflow as the non-volatile group test, but both loads are volatile.
; The checks expect a plain 16-bit read for the low half, a d16_hi read for
; the high half, and a v_bfi_b32 with a 0xffff mask to merge the low half
; into the result of the packed subtract.
202 ; GCN-LABEL: {{^}}chain_hi_to_lo_group_other_dep_multi_chain:
203 ; GFX900: ds_read_u16 v1, v0 offset:2
204 ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
205 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
206 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
207 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
208 ; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v1, v0
209 ; GFX900-NEXT: s_setpc_b64
210 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
212 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 ; element 1: byte offset 2
213 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
214 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
215 %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
216 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 ; high lane first
217 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> ; intervening use of the partially built vector
218 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 ; low lane replaces lane 0 of the sum
219 ret <2 x i16> %result
; addrspace(5) version of the "other dependency" pattern with non-volatile
; loads: the high lane is inserted, 12 is added to both lanes, then lane 0 is
; overwritten by the low-half load. The checks expect a d16_hi load, the
; packed subtract of -12, and a d16 (low) load at offset:2 into the same
; register.
222 ; GCN-LABEL: {{^}}chain_hi_to_lo_private_other_dep:
223 ; GFX900: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen
224 ; GFX900-NEXT: s_waitcnt vmcnt(0)
225 ; GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
226 ; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2
227 ; GFX900-NEXT: s_waitcnt vmcnt(0)
228 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
229 ; GFX900-NEXT: s_setpc_b64
230 define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
232 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1 ; element 1: byte offset 2
233 %load_lo = load i16, i16 addrspace(5)* %gep_lo
234 %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
235 %load_hi = load i16, i16 addrspace(5)* %gep_hi
236 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 ; high lane first
237 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> ; intervening use of the partially built vector
238 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 ; low lane replaces lane 0 of the sum
239 ret <2 x i16> %result
; addrspace(1) version of the "other dependency" pattern with volatile loads.
; The checks expect a plain 16-bit load at offset:2 for the low half, a d16_hi
; load for the high half, and a v_bfi_b32 with a 0xffff mask to merge the low
; half into the result of the packed subtract.
242 ; GCN-LABEL: {{^}}chain_hi_to_lo_global_other_dep:
243 ; GFX900: global_load_ushort v2, v[0:1], off offset:2
244 ; GFX900-NEXT: global_load_short_d16_hi v0, v[0:1], off
245 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
246 ; GFX900-NEXT: s_waitcnt vmcnt(0)
247 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
248 ; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
249 ; GFX900-NEXT: s_setpc_b64
250 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
252 %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1 ; element 1: byte offset 2
253 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
254 %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
255 %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
256 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 ; high lane first
257 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> ; intervening use of the partially built vector
258 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 ; low lane replaces lane 0 of the sum
259 ret <2 x i16> %result
; Default address space (addrspace(0)) version of the "other dependency"
; pattern with volatile loads. The checks expect a plain 16-bit load at
; offset:2 for the low half, a d16_hi load for the high half, and a v_bfi_b32
; that merges the low half into the result of the packed subtract.
; The 0xffff mask register is captured as [[MASK]] and reused in the
; v_bfi_b32 check, so the test does not depend on which VGPR the register
; allocator picks for the mask.
262 ; GCN-LABEL: {{^}}chain_hi_to_lo_flat_other_dep:
263 ; GFX900: flat_load_ushort v2, v[0:1] offset:2
264 ; GFX900-NEXT: flat_load_short_d16_hi v0, v[0:1]
265 ; GFX900-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
266 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
267 ; GFX900-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
268 ; GFX900-NEXT: v_bfi_b32 v0, [[MASK]], v2, v0
269 ; GFX900-NEXT: s_setpc_b64
270 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
272 %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1 ; element 1: byte offset 2
273 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
274 %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
275 %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
276 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 ; high lane first
277 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12> ; intervening use of the partially built vector
278 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0 ; low lane replaces lane 0 of the sum
279 ret <2 x i16> %result
; A store of 123 (0x7b) through %may.alias sits between the high-half load
; and the low-half load. The checks expect both halves to be read with plain
; ds_read_u16, with the ds_write_b16 between them, and the two halves to be
; combined afterwards with v_and_b32 / v_lshl_or_b32 rather than a d16 load.
282 ; GCN-LABEL: {{^}}chain_hi_to_lo_group_may_alias_store:
283 ; GFX900: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b
284 ; GFX900-NEXT: ds_read_u16 v2, v0
285 ; GFX900-NEXT: ds_write_b16 v1, [[K]]
286 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:2
287 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
288 ; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0
289 ; GFX900-NEXT: v_lshl_or_b32 v0, v2, 16, v0
290 ; GFX900-NEXT: s_setpc_b64
291 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
293 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1 ; element 1: byte offset 2
294 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
295 %load_hi = load i16, i16 addrspace(3)* %gep_hi ; high half is read before the store
296 store i16 123, i16 addrspace(3)* %may.alias ; 123 == 0x7b, the constant materialized in the first check
297 %load_lo = load i16, i16 addrspace(3)* %gep_lo ; low half is read after the store
299 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1 ; high lane
300 %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0 ; low lane
301 ret <2 x i16> %result