1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; Builds a <2 x half> from two scalar half loads in addrspace(5): lane 0 is
; loaded from null + 1 half element, lane 1 from null + 0.
; The GCN checks expect a buffer_load_ushort at offset:2 into v0 followed by a
; buffer_load_short_d16_hi into the same v0, with an s_waitcnt between them.
4 define <2 x half> @chain_hi_to_lo_private() {
5 ; GCN-LABEL: chain_hi_to_lo_private:
7 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8 ; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:2
9 ; GCN-NEXT: s_waitcnt vmcnt(0)
10 ; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s33
11 ; GCN-NEXT: s_waitcnt vmcnt(0)
12 ; GCN-NEXT: s_setpc_b64 s[30:31]
14 %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
15 %load_lo = load half, half addrspace(5)* %gep_lo
16 %gep_hi = getelementptr inbounds half, half addrspace(5)* null, i64 0
17 %load_hi = load half, half addrspace(5)* %gep_hi
19 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
20 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
22 ret <2 x half> %result
; Same lane layout as a lo/hi pair build, but the two addrspace(5) half loads
; use unrelated pointer arguments: %base_lo feeds lane 0, %base_hi feeds lane 1.
; The GCN checks expect buffer_load_ushort (offen, address in v0) followed by
; buffer_load_short_d16_hi (offen, address in v1), both writing v0.
25 define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) {
26 ; GCN-LABEL: chain_hi_to_lo_private_different_bases:
28 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29 ; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
30 ; GCN-NEXT: s_waitcnt vmcnt(0)
31 ; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], s33 offen
32 ; GCN-NEXT: s_waitcnt vmcnt(0)
33 ; GCN-NEXT: s_setpc_b64 s[30:31]
35 %load_lo = load half, half addrspace(5)* %base_lo
36 %load_hi = load half, half addrspace(5)* %base_hi
38 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
39 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
41 ret <2 x half> %result
; Lane 0 is an arithmetic result (%in + 1.0) rather than a load; lane 1 is a
; half loaded from the addrspace(5) pointer %base.
; The GCN checks expect v_add_f16 into v1, then buffer_load_short_d16_hi into
; that same v1, and finally a copy of v1 to v0.
; NOTE(review): "arithmatic" is a misspelling of "arithmetic"; the name is left
; unchanged because the GCN-LABEL check matches it literally.
44 define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) {
45 ; GCN-LABEL: chain_hi_to_lo_arithmatic:
47 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48 ; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1
49 ; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen
50 ; GCN-NEXT: s_waitcnt vmcnt(0)
51 ; GCN-NEXT: v_mov_b32_e32 v0, v1
52 ; GCN-NEXT: s_setpc_b64 s[30:31]
54 %arith_lo = fadd half %in, 1.0
55 %load_hi = load half, half addrspace(5)* %base
57 %temp = insertelement <2 x half> undef, half %arith_lo, i32 0
58 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
60 ret <2 x half> %result
; Builds a <2 x half> from two scalar half loads in addrspace(3): lane 0 is
; loaded from null + 1 half element, lane 1 from null + 0.
; The GCN checks expect ds_read_u16 at offset:2 into v0 followed by
; ds_read_u16_d16_hi into the same v0, both addressed through v1 = 0.
63 define <2 x half> @chain_hi_to_lo_group() {
64 ; GCN-LABEL: chain_hi_to_lo_group:
66 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67 ; GCN-NEXT: v_mov_b32_e32 v1, 0
68 ; GCN-NEXT: ds_read_u16 v0, v1 offset:2
69 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
70 ; GCN-NEXT: ds_read_u16_d16_hi v0, v1
71 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
72 ; GCN-NEXT: s_setpc_b64 s[30:31]
74 %gep_lo = getelementptr inbounds half, half addrspace(3)* null, i64 1
75 %load_lo = load half, half addrspace(3)* %gep_lo
76 %gep_hi = getelementptr inbounds half, half addrspace(3)* null, i64 0
77 %load_hi = load half, half addrspace(3)* %gep_hi
79 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
80 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
82 ret <2 x half> %result
; Two addrspace(3) half loads from unrelated pointer arguments: %base_lo feeds
; lane 0 and %base_hi feeds lane 1 of the returned <2 x half>.
; The GCN checks expect ds_read_u16 (address v0) followed by
; ds_read_u16_d16_hi (address v1), both writing v0.
85 define <2 x half> @chain_hi_to_lo_group_different_bases(half addrspace(3)* %base_lo, half addrspace(3)* %base_hi) {
86 ; GCN-LABEL: chain_hi_to_lo_group_different_bases:
88 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89 ; GCN-NEXT: ds_read_u16 v0, v0
90 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
91 ; GCN-NEXT: ds_read_u16_d16_hi v0, v1
92 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
93 ; GCN-NEXT: s_setpc_b64 s[30:31]
95 %load_lo = load half, half addrspace(3)* %base_lo
96 %load_hi = load half, half addrspace(3)* %base_hi
98 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
99 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
101 ret <2 x half> %result
; Builds a <2 x half> from two scalar half loads in addrspace(1): lane 0 is
; loaded from null + 1 half element, lane 1 from null + 0.
; The GCN checks expect the address 2 to be materialized in v[0:1] for a
; global_load_ushort into v0, then the address 0 in v[1:2] for a
; global_load_short_d16_hi into the same v0.
104 define <2 x half> @chain_hi_to_lo_global() {
105 ; GCN-LABEL: chain_hi_to_lo_global:
106 ; GCN: ; %bb.0: ; %bb
107 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108 ; GCN-NEXT: v_mov_b32_e32 v0, 2
109 ; GCN-NEXT: v_mov_b32_e32 v1, 0
110 ; GCN-NEXT: global_load_ushort v0, v[0:1], off
111 ; GCN-NEXT: v_mov_b32_e32 v1, 0
112 ; GCN-NEXT: v_mov_b32_e32 v2, 0
113 ; GCN-NEXT: s_waitcnt vmcnt(0)
114 ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off
115 ; GCN-NEXT: s_waitcnt vmcnt(0)
116 ; GCN-NEXT: s_setpc_b64 s[30:31]
118 %gep_lo = getelementptr inbounds half, half addrspace(1)* null, i64 1
119 %load_lo = load half, half addrspace(1)* %gep_lo
120 %gep_hi = getelementptr inbounds half, half addrspace(1)* null, i64 0
121 %load_hi = load half, half addrspace(1)* %gep_hi
123 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
124 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
126 ret <2 x half> %result
; Two addrspace(1) half loads from unrelated pointer arguments: %base_lo feeds
; lane 0 and %base_hi feeds lane 1 of the returned <2 x half>.
; The GCN checks expect global_load_ushort from v[0:1] followed by
; global_load_short_d16_hi from v[2:3], both writing v0.
129 define <2 x half> @chain_hi_to_lo_global_different_bases(half addrspace(1)* %base_lo, half addrspace(1)* %base_hi) {
130 ; GCN-LABEL: chain_hi_to_lo_global_different_bases:
131 ; GCN: ; %bb.0: ; %bb
132 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
133 ; GCN-NEXT: global_load_ushort v0, v[0:1], off
134 ; GCN-NEXT: s_waitcnt vmcnt(0)
135 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off
136 ; GCN-NEXT: s_waitcnt vmcnt(0)
137 ; GCN-NEXT: s_setpc_b64 s[30:31]
139 %load_lo = load half, half addrspace(1)* %base_lo
140 %load_hi = load half, half addrspace(1)* %base_hi
142 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
143 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
145 ret <2 x half> %result
; Builds a <2 x half> from two scalar half loads through default-address-space
; pointers (half*): lane 0 is loaded from null + 1 half element, lane 1 from
; null + 0.
; The GCN checks expect the address 2 in v[0:1] for a flat_load_ushort into v0,
; then the address 0 in v[1:2] for a flat_load_short_d16_hi into the same v0.
148 define <2 x half> @chain_hi_to_lo_flat() {
149 ; GCN-LABEL: chain_hi_to_lo_flat:
150 ; GCN: ; %bb.0: ; %bb
151 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
152 ; GCN-NEXT: v_mov_b32_e32 v0, 2
153 ; GCN-NEXT: v_mov_b32_e32 v1, 0
154 ; GCN-NEXT: flat_load_ushort v0, v[0:1]
155 ; GCN-NEXT: v_mov_b32_e32 v1, 0
156 ; GCN-NEXT: v_mov_b32_e32 v2, 0
157 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
158 ; GCN-NEXT: flat_load_short_d16_hi v0, v[1:2]
159 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
160 ; GCN-NEXT: s_setpc_b64 s[30:31]
162 %gep_lo = getelementptr inbounds half, half* null, i64 1
163 %load_lo = load half, half* %gep_lo
164 %gep_hi = getelementptr inbounds half, half* null, i64 0
165 %load_hi = load half, half* %gep_hi
167 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
168 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
170 ret <2 x half> %result
; Two half loads through unrelated default-address-space pointer arguments:
; %base_lo feeds lane 0 and %base_hi feeds lane 1 of the returned <2 x half>.
; The GCN checks expect flat_load_ushort from v[0:1] followed by
; flat_load_short_d16_hi from v[2:3], both writing v0.
173 define <2 x half> @chain_hi_to_lo_flat_different_bases(half* %base_lo, half* %base_hi) {
174 ; GCN-LABEL: chain_hi_to_lo_flat_different_bases:
175 ; GCN: ; %bb.0: ; %bb
176 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177 ; GCN-NEXT: flat_load_ushort v0, v[0:1]
178 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
179 ; GCN-NEXT: flat_load_short_d16_hi v0, v[2:3]
180 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
181 ; GCN-NEXT: s_setpc_b64 s[30:31]
183 %load_lo = load half, half* %base_lo
184 %load_hi = load half, half* %base_hi
186 %temp = insertelement <2 x half> undef, half %load_lo, i32 0
187 %result = insertelement <2 x half> %temp, half %load_hi, i32 1
189 ret <2 x half> %result
192 ; Make sure we don't lose any of the private stores.
; Kernel that copies three consecutive i16 values from %in into a 3-element
; addrspace(5) alloca using volatile stores, then reads the alloca back as two
; overlapping <2 x i16> vectors (starting at element 0 and at element 1) and
; writes them to %out[0] and %out[1].
; The GCN checks expect all three scratch stores to be emitted
; (buffer_store_short at offset:4, buffer_store_short_d16_hi at offset:6 and
; buffer_store_short at offset:8) before the scratch reloads.
193 define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
194 ; GCN-LABEL: vload2_private:
195 ; GCN: ; %bb.0: ; %entry
196 ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9
197 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0
198 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
199 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
200 ; GCN-NEXT: v_mov_b32_e32 v2, s4
201 ; GCN-NEXT: v_mov_b32_e32 v3, s5
202 ; GCN-NEXT: global_load_dword v4, v[2:3], off
203 ; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4
204 ; GCN-NEXT: v_mov_b32_e32 v0, s6
205 ; GCN-NEXT: v_mov_b32_e32 v1, s7
206 ; GCN-NEXT: s_waitcnt vmcnt(1)
207 ; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4
208 ; GCN-NEXT: buffer_store_short_d16_hi v4, off, s[0:3], s9 offset:6
209 ; GCN-NEXT: s_waitcnt vmcnt(2)
210 ; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8
211 ; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4
212 ; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6
213 ; GCN-NEXT: s_waitcnt vmcnt(1)
214 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
215 ; GCN-NEXT: s_waitcnt vmcnt(0)
216 ; GCN-NEXT: v_mov_b32_e32 v3, v4
217 ; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], s9 offset:8
218 ; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2
219 ; GCN-NEXT: s_waitcnt vmcnt(0)
220 ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
223 %loc = alloca [3 x i16], align 2, addrspace(5)
; NOTE(review): %loc.0.sroa_cast1 has no use in the lines shown; confirm
; whether it is a leftover before removing it.
224 %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
; Copy in[0], in[1] and in[2] into loc[0], loc[1] and loc[2] with volatile
; stores.
225 %tmp = load i16, i16 addrspace(1)* %in, align 2
226 %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
227 store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
228 %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
229 %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
230 %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
231 store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
232 %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
233 %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
234 %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
235 store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
; Read loc[0..1] as a <2 x i16> and store it to out[0].
236 %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
237 %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
238 store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
; Read loc[1..2] as a <2 x i16> (overlapping the previous vector by one
; element) and store it to out[1].
239 %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
240 %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
241 %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
242 %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
243 store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
; NOTE(review): %loc.0.sroa_cast2 also has no use in the lines shown.
244 %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
248 ; There is another instruction between the misordered instruction and
249 ; the value dependent load, so a simple operand check is insufficient.
; Loads two adjacent i16 values from the addrspace(3) pointer %ptr. The value at
; %ptr + 0 is inserted into lane 1, 12 is added to both lanes of that vector,
; and only afterwards is lane 0 overwritten with the value loaded from %ptr + 1.
; The GCN checks expect ds_read_u16_d16_hi first, then v_pk_sub_u16 with -12,
; then ds_read_u16_d16 at offset:2 into the same v1, which is copied to v0.
250 define <2 x i16> @chain_hi_to_lo_group_other_dep(i16 addrspace(3)* %ptr) {
251 ; GCN-LABEL: chain_hi_to_lo_group_other_dep:
252 ; GCN: ; %bb.0: ; %bb
253 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
254 ; GCN-NEXT: ds_read_u16_d16_hi v1, v0
255 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
256 ; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
257 ; GCN-NEXT: ds_read_u16_d16 v1, v0 offset:2
258 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
259 ; GCN-NEXT: v_mov_b32_e32 v0, v1
260 ; GCN-NEXT: s_setpc_b64 s[30:31]
262 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
263 %load_lo = load i16, i16 addrspace(3)* %gep_lo
264 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
265 %load_hi = load i16, i16 addrspace(3)* %gep_hi
266 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
267 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
268 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
269 ret <2 x i16> %result
272 ; The volatile operations aren't put on the same chain
; Loads two adjacent i16 values from the addrspace(3) pointer %ptr, with both
; loads marked volatile. The value at %ptr + 0 goes to lane 1, 12 is added to
; both lanes, then lane 0 is overwritten with the value loaded from %ptr + 1.
; The GCN checks expect ds_read_u16 (offset:2) into v1 and ds_read_u16_d16_hi
; into v0, v_pk_sub_u16 with -12 on v0, and a v_bfi_b32 with mask 0xffff to
; merge v1 into v0.
273 define <2 x i16> @chain_hi_to_lo_group_other_dep_multi_chain(i16 addrspace(3)* %ptr) {
274 ; GCN-LABEL: chain_hi_to_lo_group_other_dep_multi_chain:
275 ; GCN: ; %bb.0: ; %bb
276 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
277 ; GCN-NEXT: ds_read_u16 v1, v0 offset:2
278 ; GCN-NEXT: ds_read_u16_d16_hi v0, v0
279 ; GCN-NEXT: v_mov_b32_e32 v2, 0xffff
280 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
281 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
282 ; GCN-NEXT: v_bfi_b32 v0, v2, v1, v0
283 ; GCN-NEXT: s_setpc_b64 s[30:31]
285 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
286 %load_lo = load volatile i16, i16 addrspace(3)* %gep_lo
287 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
288 %load_hi = load volatile i16, i16 addrspace(3)* %gep_hi
289 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
290 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
291 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
292 ret <2 x i16> %result
; Loads two adjacent i16 values from the addrspace(5) pointer %ptr. The value at
; %ptr + 0 is inserted into lane 1, 12 is added to both lanes of that vector,
; and only afterwards is lane 0 overwritten with the value loaded from %ptr + 1.
; The GCN checks expect buffer_load_short_d16_hi first, then v_pk_sub_u16 with
; -12, then buffer_load_short_d16 at offset:2 into the same v1, which is copied
; to v0.
295 define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) {
296 ; GCN-LABEL: chain_hi_to_lo_private_other_dep:
297 ; GCN: ; %bb.0: ; %bb
298 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
299 ; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen
300 ; GCN-NEXT: s_waitcnt vmcnt(0)
301 ; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
302 ; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2
303 ; GCN-NEXT: s_waitcnt vmcnt(0)
304 ; GCN-NEXT: v_mov_b32_e32 v0, v1
305 ; GCN-NEXT: s_setpc_b64 s[30:31]
307 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
308 %load_lo = load i16, i16 addrspace(5)* %gep_lo
309 %gep_hi = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 0
310 %load_hi = load i16, i16 addrspace(5)* %gep_hi
311 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
312 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
313 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
314 ret <2 x i16> %result
; Loads two adjacent i16 values from the addrspace(1) pointer %ptr, with both
; loads marked volatile. The value at %ptr + 0 goes to lane 1, 12 is added to
; both lanes, then lane 0 is overwritten with the value loaded from %ptr + 1.
; The GCN checks expect global_load_ushort (offset:2) into v2 and
; global_load_short_d16_hi into v0, v_pk_sub_u16 with -12 on v0, and a
; v_bfi_b32 with mask 0xffff to merge v2 into v0.
317 define <2 x i16> @chain_hi_to_lo_global_other_dep(i16 addrspace(1)* %ptr) {
318 ; GCN-LABEL: chain_hi_to_lo_global_other_dep:
319 ; GCN: ; %bb.0: ; %bb
320 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321 ; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2
322 ; GCN-NEXT: global_load_short_d16_hi v0, v[0:1], off
323 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
324 ; GCN-NEXT: s_waitcnt vmcnt(0)
325 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
326 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
327 ; GCN-NEXT: s_setpc_b64 s[30:31]
329 %gep_lo = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 1
330 %load_lo = load volatile i16, i16 addrspace(1)* %gep_lo
331 %gep_hi = getelementptr inbounds i16, i16 addrspace(1)* %ptr, i64 0
332 %load_hi = load volatile i16, i16 addrspace(1)* %gep_hi
333 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
334 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
335 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
336 ret <2 x i16> %result
; Loads two adjacent i16 values from %ptr, whose type spells out addrspace(0)
; explicitly, with both loads marked volatile. The value at %ptr + 0 goes to
; lane 1, 12 is added to both lanes, then lane 0 is overwritten with the value
; loaded from %ptr + 1.
; The GCN checks expect flat_load_ushort (offset:2) into v2 and
; flat_load_short_d16_hi into v0, v_pk_sub_u16 with -12 on v0, and a v_bfi_b32
; with mask 0xffff to merge v2 into v0.
339 define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
340 ; GCN-LABEL: chain_hi_to_lo_flat_other_dep:
341 ; GCN: ; %bb.0: ; %bb
342 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
343 ; GCN-NEXT: flat_load_ushort v2, v[0:1] offset:2
344 ; GCN-NEXT: flat_load_short_d16_hi v0, v[0:1]
345 ; GCN-NEXT: v_mov_b32_e32 v1, 0xffff
346 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
347 ; GCN-NEXT: v_pk_sub_u16 v0, v0, -12 op_sel_hi:[1,0]
348 ; GCN-NEXT: v_bfi_b32 v0, v1, v2, v0
349 ; GCN-NEXT: s_setpc_b64 s[30:31]
351 %gep_lo = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 1
352 %load_lo = load volatile i16, i16 addrspace(0)* %gep_lo
353 %gep_hi = getelementptr inbounds i16, i16 addrspace(0)* %ptr, i64 0
354 %load_hi = load volatile i16, i16 addrspace(0)* %gep_hi
355 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
356 %op.hi = add <2 x i16> %to.hi, <i16 12, i16 12>
357 %result = insertelement <2 x i16> %op.hi, i16 %load_lo, i32 0
358 ret <2 x i16> %result
; Loads the i16 at %ptr + 0 (lane 1), stores the constant 123 through the
; unrelated addrspace(3) pointer %may.alias, and only then loads the i16 at
; %ptr + 1 (lane 0).
; The GCN checks expect the order read / ds_write_b16 / read to be kept, using
; two plain ds_read_u16 loads (no d16_hi form) whose results are combined with
; v_and_b32 and v_lshl_or_b32. The stored value appears as 0x7b (= 123).
361 define <2 x i16> @chain_hi_to_lo_group_may_alias_store(i16 addrspace(3)* %ptr, i16 addrspace(3)* %may.alias) {
362 ; GCN-LABEL: chain_hi_to_lo_group_may_alias_store:
363 ; GCN: ; %bb.0: ; %bb
364 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
365 ; GCN-NEXT: v_mov_b32_e32 v3, 0x7b
366 ; GCN-NEXT: ds_read_u16 v2, v0
367 ; GCN-NEXT: ds_write_b16 v1, v3
368 ; GCN-NEXT: ds_read_u16 v0, v0 offset:2
369 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
370 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
371 ; GCN-NEXT: v_lshl_or_b32 v0, v2, 16, v0
372 ; GCN-NEXT: s_setpc_b64 s[30:31]
374 %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
375 %gep_hi = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 0
376 %load_hi = load i16, i16 addrspace(3)* %gep_hi
377 store i16 123, i16 addrspace(3)* %may.alias
378 %load_lo = load i16, i16 addrspace(3)* %gep_lo
380 %to.hi = insertelement <2 x i16> undef, i16 %load_hi, i32 1
381 %result = insertelement <2 x i16> %to.hi, i16 %load_lo, i32 0
382 ret <2 x i16> %result