; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,GCN,MUBUF,ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,GCN,MUBUF,UNALIGNED %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,GCN,MUBUF,ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,GCN,FLATSCR,ALIGNED %s
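
; Tests lowering of unaligned and under-aligned loads and stores for the
; local (LDS), global, constant, and private (scratch) address spaces.
; The ALIGNED runs expect unaligned accesses to be split into naturally
; aligned byte/short pieces; the UNALIGNED run (+unaligned-access-mode)
; expects full-width accesses. MUBUF vs FLATSCR distinguishes whether
; scratch accesses go through buffer_* or scratch_* instructions.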
; SI-LABEL: {{^}}local_unaligned_load_store_i16:
define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
  %v = load i16, i16 addrspace(3)* %p, align 1
  store i16 %v, i16 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_i16:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_ushort
; UNALIGNED: buffer_store_short
define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
  %v = load i16, i16 addrspace(1)* %p, align 1
  store i16 %v, i16 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_i32:
define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 1
  store i32 %v, i32 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 2
  ret void
}

; GCN-LABEL: {{^}}local_align2_load_store_i32:
define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 2
  store i32 %v, i32 addrspace(3)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_i64:
define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
  %v = load i64, i64 addrspace(3)* %p, align 1
  store i64 %v, i64 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 1
  store i64 %v, i64 addrspace(1)* %r, align 1
  ret void
}

; GCN-LABEL: {{^}}local_unaligned_load_store_v4i32:
define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx4
; UNALIGNED: buffer_store_dwordx4
define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_4:
define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_4_with_offset:
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
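; Note: the i64 at element offset 4 sits at byte offset 32; ds_read2_b32
; offsets are encoded in dword (4-byte) units, so the two halves land at
; offset0:8 (= 32 / 4) and offset1:9.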
define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
  %val = load i64, i64 addrspace(3)* %ptr, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits but the hi offset would need 9 bits.
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
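; Note: the dword offsets here are 255 and 256, and offset1 is an 8-bit
; field (max 255), so 256 cannot be encoded directly. The base address is
; adjusted instead and the pair is emitted as offset0:0, offset1:1.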
define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  %val = load i64, i64 addrspace(3)* %ptri64, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_1:
define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4:
define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
  store i64 %val, i64 addrspace(3)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4_with_offset:
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
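; Note: same dword-offset arithmetic as the load case above; byte offset
; 32 encodes as offset0:8, offset1:9.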
define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
  store i64 0, i64 addrspace(3)* %ptr, align 4
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits but the hi offset would need 9 bits.
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
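; Note: as in the load case, dword offsets 255/256 exceed the 8-bit
; offset1 field, so the base register is adjusted and offset1:1 is used.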
define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  store i64 0, i64 addrspace(3)* %ptri64, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: s_load_dword

; SI: buffer_store_dword
define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(4)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(4)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dwordx4
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(4)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_i64:
; SI: buffer_store_dwordx2
define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(4)* %p, align 4
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_v4i32:
; SI: buffer_store_dwordx4
define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 4
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx2

; SI: buffer_store_dwordx2
define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)* %p, <2 x i32> addrspace(1)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(4)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx4

; SI: buffer_store_dwordx4
define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_i8:
; SI: buffer_store_byte
define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(4)* %p, align 4
  store i8 %v, i8 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i8:
; SI: buffer_load_ubyte
; SI: buffer_store_byte
define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(4)* %p, align 2
  store i8 %v, i8 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1
  %v0 = load i32, i32 addrspace(4)* %p, align 4
  %v1 = load i32, i32 addrspace(4)* %gep0, align 4

  %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
  store i32 %v0, i32 addrspace(1)* %r, align 4
  store i32 %v1, i32 addrspace(1)* %gep1, align 4
  ret void
}

; SI-LABEL: {{^}}local_load_align1_v16i8:
; SI: ScratchSize: 0{{$}}
define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
  ret void
}

; SI-LABEL: {{^}}local_store_align1_v16i8:
; SI: ScratchSize: 0{{$}}
define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
  store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
  ret void
}

; SI-LABEL: {{^}}private_load_align1_f64:
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; FLATSCR: scratch_load_dwordx2
define double @private_load_align1_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 1
  ret double %x
}

; SI-LABEL: {{^}}private_store_align1_f64:
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; FLATSCR: scratch_store_dwordx2
define void @private_store_align1_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 1
  ret void
}

; SI-LABEL: {{^}}private_load_align4_f64:
; MUBUF: buffer_load_dword
; MUBUF: buffer_load_dword
; FLATSCR: scratch_load_dwordx2
define double @private_load_align4_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 4
  ret double %x
}

; SI-LABEL: {{^}}private_store_align4_f64:
; MUBUF: buffer_store_dword
; MUBUF: buffer_store_dword
; FLATSCR: scratch_store_dwordx2
define void @private_store_align4_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}private_load_align2_f64:
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; FLATSCR: scratch_load_dwordx2
define double @private_load_align2_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 2
  ret double %x
}

; SI-LABEL: {{^}}private_store_align2_f64:
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; FLATSCR: scratch_store_dwordx2
define void @private_store_align2_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 2
  ret void
}

; Should not merge this into a dword store.
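; Note: combining the two align-2 i16 stores would produce a single i32
; store with only 2-byte known alignment, which the aligned configurations
; cannot emit as one dword access, so the stores must stay split.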
define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
  %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
  %v = load i16, i16 addrspace(1)* %p, align 2
  store i16 1, i16 addrspace(1)* %r, align 2
  store i16 2, i16 addrspace(1)* %gep.r, align 2
  ret void
}

; Should not merge this into a dword load.
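; Note: same reasoning as the store case above; the two align-2 i16 loads
; may not be combined into one dword load.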
define i32 @load_2xi16_align2(i16 addrspace(1)* %p) #0 {
  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
  %p.0 = load i16, i16 addrspace(1)* %p, align 2
  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

attributes #0 = { nounwind }