;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
; Single-dword scalar buffer load with the constant offset 4. The expected
; instruction carries that offset as the immediate operand 0x4, and no
; s_waitcnt may be emitted ahead of the load.
;CHECK-LABEL: {{^}}s_buffer_load_imm:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
  %bitcast = bitcast i32 %load to float
  ; Pass the loaded value to an export; the remaining channels are undef.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}
; Single-dword scalar buffer load whose offset is the inreg argument %index.
; The expected instruction takes the offset from an SGPR, and no s_waitcnt
; may be emitted ahead of the load.
;CHECK-LABEL: {{^}}s_buffer_load_index:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_load_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast i32 %load to float
  ; Pass the loaded value to an export; the remaining channels are undef.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}
; Two-dword scalar buffer load (v2i32 result) with the constant offset 64.
; The expected instruction is the x2 form with immediate 0x40, and no
; s_waitcnt may be emitted ahead of the load.
;CHECK-LABEL: {{^}}s_buffer_loadx2_imm:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x40
define amdgpu_ps void @s_buffer_loadx2_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 64, i32 0)
  %bitcast = bitcast <2 x i32> %load to <2 x float>
  ; Split the vector so both loaded elements are passed to the export.
  %x = extractelement <2 x float> %bitcast, i32 0
  %y = extractelement <2 x float> %bitcast, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}
; Two-dword scalar buffer load (v2i32 result) whose offset is the inreg
; argument %index. The expected instruction is the x2 form with an SGPR
; offset, and no s_waitcnt may be emitted ahead of the load.
;CHECK-LABEL: {{^}}s_buffer_loadx2_index:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx2_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <2 x i32> %load to <2 x float>
  ; Split the vector so both loaded elements are passed to the export.
  %x = extractelement <2 x float> %bitcast, i32 0
  %y = extractelement <2 x float> %bitcast, i32 1
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}
; Four-dword scalar buffer load (v4i32 result) with the constant offset 200.
; The expected instruction is the x4 form with immediate 0xc8, and no
; s_waitcnt may be emitted ahead of the load.
;CHECK-LABEL: {{^}}s_buffer_loadx4_imm:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0xc8
define amdgpu_ps void @s_buffer_loadx4_imm(<4 x i32> inreg %desc) {
main_body:
  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 200, i32 0)
  %bitcast = bitcast <4 x i32> %load to <4 x float>
  ; Split the vector so all four loaded elements are passed to the export.
  %x = extractelement <4 x float> %bitcast, i32 0
  %y = extractelement <4 x float> %bitcast, i32 1
  %z = extractelement <4 x float> %bitcast, i32 2
  %w = extractelement <4 x float> %bitcast, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}
; Four-dword scalar buffer load (v4i32 result) whose offset is the inreg
; argument %index. The expected instruction is the x4 form with an SGPR
; offset, and no s_waitcnt may be emitted ahead of the load.
;CHECK-LABEL: {{^}}s_buffer_loadx4_index:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
define amdgpu_ps void @s_buffer_loadx4_index(<4 x i32> inreg %desc, i32 inreg %index) {
main_body:
  %load = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %index, i32 0)
  %bitcast = bitcast <4 x i32> %load to <4 x float>
  ; Split the vector so all four loaded elements are passed to the export.
  %x = extractelement <4 x float> %bitcast, i32 0
  %y = extractelement <4 x float> %bitcast, i32 1
  %z = extractelement <4 x float> %bitcast, i32 2
  %w = extractelement <4 x float> %bitcast, i32 3
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}
; Two single-dword loads from the same descriptor at the adjacent constant
; offsets 4 and 8. The expected output is one x2 load at immediate 0x4,
; i.e. the two loads are combined, with no s_waitcnt ahead of it.
;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex2:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x4
define amdgpu_ps void @s_buffer_load_imm_mergex2(<4 x i32> inreg %desc) {
main_body:
  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 4, i32 0)
  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
  %x = bitcast i32 %load0 to float
  %y = bitcast i32 %load1 to float
  ; Both loaded values are passed to the export.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float undef, float undef, i1 true, i1 true)
  ret void
}
; Four single-dword loads from the same descriptor at the consecutive
; constant offsets 8, 12, 16 and 20. The expected output is one x4 load at
; immediate 0x8, i.e. all four are combined, with no s_waitcnt ahead of it.
;CHECK-LABEL: {{^}}s_buffer_load_imm_mergex4:
;CHECK-NOT: s_waitcnt
;CHECK: s_buffer_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0x8
define amdgpu_ps void @s_buffer_load_imm_mergex4(<4 x i32> inreg %desc) {
main_body:
  %load0 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 8, i32 0)
  %load1 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 12, i32 0)
  %load2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 16, i32 0)
  %load3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 20, i32 0)
  %x = bitcast i32 %load0 to float
  %y = bitcast i32 %load1 to float
  %z = bitcast i32 %load2 to float
  %w = bitcast i32 %load3 to float
  ; All four loaded values are passed to the export.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true)
  ret void
}
; %index is not marked inreg here, and the offset is built in two steps that
; live in different basic blocks (shl in the entry block, or in bb1 next to
; the load). The expected output is a vector-register buffer_load_dword
; using offen addressing, with no s_waitcnt ahead of it.
;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %tmp = shl i32 %index, 4
  br label %bb1

bb1:                                              ; preds = %main_body
  %tmp1 = or i32 %tmp, 8
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
  %bitcast = bitcast i32 %load to float
  ; Pass the loaded value to an export; the remaining channels are undef.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float undef, float undef, float undef, i1 true, i1 true)
  ret void
}
; Same shape as the single-load cross-block case (non-inreg %index, shl in
; the entry block), but bb1 issues two loads whose offsets differ only by an
; extra "or 4". The expected output is two separate vector-register
; buffer_load_dword instructions with offen addressing, with no s_waitcnt
; ahead of the first one.
;CHECK-LABEL: {{^}}s_buffer_load_index_across_bb_merged:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
;CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen
define amdgpu_ps void @s_buffer_load_index_across_bb_merged(<4 x i32> inreg %desc, i32 %index) {
main_body:
  %tmp = shl i32 %index, 4
  br label %bb1

bb1:                                              ; preds = %main_body
  %tmp1 = or i32 %tmp, 8
  %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp1, i32 0)
  %tmp2 = or i32 %tmp1, 4
  %load2 = tail call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 %tmp2, i32 0)
  %bitcast = bitcast i32 %load to float
  %bitcast2 = bitcast i32 %load2 to float
  ; Both loaded values are passed to the export.
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %bitcast, float %bitcast2, float undef, float undef, i1 true, i1 true)
  ret void
}
; Intrinsics used by the tests in this file: the float export and the
; i32 / v2i32 / v4i32 variants of the scalar buffer load.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32)
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32)