1 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI,SI %s
2 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICIVI,SICI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,SICIVI,VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; load_i32: amdgpu_vs function taking two inreg addrspace(6) pointers.
; It loads one i32 through %p0 and one through %p1 advanced by two elements
; (8 bytes). The order-independent checks expect the moves "s3 = 0",
; "s2 = s1" and "s1 = s3", and scalar dword loads based on s[0:1] and s[2:3].
; The second load's offset is expected as 0x2 on the tahiti/bonaire runs and
; 0x8 on the gfx900 run.
; NOTE(review): unlike load_float, no load-offset checks are listed for the
; tonga run here -- confirm whether that is intentional.
6 ; GCN-LABEL: {{^}}load_i32:
7 ; GCN-DAG: s_mov_b32 s3, 0
8 ; GCN-DAG: s_mov_b32 s2, s1
9 ; GCN-DAG: s_mov_b32 s1, s3
10 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
11 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
12 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
13 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
14 define amdgpu_vs float @load_i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two i32 elements.
15 %gep1 = getelementptr inbounds i32, ptr addrspace(6) %p1, i32 2
16 %r0 = load i32, ptr addrspace(6) %p0
17 %r1 = load i32, ptr addrspace(6) %gep1
19 %r2 = bitcast i32 %r to float
; load_v2i32: same shape as the scalar i32 test but with <2 x i32> loads.
; %p1 is advanced by two <2 x i32> elements (16 bytes). The checks expect
; s_load_dwordx2 from s[0:1] and s[2:3]; the second offset is expected as
; 0x4 on the tahiti/bonaire runs and 0x10 on the tonga and gfx900 runs.
; The two loaded vectors are added and the sum is bitcast to <2 x float>.
23 ; GCN-LABEL: {{^}}load_v2i32:
24 ; SICIVI-DAG: s_mov_b32 s3, 0
25 ; SICIVI-DAG: s_mov_b32 s2, s1
26 ; SICIVI-DAG: s_mov_b32 s1, s3
27 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
28 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
29 ; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
30 ; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
31 ; GFX9-DAG: s_mov_b32 s2, s1
32 ; GFX9-DAG: s_mov_b32 s3, 0
33 ; GFX9-DAG: s_mov_b32 s1, s3
34 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
35 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
36 define amdgpu_vs <2 x float> @load_v2i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <2 x i32> elements.
37 %gep1 = getelementptr inbounds <2 x i32>, ptr addrspace(6) %p1, i32 2
38 %r0 = load <2 x i32>, ptr addrspace(6) %p0
39 %r1 = load <2 x i32>, ptr addrspace(6) %gep1
40 %r = add <2 x i32> %r0, %r1
41 %r2 = bitcast <2 x i32> %r to <2 x float>
; load_v4i32: <4 x i32> variant. %p1 is advanced by two <4 x i32> elements
; (32 bytes). The checks expect s_load_dwordx4 from s[0:1] and s[2:3]; the
; second offset is expected as 0x8 on the tahiti/bonaire runs and 0x20 on
; the tonga and gfx900 runs.
; The two loaded vectors are added and the sum is bitcast to <4 x float>.
45 ; GCN-LABEL: {{^}}load_v4i32:
46 ; GCN-DAG: s_mov_b32 s3, 0
47 ; GCN-DAG: s_mov_b32 s2, s1
48 ; GCN-DAG: s_mov_b32 s1, s3
49 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
50 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
51 ; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
52 ; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
53 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
54 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
55 define amdgpu_vs <4 x float> @load_v4i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <4 x i32> elements.
56 %gep1 = getelementptr inbounds <4 x i32>, ptr addrspace(6) %p1, i32 2
57 %r0 = load <4 x i32>, ptr addrspace(6) %p0
58 %r1 = load <4 x i32>, ptr addrspace(6) %gep1
59 %r = add <4 x i32> %r0, %r1
60 %r2 = bitcast <4 x i32> %r to <4 x float>
; load_v8i32: <8 x i32> variant. %p1 is advanced by two <8 x i32> elements
; (64 bytes). The checks expect s_load_dwordx8 from s[0:1] and s[2:3]; the
; second offset is expected as 0x10 on the tahiti/bonaire runs and 0x40 on
; the tonga and gfx900 runs.
; The two loaded vectors are added and the sum is bitcast to <8 x float>.
64 ; GCN-LABEL: {{^}}load_v8i32:
65 ; GCN-DAG: s_mov_b32 s3, 0
66 ; GCN-DAG: s_mov_b32 s2, s1
67 ; GCN-DAG: s_mov_b32 s1, s3
68 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
69 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
70 ; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
71 ; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
72 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
73 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
74 define amdgpu_vs <8 x float> @load_v8i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <8 x i32> elements.
75 %gep1 = getelementptr inbounds <8 x i32>, ptr addrspace(6) %p1, i32 2
76 %r0 = load <8 x i32>, ptr addrspace(6) %p0
77 %r1 = load <8 x i32>, ptr addrspace(6) %gep1
78 %r = add <8 x i32> %r0, %r1
79 %r2 = bitcast <8 x i32> %r to <8 x float>
; load_v16i32: <16 x i32> variant. %p1 is advanced by two <16 x i32>
; elements (128 bytes). The checks expect s_load_dwordx16 from s[0:1] and
; s[2:3]; the second offset is expected as 0x20 on the tahiti/bonaire runs
; and 0x80 on the tonga and gfx900 runs.
; The two loaded vectors are added and the sum is bitcast to <16 x float>.
83 ; GCN-LABEL: {{^}}load_v16i32:
84 ; GCN-DAG: s_mov_b32 s3, 0
85 ; GCN-DAG: s_mov_b32 s2, s1
86 ; GCN-DAG: s_mov_b32 s1, s3
87 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
88 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
89 ; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
90 ; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
91 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
92 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
93 define amdgpu_vs <16 x float> @load_v16i32(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <16 x i32> elements.
94 %gep1 = getelementptr inbounds <16 x i32>, ptr addrspace(6) %p1, i32 2
95 %r0 = load <16 x i32>, ptr addrspace(6) %p0
96 %r1 = load <16 x i32>, ptr addrspace(6) %gep1
97 %r = add <16 x i32> %r0, %r1
98 %r2 = bitcast <16 x i32> %r to <16 x float>
; load_float: floating-point counterpart of the scalar i32 test. It loads
; one float through %p0 and one through %p1 advanced by two elements
; (8 bytes), then adds them with fadd. The checks expect s_load_dword from
; s[0:1] and s[2:3]; the second offset is expected as 0x2 on the
; tahiti/bonaire runs and 0x8 on the tonga and gfx900 runs.
102 ; GCN-LABEL: {{^}}load_float:
103 ; GCN-DAG: s_mov_b32 s3, 0
104 ; GCN-DAG: s_mov_b32 s2, s1
105 ; GCN-DAG: s_mov_b32 s1, s3
106 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
107 ; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
108 ; VI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
109 ; VI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
110 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
111 ; GFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
112 define amdgpu_vs float @load_float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two float elements.
113 %gep1 = getelementptr inbounds float, ptr addrspace(6) %p1, i32 2
114 %r0 = load float, ptr addrspace(6) %p0
115 %r1 = load float, ptr addrspace(6) %gep1
116 %r = fadd float %r0, %r1
; load_v2float: <2 x float> variant. %p1 is advanced by two <2 x float>
; elements (16 bytes) and the two loaded vectors are combined with fadd.
; The checks expect s_load_dwordx2 from s[0:1] and s[2:3]; the second
; offset is expected as 0x4 on the tahiti/bonaire runs and 0x10 on the
; tonga and gfx900 runs.
120 ; GCN-LABEL: {{^}}load_v2float:
121 ; SICIVI-DAG: s_mov_b32 s3, 0
122 ; SICIVI-DAG: s_mov_b32 s2, s1
123 ; SICIVI-DAG: s_mov_b32 s1, s3
124 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
125 ; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
126 ; VI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
127 ; VI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
128 ; GFX9-DAG: s_mov_b32 s2, s1
129 ; GFX9-DAG: s_mov_b32 s3, 0
130 ; GFX9-DAG: s_mov_b32 s1, s3
131 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
132 ; GFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
133 define amdgpu_vs <2 x float> @load_v2float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <2 x float> elements.
134 %gep1 = getelementptr inbounds <2 x float>, ptr addrspace(6) %p1, i32 2
135 %r0 = load <2 x float>, ptr addrspace(6) %p0
136 %r1 = load <2 x float>, ptr addrspace(6) %gep1
137 %r = fadd <2 x float> %r0, %r1
; load_v4float: <4 x float> variant. %p1 is advanced by two <4 x float>
; elements (32 bytes) and the two loaded vectors are combined with fadd.
; The checks expect s_load_dwordx4 from s[0:1] and s[2:3]; the second
; offset is expected as 0x8 on the tahiti/bonaire runs and 0x20 on the
; tonga and gfx900 runs.
141 ; GCN-LABEL: {{^}}load_v4float:
142 ; GCN-DAG: s_mov_b32 s3, 0
143 ; GCN-DAG: s_mov_b32 s2, s1
144 ; GCN-DAG: s_mov_b32 s1, s3
145 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
146 ; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
147 ; VI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
148 ; VI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
149 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
150 ; GFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
151 define amdgpu_vs <4 x float> @load_v4float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <4 x float> elements.
152 %gep1 = getelementptr inbounds <4 x float>, ptr addrspace(6) %p1, i32 2
153 %r0 = load <4 x float>, ptr addrspace(6) %p0
154 %r1 = load <4 x float>, ptr addrspace(6) %gep1
155 %r = fadd <4 x float> %r0, %r1
; load_v8float: <8 x float> variant. %p1 is advanced by two <8 x float>
; elements (64 bytes) and the two loaded vectors are combined with fadd.
; The checks expect s_load_dwordx8 from s[0:1] and s[2:3]; the second
; offset is expected as 0x10 on the tahiti/bonaire runs and 0x40 on the
; tonga and gfx900 runs.
159 ; GCN-LABEL: {{^}}load_v8float:
160 ; GCN-DAG: s_mov_b32 s3, 0
161 ; GCN-DAG: s_mov_b32 s2, s1
162 ; GCN-DAG: s_mov_b32 s1, s3
163 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
164 ; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
165 ; VI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
166 ; VI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
167 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
168 ; GFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
169 define amdgpu_vs <8 x float> @load_v8float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <8 x float> elements.
170 %gep1 = getelementptr inbounds <8 x float>, ptr addrspace(6) %p1, i32 2
171 %r0 = load <8 x float>, ptr addrspace(6) %p0
172 %r1 = load <8 x float>, ptr addrspace(6) %gep1
173 %r = fadd <8 x float> %r0, %r1
; load_v16float: <16 x float> variant. %p1 is advanced by two <16 x float>
; elements (128 bytes) and the two loaded vectors are combined with fadd.
; The checks expect s_load_dwordx16 from s[0:1] and s[2:3]; the second
; offset is expected as 0x20 on the tahiti/bonaire runs and 0x80 on the
; tonga and gfx900 runs.
177 ; GCN-LABEL: {{^}}load_v16float:
178 ; GCN-DAG: s_mov_b32 s3, 0
179 ; GCN-DAG: s_mov_b32 s2, s1
180 ; GCN-DAG: s_mov_b32 s1, s3
181 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
182 ; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
183 ; VI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
184 ; VI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
185 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
186 ; GFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
187 define amdgpu_vs <16 x float> @load_v16float(ptr addrspace(6) inreg %p0, ptr addrspace(6) inreg %p1) #0 {
; Second address: %p1 plus two <16 x float> elements.
188 %gep1 = getelementptr inbounds <16 x float>, ptr addrspace(6) %p1, i32 2
189 %r0 = load <16 x float>, ptr addrspace(6) %p0
190 %r1 = load <16 x float>, ptr addrspace(6) %gep1
191 %r = fadd <16 x float> %r0, %r1
; load_i32_hi0: single i32 load through an inreg addrspace(6) pointer in a
; function carrying attribute group #1 ("amdgpu-32bit-address-high-bits"="0").
; The checks expect s1 to be set to 0 and, on the very next line, a scalar
; dword load from the pair s[0:1] at offset 0x0.
195 ; GCN-LABEL: {{^}}load_i32_hi0:
196 ; GCN: s_mov_b32 s1, 0
197 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
198 define amdgpu_vs i32 @load_i32_hi0(ptr addrspace(6) inreg %p) #1 {
199 %r0 = load i32, ptr addrspace(6) %p
; load_i32_hi1: same as the hi0 case but with attribute group #2
; ("amdgpu-32bit-address-high-bits"="1"). The checks expect s1 to be set
; to 1 and, on the very next line, a scalar dword load from s[0:1] at
; offset 0x0.
203 ; GCN-LABEL: {{^}}load_i32_hi1:
204 ; GCN: s_mov_b32 s1, 1
205 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
206 define amdgpu_vs i32 @load_i32_hi1(ptr addrspace(6) inreg %p) #2 {
207 %r0 = load i32, ptr addrspace(6) %p
; load_i32_hiffff8000: same as the hi0 case but with attribute group #3
; ("amdgpu-32bit-address-high-bits"="0xffff8000"). The checks expect
; "s_movk_i32 s1, 0x8000" -- presumably the sign-extended 16-bit immediate
; form of 0xffff8000; confirm against the ISA -- followed on the very next
; line by a scalar dword load from s[0:1] at offset 0x0.
211 ; GCN-LABEL: {{^}}load_i32_hiffff8000:
212 ; GCN: s_movk_i32 s1, 0x8000
213 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
214 define amdgpu_vs i32 @load_i32_hiffff8000(ptr addrspace(6) inreg %p) #3 {
215 %r0 = load i32, ptr addrspace(6) %p
; load_i32_hifffffff0: same as the hi0 case but with attribute group #4
; ("amdgpu-32bit-address-high-bits"="0xfffffff0"). The checks expect s1 to
; be set to -16 (0xfffffff0 as a signed 32-bit value) and, on the very next
; line, a scalar dword load from s[0:1] at offset 0x0.
219 ; GCN-LABEL: {{^}}load_i32_hifffffff0:
220 ; GCN: s_mov_b32 s1, -16
221 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
222 define amdgpu_vs i32 @load_i32_hifffffff0(ptr addrspace(6) inreg %p) #4 {
223 %r0 = load i32, ptr addrspace(6) %p
; load_sampler: amdgpu_ps function with unnamed arguments. It loads an
; <8 x i32> and a <4 x i32> through getelementptrs on the second pointer
; argument (%1); both getelementptrs carry !amdgpu.uniform and both loads
; carry !invariant.load. The loaded values are the <8 x i32> and <4 x i32>
; operands of llvm.amdgcn.image.sample.1d. The checks expect a
; v_readfirstlane_b32, then an s_load_dwordx8 immediately followed by an
; s_load_dwordx4. The returned packed struct has field 4 set from argument
; %4, fields 5..8 set from the four sample components, and field 19 set
; from argument %20.
; The label pattern ends in ':' like every other label check in this file,
; so it cannot also match the longer name load_sampler_nouniform.
227 ; GCN-LABEL: {{^}}load_sampler:
228 ; GCN: v_readfirstlane_b32
230 ; GCN: s_load_dwordx8
231 ; GCN-NEXT: s_load_dwordx4
233 define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
235 %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
236 %23 = bitcast float %22 to i32
; Address and load of the <8 x i32> operand (uniform-annotated).
238 %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24, !amdgpu.uniform !0
239 %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
; Address and load of the <4 x i32> operand (uniform-annotated).
242 %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28, !amdgpu.uniform !0
243 %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
244 %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
245 %32 = extractelement <4 x float> %31, i32 0
246 %33 = extractelement <4 x float> %31, i32 1
247 %34 = extractelement <4 x float> %31, i32 2
248 %35 = extractelement <4 x float> %31, i32 3
249 %36 = bitcast float %4 to i32
250 %37 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %36, 4
251 %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %37, float %32, 5
252 %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 6
253 %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 7
254 %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 8
255 %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %20, 19
256 ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
; load_sampler_nouniform: same shape as the load_sampler test, except that
; the two getelementptrs on the second pointer argument (%1) carry no
; !amdgpu.uniform metadata (the loads still carry !invariant.load). The
; checks are the same: a v_readfirstlane_b32, then an s_load_dwordx8
; immediately followed by an s_load_dwordx4. The returned packed struct has
; field 4 set from argument %4, fields 5..8 set from the four sample
; components, and field 19 set from argument %20.
; The label pattern ends in ':' to match the other label checks in this file.
259 ; GCN-LABEL: {{^}}load_sampler_nouniform:
260 ; GCN: v_readfirstlane_b32
262 ; GCN: s_load_dwordx8
263 ; GCN-NEXT: s_load_dwordx4
265 define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform(ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), ptr addrspace(6) inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
267 %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
268 %23 = bitcast float %22 to i32
; Address and load of the <8 x i32> operand (no uniform annotation).
270 %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24
271 %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0
; Address and load of the <4 x i32> operand (no uniform annotation).
274 %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28
275 %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0
276 %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8
277 %32 = extractelement <4 x float> %31, i32 0
278 %33 = extractelement <4 x float> %31, i32 1
279 %34 = extractelement <4 x float> %31, i32 2
280 %35 = extractelement <4 x float> %31, i32 3
281 %36 = bitcast float %4 to i32
282 %37 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %36, 4
283 %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %37, float %32, 5
284 %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 6
285 %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 7
286 %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 8
287 %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %20, 19
288 ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42
; load_addr_no_fold: loads an i32 one element (4 bytes) past an inreg
; noalias addrspace(6) pointer; this getelementptr has no inbounds flag.
; The checks expect the 4 to be added to s0 with s_add_i32 and the load to
; use offset 0x0, i.e. the constant is not expected in the load's immediate
; offset field. s1 is expected to be set to 0.
291 ; GCN-LABEL: {{^}}load_addr_no_fold:
292 ; GCN-DAG: s_add_i32 s0, s0, 4
293 ; GCN-DAG: s_mov_b32 s1, 0
294 ; GCN: s_load_dword s{{[0-9]}}, s[0:1], 0x0
295 define amdgpu_vs float @load_addr_no_fold(ptr addrspace(6) inreg noalias %p0) #0 {
; Note: no inbounds on this getelementptr, unlike the load_* tests.
296 %gep1 = getelementptr i32, ptr addrspace(6) %p0, i32 1
297 %r1 = load i32, ptr addrspace(6) %gep1
298 %r2 = bitcast i32 %r1 to float
; vgpr_arg_src: the addrspace(6) pointer argument is not marked inreg here.
; The checks expect it to be read from v0 into an SGPR with
; v_readfirstlane_b32, a second SGPR to be written by s_mov_b32 (the moved
; value itself is not checked), and an s_load_dwordx4 whose base register
; pair is formed from those two captured register numbers. The loaded
; ptr addrspace(8) value is passed to llvm.amdgcn.struct.ptr.buffer.load.format.
302 ; GCN-LABEL: {{^}}vgpr_arg_src:
303 ; GCN: v_readfirstlane_b32 s[[READLANE:[0-9]+]], v0
304 ; GCN: s_mov_b32 s[[ZERO:[0-9]+]]
305 ; GCN: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[[[READLANE]]:[[ZERO]]]
306 define amdgpu_vs float @vgpr_arg_src(ptr addrspace(6) %arg) {
308 %tmp9 = load ptr addrspace(8), ptr addrspace(6) %arg
309 %tmp10 = call nsz float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %tmp9, i32 undef, i32 0, i32 0, i32 0) #1
; Declarations of the AMDGPU intrinsics called by the tests in this file.
313 ; Function Attrs: nounwind readnone speculatable
314 declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6

316 ; Function Attrs: nounwind memory(argmem: read)
317 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #7

; Uses the same attribute group (#7) as the image sample declaration.
319 declare float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8), i32, i32, i32, i32) #7
; Attribute groups referenced by the functions, declarations and call sites
; in this file.
; #0: plain nounwind, used by the load_* tests and load_addr_no_fold.
323 attributes #0 = { nounwind }
; #1..#4: nounwind plus a different "amdgpu-32bit-address-high-bits" value
; each, used by load_i32_hi0, load_i32_hi1, load_i32_hiffff8000 and
; load_i32_hifffffff0 respectively (#1 also appears on the buffer-load call
; in vgpr_arg_src).
324 attributes #1 = { nounwind "amdgpu-32bit-address-high-bits"="0" }
325 attributes #2 = { nounwind "amdgpu-32bit-address-high-bits"="1" }
326 attributes #3 = { nounwind "amdgpu-32bit-address-high-bits"="0xffff8000" }
327 attributes #4 = { nounwind "amdgpu-32bit-address-high-bits"="0xfffffff0" }
; #5: used by the two amdgpu_ps sampler tests.
328 attributes #5 = { "InitialPSInputAddr"="45175" }
; #6, #7: attached to the intrinsic declarations.
329 attributes #6 = { nounwind readnone speculatable }
330 attributes #7 = { nounwind memory(argmem: read) }
; #8: call-site attributes on the intrinsic calls in the sampler tests.
331 attributes #8 = { nounwind readnone }