1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
; divergent_or3_b32: each work-item loads a <3 x i32> from %arg, indexed by its
; workitem.id.x, computes ~(e1 | e0 | e2) and stores the i32 result back to the
; same address. The checks expect both ORs to be emitted as a single v_or3_b32
; followed by a separate v_not_b32.
4 define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
5 ; GCN-LABEL: divergent_or3_b32:
6 ; GCN: ; %bb.0: ; %bb
7 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
8 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
9 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
10 ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
11 ; GCN-NEXT: s_waitcnt vmcnt(0)
12 ; GCN-NEXT: v_or3_b32 v0, v1, v0, v2
13 ; GCN-NEXT: v_not_b32_e32 v0, v0
14 ; GCN-NEXT: global_store_dword v3, v0, s[0:1]
15 ; GCN-NEXT: s_endpgm
16 bb:
17 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
18 %i1 = zext i32 %i to i64
19 %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
20 %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
21 %i4 = extractelement <3 x i32> %i3, i64 0
22 %i5 = extractelement <3 x i32> %i3, i64 1
23 %i6 = extractelement <3 x i32> %i3, i64 2
; Three-input OR followed by a bitwise NOT (xor with -1).
24 %i7 = or i32 %i5, %i4
25 %i8 = or i32 %i7, %i6
26 %i9 = xor i32 %i8, -1
27 store i32 %i9, ptr addrspace(1) %i2, align 16
28 ret void
29 }
; divergent_or3_b64: 64-bit variant of the divergent OR3+NOT test. Each
; work-item loads a <3 x i64> indexed by its workitem.id.x, computes
; ~(e1 | e0 | e2) and stores the i64 result back to the same address. The checks
; expect one v_or3_b32 plus one v_not_b32 per 32-bit half.
31 define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
32 ; GCN-LABEL: divergent_or3_b64:
33 ; GCN: ; %bb.0: ; %bb
34 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
35 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
36 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
37 ; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
38 ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
39 ; GCN-NEXT: s_waitcnt vmcnt(0)
40 ; GCN-NEXT: v_or3_b32 v1, v3, v1, v5
41 ; GCN-NEXT: v_or3_b32 v0, v2, v0, v4
42 ; GCN-NEXT: v_not_b32_e32 v1, v1
43 ; GCN-NEXT: v_not_b32_e32 v0, v0
44 ; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
45 ; GCN-NEXT: s_endpgm
46 bb:
47 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
48 %i1 = zext i32 %i to i64
49 %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
50 %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
51 %i4 = extractelement <3 x i64> %i3, i64 0
52 %i5 = extractelement <3 x i64> %i3, i64 1
53 %i6 = extractelement <3 x i64> %i3, i64 2
; Three-input OR followed by a bitwise NOT (xor with -1).
54 %i7 = or i64 %i5, %i4
55 %i8 = or i64 %i7, %i6
56 %i9 = xor i64 %i8, -1
57 store i64 %i9, ptr addrspace(1) %i2, align 32
58 ret void
59 }
; divergent_and3_b32: each work-item loads a <3 x i32> indexed by its
; workitem.id.x, computes ~(e1 & e0 & e2) and stores the i32 result back to the
; same address. The checks expect two separate v_and_b32 instructions followed
; by v_not_b32 (no fused three-input or negated AND is expected here).
61 define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
62 ; GCN-LABEL: divergent_and3_b32:
63 ; GCN: ; %bb.0: ; %bb
64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
65 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
66 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
67 ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
68 ; GCN-NEXT: s_waitcnt vmcnt(0)
69 ; GCN-NEXT: v_and_b32_e32 v0, v1, v0
70 ; GCN-NEXT: v_and_b32_e32 v0, v0, v2
71 ; GCN-NEXT: v_not_b32_e32 v0, v0
72 ; GCN-NEXT: global_store_dword v3, v0, s[0:1]
73 ; GCN-NEXT: s_endpgm
74 bb:
75 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
76 %i1 = zext i32 %i to i64
77 %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
78 %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
79 %i4 = extractelement <3 x i32> %i3, i64 0
80 %i5 = extractelement <3 x i32> %i3, i64 1
81 %i6 = extractelement <3 x i32> %i3, i64 2
; Three-input AND followed by a bitwise NOT (xor with -1).
82 %i7 = and i32 %i5, %i4
83 %i8 = and i32 %i7, %i6
84 %i9 = xor i32 %i8, -1
85 store i32 %i9, ptr addrspace(1) %i2, align 16
86 ret void
87 }
; divergent_and3_b64: 64-bit variant of the divergent AND3+NOT test. Each
; work-item loads a <3 x i64> indexed by its workitem.id.x, computes
; ~(e1 & e0 & e2) and stores the i64 result back to the same address. The checks
; expect two v_and_b32 per 32-bit half followed by one v_not_b32 per half; the
; first pair of ANDs is expected after `s_waitcnt vmcnt(1)`, ahead of the final
; `s_waitcnt vmcnt(0)`.
89 define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
90 ; GCN-LABEL: divergent_and3_b64:
91 ; GCN: ; %bb.0: ; %bb
92 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
93 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
94 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
95 ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
96 ; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
97 ; GCN-NEXT: s_waitcnt vmcnt(1)
98 ; GCN-NEXT: v_and_b32_e32 v1, v3, v1
99 ; GCN-NEXT: v_and_b32_e32 v0, v2, v0
100 ; GCN-NEXT: s_waitcnt vmcnt(0)
101 ; GCN-NEXT: v_and_b32_e32 v1, v1, v5
102 ; GCN-NEXT: v_and_b32_e32 v0, v0, v4
103 ; GCN-NEXT: v_not_b32_e32 v1, v1
104 ; GCN-NEXT: v_not_b32_e32 v0, v0
105 ; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
106 ; GCN-NEXT: s_endpgm
107 bb:
108 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
109 %i1 = zext i32 %i to i64
110 %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
111 %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
112 %i4 = extractelement <3 x i64> %i3, i64 0
113 %i5 = extractelement <3 x i64> %i3, i64 1
114 %i6 = extractelement <3 x i64> %i3, i64 2
; Three-input AND followed by a bitwise NOT (xor with -1).
115 %i7 = and i64 %i5, %i4
116 %i8 = and i64 %i7, %i6
117 %i9 = xor i64 %i8, -1
118 store i64 %i9, ptr addrspace(1) %i2, align 32
119 ret void
120 }
; divergent_xor3_b32: each work-item loads a <3 x i32> indexed by its
; workitem.id.x, computes ~(e1 ^ e0 ^ e2) and stores the i32 result back to the
; same address. The checks expect v_xor_b32 followed by v_xnor_b32, i.e. the
; final NOT is folded into the second XOR.
122 define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
123 ; GCN-LABEL: divergent_xor3_b32:
124 ; GCN: ; %bb.0: ; %bb
125 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
126 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
127 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
128 ; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
129 ; GCN-NEXT: s_waitcnt vmcnt(0)
130 ; GCN-NEXT: v_xor_b32_e32 v0, v1, v0
131 ; GCN-NEXT: v_xnor_b32_e32 v0, v0, v2
132 ; GCN-NEXT: global_store_dword v3, v0, s[0:1]
133 ; GCN-NEXT: s_endpgm
134 bb:
135 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
136 %i1 = zext i32 %i to i64
137 %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
138 %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
139 %i4 = extractelement <3 x i32> %i3, i64 0
140 %i5 = extractelement <3 x i32> %i3, i64 1
141 %i6 = extractelement <3 x i32> %i3, i64 2
; Three-input XOR followed by a bitwise NOT (xor with -1).
142 %i7 = xor i32 %i5, %i4
143 %i8 = xor i32 %i7, %i6
144 %i9 = xor i32 %i8, -1
145 store i32 %i9, ptr addrspace(1) %i2, align 16
146 ret void
147 }
; divergent_xor3_b64: 64-bit variant of the divergent XOR3+NOT test. Each
; work-item loads a <3 x i64> indexed by its workitem.id.x, computes
; ~(e1 ^ e0 ^ e2) and stores the i64 result back to the same address. The checks
; expect, per 32-bit half, one v_xor_b32 followed by one v_xnor_b32.
149 define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
150 ; GCN-LABEL: divergent_xor3_b64:
151 ; GCN: ; %bb.0: ; %bb
152 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
153 ; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
154 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
155 ; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
156 ; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
157 ; GCN-NEXT: s_waitcnt vmcnt(1)
158 ; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
159 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
160 ; GCN-NEXT: s_waitcnt vmcnt(0)
161 ; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5
162 ; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
163 ; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
164 ; GCN-NEXT: s_endpgm
165 bb:
166 %i = tail call i32 @llvm.amdgcn.workitem.id.x()
167 %i1 = zext i32 %i to i64
168 %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
169 %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
170 %i4 = extractelement <3 x i64> %i3, i64 0
171 %i5 = extractelement <3 x i64> %i3, i64 1
172 %i6 = extractelement <3 x i64> %i3, i64 2
; Three-input XOR followed by a bitwise NOT (xor with -1).
173 %i7 = xor i64 %i5, %i4
174 %i8 = xor i64 %i7, %i6
175 %i9 = xor i64 %i8, -1
176 store i64 %i9, ptr addrspace(1) %i2, align 32
177 ret void
178 }
; uniform_or3_b32: loads a <3 x i32> directly from the kernel argument %arg (no
; work-item index is involved), computes ~(e1 | e0 | e2) and stores the i32
; result back to %arg. The checks expect scalar instructions: s_or_b32 followed
; by s_nor_b32 (the NOT folded into the second OR), with the result moved into
; v1 by v_mov_b32 before the global_store_dword.
180 define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) {
181 ; GCN-LABEL: uniform_or3_b32:
182 ; GCN: ; %bb.0: ; %bb
183 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
184 ; GCN-NEXT: v_mov_b32_e32 v0, 0
185 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
186 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
187 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
188 ; GCN-NEXT: s_or_b32 s0, s1, s0
189 ; GCN-NEXT: s_nor_b32 s0, s0, s2
190 ; GCN-NEXT: v_mov_b32_e32 v1, s0
191 ; GCN-NEXT: global_store_dword v0, v1, s[4:5]
192 ; GCN-NEXT: s_endpgm
193 bb:
194 %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
195 %i4 = extractelement <3 x i32> %i3, i64 0
196 %i5 = extractelement <3 x i32> %i3, i64 1
197 %i6 = extractelement <3 x i32> %i3, i64 2
; Three-input OR followed by a bitwise NOT (xor with -1).
198 %i7 = or i32 %i5, %i4
199 %i8 = or i32 %i7, %i6
200 %i9 = xor i32 %i8, -1
201 store i32 %i9, ptr addrspace(1) %arg, align 16
202 ret void
203 }
; uniform_or3_b64: 64-bit variant of the uniform OR3+NOT test. Loads a
; <3 x i64> directly from %arg, computes ~(e1 | e0 | e2) and stores the i64
; result back to %arg. The checks expect s_or_b64 followed by s_nor_b64, with
; both result halves moved into v0/v1 before the global_store_dwordx2.
205 define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) {
206 ; GCN-LABEL: uniform_or3_b64:
207 ; GCN: ; %bb.0: ; %bb
208 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
209 ; GCN-NEXT: v_mov_b32_e32 v2, 0
210 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
211 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
212 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
213 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
214 ; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
215 ; GCN-NEXT: s_nor_b64 s[0:1], s[0:1], s[6:7]
216 ; GCN-NEXT: v_mov_b32_e32 v0, s0
217 ; GCN-NEXT: v_mov_b32_e32 v1, s1
218 ; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
219 ; GCN-NEXT: s_endpgm
220 bb:
221 %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
222 %i4 = extractelement <3 x i64> %i3, i64 0
223 %i5 = extractelement <3 x i64> %i3, i64 1
224 %i6 = extractelement <3 x i64> %i3, i64 2
; Three-input OR followed by a bitwise NOT (xor with -1).
225 %i7 = or i64 %i5, %i4
226 %i8 = or i64 %i7, %i6
227 %i9 = xor i64 %i8, -1
228 store i64 %i9, ptr addrspace(1) %arg, align 32
229 ret void
230 }
; uniform_and3_b32: loads a <3 x i32> directly from the kernel argument %arg,
; computes ~(e1 & e0 & e2) and stores the i32 result back to %arg. The checks
; expect s_and_b32 followed by s_nand_b32 (the NOT folded into the second AND),
; with the result moved into v1 before the global_store_dword.
232 define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) {
233 ; GCN-LABEL: uniform_and3_b32:
234 ; GCN: ; %bb.0: ; %bb
235 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
236 ; GCN-NEXT: v_mov_b32_e32 v0, 0
237 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
238 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
239 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
240 ; GCN-NEXT: s_and_b32 s0, s1, s0
241 ; GCN-NEXT: s_nand_b32 s0, s0, s2
242 ; GCN-NEXT: v_mov_b32_e32 v1, s0
243 ; GCN-NEXT: global_store_dword v0, v1, s[4:5]
244 ; GCN-NEXT: s_endpgm
245 bb:
246 %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
247 %i4 = extractelement <3 x i32> %i3, i64 0
248 %i5 = extractelement <3 x i32> %i3, i64 1
249 %i6 = extractelement <3 x i32> %i3, i64 2
; Three-input AND followed by a bitwise NOT (xor with -1).
250 %i7 = and i32 %i5, %i4
251 %i8 = and i32 %i7, %i6
252 %i9 = xor i32 %i8, -1
253 store i32 %i9, ptr addrspace(1) %arg, align 16
254 ret void
255 }
; uniform_and3_b64: 64-bit variant of the uniform AND3+NOT test. Loads a
; <3 x i64> directly from %arg, computes ~(e1 & e0 & e2) and stores the i64
; result back to %arg. The checks expect s_and_b64 followed by s_nand_b64, with
; both result halves moved into v0/v1 before the global_store_dwordx2.
257 define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) {
258 ; GCN-LABEL: uniform_and3_b64:
259 ; GCN: ; %bb.0: ; %bb
260 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
261 ; GCN-NEXT: v_mov_b32_e32 v2, 0
262 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
263 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
264 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
265 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
266 ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
267 ; GCN-NEXT: s_nand_b64 s[0:1], s[0:1], s[6:7]
268 ; GCN-NEXT: v_mov_b32_e32 v0, s0
269 ; GCN-NEXT: v_mov_b32_e32 v1, s1
270 ; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
271 ; GCN-NEXT: s_endpgm
272 bb:
273 %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
274 %i4 = extractelement <3 x i64> %i3, i64 0
275 %i5 = extractelement <3 x i64> %i3, i64 1
276 %i6 = extractelement <3 x i64> %i3, i64 2
; Three-input AND followed by a bitwise NOT (xor with -1).
277 %i7 = and i64 %i5, %i4
278 %i8 = and i64 %i7, %i6
279 %i9 = xor i64 %i8, -1
280 store i64 %i9, ptr addrspace(1) %arg, align 32
281 ret void
282 }
; uniform_xor3_b32: loads a <3 x i32> directly from the kernel argument %arg,
; computes ~(e1 ^ e0 ^ e2) and stores the i32 result back to %arg. The checks
; expect s_xor_b32 followed by s_xnor_b32 (the NOT folded into the second XOR),
; with the result moved into v1 before the global_store_dword.
284 define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) {
285 ; GCN-LABEL: uniform_xor3_b32:
286 ; GCN: ; %bb.0: ; %bb
287 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
288 ; GCN-NEXT: v_mov_b32_e32 v0, 0
289 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
290 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
291 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
292 ; GCN-NEXT: s_xor_b32 s0, s1, s0
293 ; GCN-NEXT: s_xnor_b32 s0, s0, s2
294 ; GCN-NEXT: v_mov_b32_e32 v1, s0
295 ; GCN-NEXT: global_store_dword v0, v1, s[4:5]
296 ; GCN-NEXT: s_endpgm
297 bb:
298 %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
299 %i4 = extractelement <3 x i32> %i3, i64 0
300 %i5 = extractelement <3 x i32> %i3, i64 1
301 %i6 = extractelement <3 x i32> %i3, i64 2
; Three-input XOR followed by a bitwise NOT (xor with -1).
302 %i7 = xor i32 %i5, %i4
303 %i8 = xor i32 %i7, %i6
304 %i9 = xor i32 %i8, -1
305 store i32 %i9, ptr addrspace(1) %arg, align 16
306 ret void
307 }
; uniform_xor3_b64: 64-bit variant of the uniform XOR3+NOT test. Loads a
; <3 x i64> directly from %arg, computes ~(e1 ^ e0 ^ e2) and stores the i64
; result back to %arg. The checks expect s_xor_b64 followed by s_xnor_b64, with
; both result halves moved into v0/v1 before the global_store_dwordx2.
309 define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) {
310 ; GCN-LABEL: uniform_xor3_b64:
311 ; GCN: ; %bb.0: ; %bb
312 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
313 ; GCN-NEXT: v_mov_b32_e32 v2, 0
314 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
315 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
316 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
317 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
318 ; GCN-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
319 ; GCN-NEXT: s_xnor_b64 s[0:1], s[0:1], s[6:7]
320 ; GCN-NEXT: v_mov_b32_e32 v0, s0
321 ; GCN-NEXT: v_mov_b32_e32 v1, s1
322 ; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
323 ; GCN-NEXT: s_endpgm
324 bb:
325 %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
326 %i4 = extractelement <3 x i64> %i3, i64 0
327 %i5 = extractelement <3 x i64> %i3, i64 1
328 %i6 = extractelement <3 x i64> %i3, i64 2
; Three-input XOR followed by a bitwise NOT (xor with -1).
329 %i7 = xor i64 %i5, %i4
330 %i8 = xor i64 %i7, %i6
331 %i9 = xor i64 %i8, -1
332 store i64 %i9, ptr addrspace(1) %arg, align 32
333 ret void
334 }
; Declaration of the intrinsic that the divergent_* kernels in this file call;
; they zero-extend its i32 result and use it as the element index into %arg.
336 declare i32 @llvm.amdgcn.workitem.id.x()