1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; or_v2i32 - bitwise OR of two <2 x i32> vectors read from consecutive
; elements of %in (element 0 and element 1); the result is stored to %out.
; The GFX6 and GFX8 checks expect one buffer_load_dwordx4 feeding two
; v_or_b32 and a dwordx2 store; the EG checks expect one VTX_READ_128 and
; two OR_INT.
6 define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
7 ; GFX6-LABEL: or_v2i32:
9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
10 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
11 ; GFX6-NEXT: s_mov_b32 s6, -1
12 ; GFX6-NEXT: s_mov_b32 s10, s6
13 ; GFX6-NEXT: s_mov_b32 s11, s7
14 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX6-NEXT: s_mov_b32 s8, s2
16 ; GFX6-NEXT: s_mov_b32 s9, s3
17 ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
18 ; GFX6-NEXT: s_mov_b32 s4, s0
19 ; GFX6-NEXT: s_mov_b32 s5, s1
20 ; GFX6-NEXT: s_waitcnt vmcnt(0)
21 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
22 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
23 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
26 ; GFX8-LABEL: or_v2i32:
28 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
29 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
30 ; GFX8-NEXT: s_mov_b32 s6, -1
31 ; GFX8-NEXT: s_mov_b32 s10, s6
32 ; GFX8-NEXT: s_mov_b32 s11, s7
33 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
34 ; GFX8-NEXT: s_mov_b32 s8, s2
35 ; GFX8-NEXT: s_mov_b32 s9, s3
36 ; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
37 ; GFX8-NEXT: s_mov_b32 s4, s0
38 ; GFX8-NEXT: s_mov_b32 s5, s1
39 ; GFX8-NEXT: s_waitcnt vmcnt(0)
40 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
41 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
42 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
47 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
49 ; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
50 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
53 ; EG-NEXT: Fetch clause starting at 6:
54 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
55 ; EG-NEXT: ALU clause starting at 8:
56 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
57 ; EG-NEXT: ALU clause starting at 9:
58 ; EG-NEXT: OR_INT * T0.Y, T0.Y, T0.W,
59 ; EG-NEXT: OR_INT T0.X, T0.X, T0.Z,
60 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
61 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
62 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
63 %a = load <2 x i32>, ptr addrspace(1) %in
64 %b = load <2 x i32>, ptr addrspace(1) %b_ptr
65 %result = or <2 x i32> %a, %b
66 store <2 x i32> %result, ptr addrspace(1) %out
; or_v4i32 - bitwise OR of two <4 x i32> vectors read from consecutive
; elements of %in (element 0 and element 1); the result is stored to %out.
; The GFX6 and GFX8 checks expect two buffer_load_dwordx4 (the second at
; offset:16), four v_or_b32 and a dwordx4 store; the EG checks expect two
; VTX_READ_128 and four OR_INT.
70 define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
71 ; GFX6-LABEL: or_v4i32:
73 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
74 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
75 ; GFX6-NEXT: s_mov_b32 s6, -1
76 ; GFX6-NEXT: s_mov_b32 s10, s6
77 ; GFX6-NEXT: s_mov_b32 s11, s7
78 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
79 ; GFX6-NEXT: s_mov_b32 s8, s2
80 ; GFX6-NEXT: s_mov_b32 s9, s3
81 ; GFX6-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
82 ; GFX6-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
83 ; GFX6-NEXT: s_mov_b32 s4, s0
84 ; GFX6-NEXT: s_mov_b32 s5, s1
85 ; GFX6-NEXT: s_waitcnt vmcnt(0)
86 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v7
87 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
88 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
89 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
90 ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
93 ; GFX8-LABEL: or_v4i32:
95 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
96 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
97 ; GFX8-NEXT: s_mov_b32 s6, -1
98 ; GFX8-NEXT: s_mov_b32 s10, s6
99 ; GFX8-NEXT: s_mov_b32 s11, s7
100 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
101 ; GFX8-NEXT: s_mov_b32 s8, s2
102 ; GFX8-NEXT: s_mov_b32 s9, s3
103 ; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
104 ; GFX8-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
105 ; GFX8-NEXT: s_mov_b32 s4, s0
106 ; GFX8-NEXT: s_mov_b32 s5, s1
107 ; GFX8-NEXT: s_waitcnt vmcnt(0)
108 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v7
109 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
110 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
111 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
112 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
113 ; GFX8-NEXT: s_endpgm
115 ; EG-LABEL: or_v4i32:
117 ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
119 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
120 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
123 ; EG-NEXT: Fetch clause starting at 6:
124 ; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
125 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
126 ; EG-NEXT: ALU clause starting at 10:
127 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
128 ; EG-NEXT: ALU clause starting at 11:
129 ; EG-NEXT: OR_INT * T0.W, T0.W, T1.W,
130 ; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z,
131 ; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y,
132 ; EG-NEXT: OR_INT T0.X, T0.X, T1.X,
133 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
134 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
135 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
136 %a = load <4 x i32>, ptr addrspace(1) %in
137 %b = load <4 x i32>, ptr addrspace(1) %b_ptr
138 %result = or <4 x i32> %a, %b
139 store <4 x i32> %result, ptr addrspace(1) %out
; scalar_or_i32 - kernel taking two i32 arguments (%a, %b) and storing the
; i32 value %or to %out.
; The GFX6 and GFX8 checks expect a single s_or_b32 on the kernel-argument
; SGPRs (s2, s3) followed by a buffer_store_dword; the EG checks expect one
; OR_INT of KC0[2].Z and KC0[2].W.
143 define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
144 ; GFX6-LABEL: scalar_or_i32:
146 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
147 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
148 ; GFX6-NEXT: s_mov_b32 s6, -1
149 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX6-NEXT: s_mov_b32 s4, s0
151 ; GFX6-NEXT: s_or_b32 s0, s2, s3
152 ; GFX6-NEXT: s_mov_b32 s5, s1
153 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
154 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
155 ; GFX6-NEXT: s_endpgm
157 ; GFX8-LABEL: scalar_or_i32:
159 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
160 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
161 ; GFX8-NEXT: s_mov_b32 s6, -1
162 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
163 ; GFX8-NEXT: s_mov_b32 s4, s0
164 ; GFX8-NEXT: s_or_b32 s0, s2, s3
165 ; GFX8-NEXT: s_mov_b32 s5, s1
166 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
167 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
168 ; GFX8-NEXT: s_endpgm
170 ; EG-LABEL: scalar_or_i32:
172 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
173 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
176 ; EG-NEXT: ALU clause starting at 4:
177 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
178 ; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W,
179 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
181 store i32 %or, ptr addrspace(1) %out
; vector_or_i32 - ORs an i32 loaded from %a with the i32 kernel argument %b
; and stores the result to %out.
; The GFX6 and GFX8 checks expect a v_or_b32 whose operands are the SGPR
; holding %b (s12) and the loaded VGPR (v0); the EG checks expect one
; OR_INT of the fetched value with KC0[2].W.
185 define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) {
186 ; GFX6-LABEL: vector_or_i32:
188 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
189 ; GFX6-NEXT: s_load_dword s12, s[4:5], 0xd
190 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
191 ; GFX6-NEXT: s_mov_b32 s6, -1
192 ; GFX6-NEXT: s_mov_b32 s10, s6
193 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
194 ; GFX6-NEXT: s_mov_b32 s8, s2
195 ; GFX6-NEXT: s_mov_b32 s9, s3
196 ; GFX6-NEXT: s_mov_b32 s11, s7
197 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
198 ; GFX6-NEXT: s_mov_b32 s4, s0
199 ; GFX6-NEXT: s_mov_b32 s5, s1
200 ; GFX6-NEXT: s_waitcnt vmcnt(0)
201 ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0
202 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
203 ; GFX6-NEXT: s_endpgm
205 ; GFX8-LABEL: vector_or_i32:
207 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
208 ; GFX8-NEXT: s_load_dword s12, s[4:5], 0x34
209 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
210 ; GFX8-NEXT: s_mov_b32 s6, -1
211 ; GFX8-NEXT: s_mov_b32 s10, s6
212 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
213 ; GFX8-NEXT: s_mov_b32 s8, s2
214 ; GFX8-NEXT: s_mov_b32 s9, s3
215 ; GFX8-NEXT: s_mov_b32 s11, s7
216 ; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
217 ; GFX8-NEXT: s_mov_b32 s4, s0
218 ; GFX8-NEXT: s_mov_b32 s5, s1
219 ; GFX8-NEXT: s_waitcnt vmcnt(0)
220 ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0
221 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
222 ; GFX8-NEXT: s_endpgm
224 ; EG-LABEL: vector_or_i32:
226 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
228 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
229 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
232 ; EG-NEXT: Fetch clause starting at 6:
233 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
234 ; EG-NEXT: ALU clause starting at 8:
235 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
236 ; EG-NEXT: ALU clause starting at 9:
237 ; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W,
238 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
239 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
240 %loada = load i32, ptr addrspace(1) %a
241 %or = or i32 %loada, %b
242 store i32 %or, ptr addrspace(1) %out
; scalar_or_literal_i32 - ORs the i32 kernel argument %a with the constant
; 99999 (0x1869f) and stores the result to %out.
; The GFX6 and GFX8 checks expect a single s_or_b32 with the 32-bit literal
; 0x1869f; the EG checks expect one OR_INT against a literal slot holding
; 99999.
246 define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) {
247 ; GFX6-LABEL: scalar_or_literal_i32:
249 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
250 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
251 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
252 ; GFX6-NEXT: s_mov_b32 s2, -1
253 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
254 ; GFX6-NEXT: s_or_b32 s4, s6, 0x1869f
255 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
256 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
257 ; GFX6-NEXT: s_endpgm
259 ; GFX8-LABEL: scalar_or_literal_i32:
261 ; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
262 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
263 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
264 ; GFX8-NEXT: s_mov_b32 s2, -1
265 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX8-NEXT: s_or_b32 s4, s6, 0x1869f
267 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
268 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
269 ; GFX8-NEXT: s_endpgm
271 ; EG-LABEL: scalar_or_literal_i32:
273 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
274 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
277 ; EG-NEXT: ALU clause starting at 4:
278 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
279 ; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y,
280 ; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40)
281 %or = or i32 %a, 99999
282 store i32 %or, ptr addrspace(1) %out, align 4
; scalar_or_literal_i64 - ORs the i64 kernel argument %a with the constant
; 4261135838621753 (high dword 0xf237b, low dword 0x3039) and stores the
; result to %out. An unnamed [8 x i32] argument sits between %out and %a.
; The GFX6 and GFX8 checks expect the 64-bit OR to be emitted as two
; s_or_b32, one per 32-bit half, each with its own literal; the EG checks
; expect two OR_INT with literals 992123 and 12345.
286 define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
287 ; GFX6-LABEL: scalar_or_literal_i64:
289 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
290 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
291 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
292 ; GFX6-NEXT: s_mov_b32 s2, -1
293 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX6-NEXT: s_or_b32 s4, s7, 0xf237b
295 ; GFX6-NEXT: s_or_b32 s5, s6, 0x3039
296 ; GFX6-NEXT: v_mov_b32_e32 v0, s5
297 ; GFX6-NEXT: v_mov_b32_e32 v1, s4
298 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
299 ; GFX6-NEXT: s_endpgm
301 ; GFX8-LABEL: scalar_or_literal_i64:
303 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
304 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
305 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
306 ; GFX8-NEXT: s_mov_b32 s2, -1
307 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
308 ; GFX8-NEXT: s_or_b32 s4, s7, 0xf237b
309 ; GFX8-NEXT: s_or_b32 s5, s6, 0x3039
310 ; GFX8-NEXT: v_mov_b32_e32 v0, s5
311 ; GFX8-NEXT: v_mov_b32_e32 v1, s4
312 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
313 ; GFX8-NEXT: s_endpgm
315 ; EG-LABEL: scalar_or_literal_i64:
317 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
318 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
321 ; EG-NEXT: ALU clause starting at 4:
322 ; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x,
323 ; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
324 ; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
325 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
326 ; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
327 %or = or i64 %a, 4261135838621753
328 store i64 %or, ptr addrspace(1) %out
; scalar_or_literal_multi_use_i64 - uses the same 64-bit constant
; (4261135838621753) twice: once in an OR with %a (stored to %out) and once
; in an ADD with %b (stored with a volatile store to an undef pointer).
; The GFX6 and GFX8 checks expect the constant to be materialised in
; s[8:9] and consumed by a single s_or_b64, while the add is emitted as
; s_add_u32 / s_addc_u32 with the two 32-bit literals as operands.
332 define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
333 ; GFX6-LABEL: scalar_or_literal_multi_use_i64:
335 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
336 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
337 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d
338 ; GFX6-NEXT: s_movk_i32 s8, 0x3039
339 ; GFX6-NEXT: s_mov_b32 s9, 0xf237b
340 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
341 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
342 ; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
343 ; GFX6-NEXT: v_mov_b32_e32 v0, s6
344 ; GFX6-NEXT: s_mov_b32 s2, -1
345 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
346 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
347 ; GFX6-NEXT: s_add_u32 s0, s4, 0x3039
348 ; GFX6-NEXT: s_addc_u32 s1, s5, 0xf237b
349 ; GFX6-NEXT: s_waitcnt expcnt(0)
350 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
351 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
352 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
353 ; GFX6-NEXT: s_waitcnt vmcnt(0)
354 ; GFX6-NEXT: s_endpgm
356 ; GFX8-LABEL: scalar_or_literal_multi_use_i64:
358 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
359 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
360 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x74
361 ; GFX8-NEXT: s_movk_i32 s8, 0x3039
362 ; GFX8-NEXT: s_mov_b32 s9, 0xf237b
363 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
364 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
365 ; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
366 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
367 ; GFX8-NEXT: s_mov_b32 s2, -1
368 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
369 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
370 ; GFX8-NEXT: s_add_u32 s0, s4, 0x3039
371 ; GFX8-NEXT: s_addc_u32 s1, s5, 0xf237b
372 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
373 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
374 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
375 ; GFX8-NEXT: s_waitcnt vmcnt(0)
376 ; GFX8-NEXT: s_endpgm
378 ; EG-LABEL: scalar_or_literal_multi_use_i64:
380 ; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[]
381 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0
382 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
383 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
386 ; EG-NEXT: ALU clause starting at 6:
387 ; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x,
388 ; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
389 ; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x,
390 ; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W,
391 ; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00)
392 ; EG-NEXT: ADD_INT T1.X, PV.W, literal.x,
393 ; EG-NEXT: MOV * T2.X, literal.y,
394 ; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
395 ; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x,
396 ; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00)
397 ; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x,
398 ; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
399 ; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45)
400 %or = or i64 %a, 4261135838621753
401 store i64 %or, ptr addrspace(1) %out
; Second use of the same constant; the store is volatile, presumably so the
; add cannot be dropped as dead.
403 %foo = add i64 %b, 4261135838621753
404 store volatile i64 %foo, ptr addrspace(1) undef
; scalar_or_inline_imm_i64 - kernel taking an i64 argument %a (after an
; unnamed [8 x i32] argument) and storing the i64 value %or to %out.
; The GFX6 and GFX8 checks expect only the low dword to be ORed
; (s_or_b32 with the immediate 63) while the high dword (s7) is copied
; through unchanged; the EG checks expect a MOV for the high half and one
; OR_INT with literal 63 for the low half.
408 define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
409 ; GFX6-LABEL: scalar_or_inline_imm_i64:
411 ; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
412 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
413 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
414 ; GFX6-NEXT: s_mov_b32 s2, -1
415 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
416 ; GFX6-NEXT: s_or_b32 s4, s6, 63
417 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
418 ; GFX6-NEXT: v_mov_b32_e32 v1, s7
419 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
420 ; GFX6-NEXT: s_endpgm
422 ; GFX8-LABEL: scalar_or_inline_imm_i64:
424 ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c
425 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
426 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
427 ; GFX8-NEXT: s_mov_b32 s2, -1
428 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
429 ; GFX8-NEXT: s_or_b32 s4, s6, 63
430 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
431 ; GFX8-NEXT: v_mov_b32_e32 v1, s7
432 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
433 ; GFX8-NEXT: s_endpgm
435 ; EG-LABEL: scalar_or_inline_imm_i64:
437 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
438 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
441 ; EG-NEXT: ALU clause starting at 4:
442 ; EG-NEXT: MOV * T0.Y, KC0[5].X,
443 ; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
444 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
445 ; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45)
447 store i64 %or, ptr addrspace(1) %out
; scalar_or_inline_imm_multi_use_i64 - stores the i64 value %or to %out and
; additionally stores %b + 63 through a volatile store to an undef pointer.
; The GFX6 and GFX8 checks expect the OR to touch only the low dword
; (s_or_b32 with the immediate 63, high dword s3 copied through) and the
; add to be emitted as s_add_u32 with 63 followed by s_addc_u32 with 0.
451 define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
452 ; GFX6-LABEL: scalar_or_inline_imm_multi_use_i64:
454 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
455 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
456 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
457 ; GFX6-NEXT: s_mov_b32 s6, -1
458 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
459 ; GFX6-NEXT: s_mov_b32 s4, s0
460 ; GFX6-NEXT: s_or_b32 s0, s2, 63
461 ; GFX6-NEXT: s_mov_b32 s5, s1
462 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
463 ; GFX6-NEXT: v_mov_b32_e32 v1, s3
464 ; GFX6-NEXT: s_add_u32 s0, s8, 63
465 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
466 ; GFX6-NEXT: s_addc_u32 s1, s9, 0
467 ; GFX6-NEXT: s_waitcnt expcnt(0)
468 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
469 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
470 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
471 ; GFX6-NEXT: s_waitcnt vmcnt(0)
472 ; GFX6-NEXT: s_endpgm
474 ; GFX8-LABEL: scalar_or_inline_imm_multi_use_i64:
476 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
477 ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
478 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
479 ; GFX8-NEXT: s_mov_b32 s6, -1
480 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
481 ; GFX8-NEXT: s_mov_b32 s4, s0
482 ; GFX8-NEXT: s_or_b32 s0, s2, 63
483 ; GFX8-NEXT: s_mov_b32 s5, s1
484 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
485 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
486 ; GFX8-NEXT: s_add_u32 s0, s8, 63
487 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
488 ; GFX8-NEXT: s_addc_u32 s1, s9, 0
489 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
490 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
491 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
492 ; GFX8-NEXT: s_waitcnt vmcnt(0)
493 ; GFX8-NEXT: s_endpgm
495 ; EG-LABEL: scalar_or_inline_imm_multi_use_i64:
497 ; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[]
498 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0
499 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
500 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
503 ; EG-NEXT: ALU clause starting at 6:
504 ; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x,
505 ; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x,
506 ; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
507 ; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W,
508 ; EG-NEXT: MOV * T2.X, literal.x,
509 ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
510 ; EG-NEXT: MOV * T3.Y, KC0[3].X,
511 ; EG-NEXT: OR_INT T3.X, KC0[2].W, literal.x,
512 ; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
513 ; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45)
515 store i64 %or, ptr addrspace(1) %out
; Second use of the constant 63; the store is volatile, presumably so the
; add cannot be dropped as dead.
516 %foo = add i64 %b, 63
517 store volatile i64 %foo, ptr addrspace(1) undef
; scalar_or_neg_inline_imm_i64 - kernel taking an i64 argument %a (after an
; unnamed [8 x i32] argument) and storing the i64 value %or to %out.
; The GFX6 and GFX8 checks expect only the low dword of %a to be loaded
; (s_load_dword) and ORed with the immediate -8, while the high dword of
; the result is the constant -1 (v_mov_b32 v1, -1); the EG checks expect
; one OR_INT with literal -8 and a MOV of literal -1 into the high half.
521 define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
522 ; GFX6-LABEL: scalar_or_neg_inline_imm_i64:
524 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x13
525 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
526 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
527 ; GFX6-NEXT: s_mov_b32 s2, -1
528 ; GFX6-NEXT: v_mov_b32_e32 v1, -1
529 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
530 ; GFX6-NEXT: s_or_b32 s4, s6, -8
531 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
532 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
533 ; GFX6-NEXT: s_endpgm
535 ; GFX8-LABEL: scalar_or_neg_inline_imm_i64:
537 ; GFX8-NEXT: s_load_dword s6, s[4:5], 0x4c
538 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
539 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
540 ; GFX8-NEXT: s_mov_b32 s2, -1
541 ; GFX8-NEXT: v_mov_b32_e32 v1, -1
542 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX8-NEXT: s_or_b32 s4, s6, -8
544 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
545 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
546 ; GFX8-NEXT: s_endpgm
548 ; EG-LABEL: scalar_or_neg_inline_imm_i64:
550 ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
551 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
554 ; EG-NEXT: ALU clause starting at 4:
555 ; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x,
556 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
557 ; EG-NEXT: -8(nan), 2(2.802597e-45)
558 ; EG-NEXT: MOV * T0.Y, literal.x,
559 ; EG-NEXT: -1(nan), 0(0.000000e+00)
561 store i64 %or, ptr addrspace(1) %out
; vector_or_literal_i32 - ORs an i32 loaded from %a with the constant 65535
; (0xffff) and stores the result to %out. The %b argument is not referenced
; by the instructions shown here.
; The GFX6 and GFX8 checks expect a v_or_b32 with the literal 0xffff as its
; first operand; the EG checks expect one OR_INT against a literal slot
; holding 65535.
565 define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
566 ; GFX6-LABEL: vector_or_literal_i32:
568 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
569 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
570 ; GFX6-NEXT: s_mov_b32 s6, -1
571 ; GFX6-NEXT: s_mov_b32 s10, s6
572 ; GFX6-NEXT: s_mov_b32 s11, s7
573 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
574 ; GFX6-NEXT: s_mov_b32 s8, s2
575 ; GFX6-NEXT: s_mov_b32 s9, s3
576 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
577 ; GFX6-NEXT: s_mov_b32 s4, s0
578 ; GFX6-NEXT: s_mov_b32 s5, s1
579 ; GFX6-NEXT: s_waitcnt vmcnt(0)
580 ; GFX6-NEXT: v_or_b32_e32 v0, 0xffff, v0
581 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
582 ; GFX6-NEXT: s_endpgm
584 ; GFX8-LABEL: vector_or_literal_i32:
586 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
587 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
588 ; GFX8-NEXT: s_mov_b32 s6, -1
589 ; GFX8-NEXT: s_mov_b32 s10, s6
590 ; GFX8-NEXT: s_mov_b32 s11, s7
591 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
592 ; GFX8-NEXT: s_mov_b32 s8, s2
593 ; GFX8-NEXT: s_mov_b32 s9, s3
594 ; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
595 ; GFX8-NEXT: s_mov_b32 s4, s0
596 ; GFX8-NEXT: s_mov_b32 s5, s1
597 ; GFX8-NEXT: s_waitcnt vmcnt(0)
598 ; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0
599 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
600 ; GFX8-NEXT: s_endpgm
602 ; EG-LABEL: vector_or_literal_i32:
604 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
606 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
607 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
610 ; EG-NEXT: Fetch clause starting at 6:
611 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
612 ; EG-NEXT: ALU clause starting at 8:
613 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
614 ; EG-NEXT: ALU clause starting at 9:
615 ; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
616 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
617 ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
618 %loada = load i32, ptr addrspace(1) %a, align 4
619 %or = or i32 %loada, 65535
620 store i32 %or, ptr addrspace(1) %out, align 4
; vector_or_inline_immediate_i32 - ORs an i32 loaded from %a with the
; constant 4 and stores the result to %out. The %b argument is not
; referenced by the instructions shown here.
; The GFX6 and GFX8 checks expect a v_or_b32 with the immediate 4 as its
; first operand; the EG checks expect one OR_INT against a literal slot
; holding 4.
624 define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
625 ; GFX6-LABEL: vector_or_inline_immediate_i32:
627 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
628 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
629 ; GFX6-NEXT: s_mov_b32 s6, -1
630 ; GFX6-NEXT: s_mov_b32 s10, s6
631 ; GFX6-NEXT: s_mov_b32 s11, s7
632 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
633 ; GFX6-NEXT: s_mov_b32 s8, s2
634 ; GFX6-NEXT: s_mov_b32 s9, s3
635 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
636 ; GFX6-NEXT: s_mov_b32 s4, s0
637 ; GFX6-NEXT: s_mov_b32 s5, s1
638 ; GFX6-NEXT: s_waitcnt vmcnt(0)
639 ; GFX6-NEXT: v_or_b32_e32 v0, 4, v0
640 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
641 ; GFX6-NEXT: s_endpgm
643 ; GFX8-LABEL: vector_or_inline_immediate_i32:
645 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
646 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
647 ; GFX8-NEXT: s_mov_b32 s6, -1
648 ; GFX8-NEXT: s_mov_b32 s10, s6
649 ; GFX8-NEXT: s_mov_b32 s11, s7
650 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
651 ; GFX8-NEXT: s_mov_b32 s8, s2
652 ; GFX8-NEXT: s_mov_b32 s9, s3
653 ; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
654 ; GFX8-NEXT: s_mov_b32 s4, s0
655 ; GFX8-NEXT: s_mov_b32 s5, s1
656 ; GFX8-NEXT: s_waitcnt vmcnt(0)
657 ; GFX8-NEXT: v_or_b32_e32 v0, 4, v0
658 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
659 ; GFX8-NEXT: s_endpgm
661 ; EG-LABEL: vector_or_inline_immediate_i32:
663 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
665 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
666 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
669 ; EG-NEXT: Fetch clause starting at 6:
670 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
671 ; EG-NEXT: ALU clause starting at 8:
672 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
673 ; EG-NEXT: ALU clause starting at 9:
674 ; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
675 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
676 ; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
677 %loada = load i32, ptr addrspace(1) %a, align 4
678 %or = or i32 %loada, 4
679 store i32 %or, ptr addrspace(1) %out, align 4
; scalar_or_i64 - kernel taking two i64 arguments (%a, %b) and storing the
; i64 value %or to %out.
; The GFX6 and GFX8 checks expect a single s_or_b64 on the kernel-argument
; SGPR pairs (s[2:3], s[8:9]) followed by a dwordx2 store; the EG checks
; expect two OR_INT, one per 32-bit half.
683 define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
684 ; GFX6-LABEL: scalar_or_i64:
686 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
687 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
688 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
689 ; GFX6-NEXT: s_mov_b32 s6, -1
690 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
691 ; GFX6-NEXT: s_mov_b32 s4, s0
692 ; GFX6-NEXT: s_mov_b32 s5, s1
693 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
694 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
695 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
696 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
697 ; GFX6-NEXT: s_endpgm
699 ; GFX8-LABEL: scalar_or_i64:
701 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
702 ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
703 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
704 ; GFX8-NEXT: s_mov_b32 s6, -1
705 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
706 ; GFX8-NEXT: s_mov_b32 s4, s0
707 ; GFX8-NEXT: s_mov_b32 s5, s1
708 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
709 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
710 ; GFX8-NEXT: v_mov_b32_e32 v1, s1
711 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
712 ; GFX8-NEXT: s_endpgm
714 ; EG-LABEL: scalar_or_i64:
716 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
717 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
720 ; EG-NEXT: ALU clause starting at 4:
721 ; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z,
722 ; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y,
723 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
724 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
726 store i64 %or, ptr addrspace(1) %out
; vector_or_i64 - ORs two i64 values loaded from %a and %b and stores the
; result to %out.
; The GFX6 and GFX8 checks expect two buffer_load_dwordx2 followed by two
; v_or_b32 (one per 32-bit half) and a dwordx2 store; the EG checks expect
; two VTX_READ_64 and two OR_INT.
730 define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
731 ; GFX6-LABEL: vector_or_i64:
733 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
734 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
735 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
736 ; GFX6-NEXT: s_mov_b32 s6, -1
737 ; GFX6-NEXT: s_mov_b32 s10, s6
738 ; GFX6-NEXT: s_mov_b32 s11, s7
739 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
740 ; GFX6-NEXT: s_mov_b32 s12, s2
741 ; GFX6-NEXT: s_mov_b32 s13, s3
742 ; GFX6-NEXT: s_mov_b32 s14, s6
743 ; GFX6-NEXT: s_mov_b32 s15, s7
744 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
745 ; GFX6-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
746 ; GFX6-NEXT: s_mov_b32 s4, s0
747 ; GFX6-NEXT: s_mov_b32 s5, s1
748 ; GFX6-NEXT: s_waitcnt vmcnt(0)
749 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
750 ; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
751 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
752 ; GFX6-NEXT: s_endpgm
754 ; GFX8-LABEL: vector_or_i64:
756 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
757 ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
758 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
759 ; GFX8-NEXT: s_mov_b32 s6, -1
760 ; GFX8-NEXT: s_mov_b32 s10, s6
761 ; GFX8-NEXT: s_mov_b32 s11, s7
762 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
763 ; GFX8-NEXT: s_mov_b32 s12, s2
764 ; GFX8-NEXT: s_mov_b32 s13, s3
765 ; GFX8-NEXT: s_mov_b32 s14, s6
766 ; GFX8-NEXT: s_mov_b32 s15, s7
767 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
768 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0
769 ; GFX8-NEXT: s_mov_b32 s4, s0
770 ; GFX8-NEXT: s_mov_b32 s5, s1
771 ; GFX8-NEXT: s_waitcnt vmcnt(0)
772 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
773 ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
774 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
775 ; GFX8-NEXT: s_endpgm
777 ; EG-LABEL: vector_or_i64:
779 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
781 ; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[]
782 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
785 ; EG-NEXT: Fetch clause starting at 6:
786 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
787 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
788 ; EG-NEXT: ALU clause starting at 10:
789 ; EG-NEXT: MOV T0.X, KC0[2].Z,
790 ; EG-NEXT: MOV * T1.X, KC0[2].W,
791 ; EG-NEXT: ALU clause starting at 12:
792 ; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y,
793 ; EG-NEXT: OR_INT T0.X, T0.X, T1.X,
794 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
795 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
796 %loada = load i64, ptr addrspace(1) %a, align 8
797 %loadb = load i64, ptr addrspace(1) %b, align 8
798 %or = or i64 %loada, %loadb
799 store i64 %or, ptr addrspace(1) %out
; scalar_vector_or_i64 - ORs an i64 loaded from %a with the i64 kernel
; argument %b and stores the result to %out.
; The GFX6 and GFX8 checks expect two v_or_b32, each combining one SGPR
; half of %b (s12, s13) with one loaded VGPR half; the EG checks expect two
; OR_INT of the fetched value with KC0[2].W and KC0[3].X.
803 define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) {
804 ; GFX6-LABEL: scalar_vector_or_i64:
806 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
807 ; GFX6-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd
808 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
809 ; GFX6-NEXT: s_mov_b32 s6, -1
810 ; GFX6-NEXT: s_mov_b32 s10, s6
811 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
812 ; GFX6-NEXT: s_mov_b32 s8, s2
813 ; GFX6-NEXT: s_mov_b32 s9, s3
814 ; GFX6-NEXT: s_mov_b32 s11, s7
815 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
816 ; GFX6-NEXT: s_mov_b32 s4, s0
817 ; GFX6-NEXT: s_mov_b32 s5, s1
818 ; GFX6-NEXT: s_waitcnt vmcnt(0)
819 ; GFX6-NEXT: v_or_b32_e32 v0, s12, v0
820 ; GFX6-NEXT: v_or_b32_e32 v1, s13, v1
821 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
822 ; GFX6-NEXT: s_endpgm
824 ; GFX8-LABEL: scalar_vector_or_i64:
826 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
827 ; GFX8-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
828 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
829 ; GFX8-NEXT: s_mov_b32 s6, -1
830 ; GFX8-NEXT: s_mov_b32 s10, s6
831 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
832 ; GFX8-NEXT: s_mov_b32 s8, s2
833 ; GFX8-NEXT: s_mov_b32 s9, s3
834 ; GFX8-NEXT: s_mov_b32 s11, s7
835 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
836 ; GFX8-NEXT: s_mov_b32 s4, s0
837 ; GFX8-NEXT: s_mov_b32 s5, s1
838 ; GFX8-NEXT: s_waitcnt vmcnt(0)
839 ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0
840 ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1
841 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
842 ; GFX8-NEXT: s_endpgm
844 ; EG-LABEL: scalar_vector_or_i64:
846 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
848 ; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
849 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
852 ; EG-NEXT: Fetch clause starting at 6:
853 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
854 ; EG-NEXT: ALU clause starting at 8:
855 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
856 ; EG-NEXT: ALU clause starting at 9:
857 ; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X,
858 ; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W,
859 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
860 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
861 %loada = load i64, ptr addrspace(1) %a
862 %or = or i64 %loada, %b
863 store i64 %or, ptr addrspace(1) %out
; vector_or_i64_loadimm - ORs an i64 loaded from %a with the constant
; 22470723082367 (high dword 0x146f, low dword 0xdf77987f) and stores the
; result to %out. The %b argument is not referenced by the instructions
; shown here.
; The GFX6 and GFX8 checks expect two v_or_b32, each with its own 32-bit
; literal; the EG checks expect two OR_INT with literals 5231 and
; -545810305.
867 define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
868 ; GFX6-LABEL: vector_or_i64_loadimm:
870 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
871 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
872 ; GFX6-NEXT: s_mov_b32 s6, -1
873 ; GFX6-NEXT: s_mov_b32 s10, s6
874 ; GFX6-NEXT: s_mov_b32 s11, s7
875 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
876 ; GFX6-NEXT: s_mov_b32 s8, s2
877 ; GFX6-NEXT: s_mov_b32 s9, s3
878 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
879 ; GFX6-NEXT: s_mov_b32 s4, s0
880 ; GFX6-NEXT: s_mov_b32 s5, s1
881 ; GFX6-NEXT: s_waitcnt vmcnt(0)
882 ; GFX6-NEXT: v_or_b32_e32 v1, 0x146f, v1
883 ; GFX6-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
884 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
885 ; GFX6-NEXT: s_endpgm
887 ; GFX8-LABEL: vector_or_i64_loadimm:
889 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
890 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
891 ; GFX8-NEXT: s_mov_b32 s6, -1
892 ; GFX8-NEXT: s_mov_b32 s10, s6
893 ; GFX8-NEXT: s_mov_b32 s11, s7
894 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
895 ; GFX8-NEXT: s_mov_b32 s8, s2
896 ; GFX8-NEXT: s_mov_b32 s9, s3
897 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
898 ; GFX8-NEXT: s_mov_b32 s4, s0
899 ; GFX8-NEXT: s_mov_b32 s5, s1
900 ; GFX8-NEXT: s_waitcnt vmcnt(0)
901 ; GFX8-NEXT: v_or_b32_e32 v1, 0x146f, v1
902 ; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
903 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
904 ; GFX8-NEXT: s_endpgm
906 ; EG-LABEL: vector_or_i64_loadimm:
908 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
910 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
911 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
914 ; EG-NEXT: Fetch clause starting at 6:
915 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
916 ; EG-NEXT: ALU clause starting at 8:
917 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
918 ; EG-NEXT: ALU clause starting at 9:
919 ; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x,
920 ; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00)
921 ; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
922 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
923 ; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45)
924 %loada = load i64, ptr addrspace(1) %a, align 8
925 %or = or i64 %loada, 22470723082367
926 store i64 %or, ptr addrspace(1) %out
930 ; FIXME: The or 0 should really be removed.
; NOTE(review): the checks below OR only the low dword with 8 and pass the
; high dword through untouched, so no `or 0` is emitted any more; the FIXME
; above looks stale -- confirm before deleting it.
;
; Loads an i64 from %a, ORs it with the small positive constant 8 and stores
; the result to %out. %b is unused.
931 define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
932 ; GFX6-LABEL: vector_or_i64_imm:
934 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
935 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
936 ; GFX6-NEXT: s_mov_b32 s6, -1
937 ; GFX6-NEXT: s_mov_b32 s10, s6
938 ; GFX6-NEXT: s_mov_b32 s11, s7
939 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
940 ; GFX6-NEXT: s_mov_b32 s8, s2
941 ; GFX6-NEXT: s_mov_b32 s9, s3
942 ; GFX6-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
943 ; GFX6-NEXT: s_mov_b32 s4, s0
944 ; GFX6-NEXT: s_mov_b32 s5, s1
945 ; GFX6-NEXT: s_waitcnt vmcnt(0)
946 ; GFX6-NEXT: v_or_b32_e32 v0, 8, v0
947 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
948 ; GFX6-NEXT: s_endpgm
950 ; GFX8-LABEL: vector_or_i64_imm:
952 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
953 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
954 ; GFX8-NEXT: s_mov_b32 s6, -1
955 ; GFX8-NEXT: s_mov_b32 s10, s6
956 ; GFX8-NEXT: s_mov_b32 s11, s7
957 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
958 ; GFX8-NEXT: s_mov_b32 s8, s2
959 ; GFX8-NEXT: s_mov_b32 s9, s3
960 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
961 ; GFX8-NEXT: s_mov_b32 s4, s0
962 ; GFX8-NEXT: s_mov_b32 s5, s1
963 ; GFX8-NEXT: s_waitcnt vmcnt(0)
964 ; GFX8-NEXT: v_or_b32_e32 v0, 8, v0
965 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
966 ; GFX8-NEXT: s_endpgm
968 ; EG-LABEL: vector_or_i64_imm:
970 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
972 ; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
973 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
976 ; EG-NEXT: Fetch clause starting at 6:
977 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
978 ; EG-NEXT: ALU clause starting at 8:
979 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
980 ; EG-NEXT: ALU clause starting at 9:
981 ; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
982 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
983 ; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
984 %loada = load i64, ptr addrspace(1) %a, align 8
985 %or = or i64 %loada, 8
986 store i64 %or, ptr addrspace(1) %out
; Loads an i64 from %a, ORs it with -8 and stores the result to %out.
; %b is unused.
; The high dword of -8 is all ones, so the high half of the result is the
; constant -1 regardless of the loaded value. The checks below accordingly
; expect only a 32-bit load of the low dword, a single OR with -8, and a
; move of -1 into the high result register.
990 define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
991 ; GFX6-LABEL: vector_or_i64_neg_inline_imm:
993 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
994 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
995 ; GFX6-NEXT: s_mov_b32 s6, -1
996 ; GFX6-NEXT: s_mov_b32 s10, s6
997 ; GFX6-NEXT: s_mov_b32 s11, s7
998 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
999 ; GFX6-NEXT: s_mov_b32 s8, s2
1000 ; GFX6-NEXT: s_mov_b32 s9, s3
1001 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
1002 ; GFX6-NEXT: s_mov_b32 s4, s0
1003 ; GFX6-NEXT: s_mov_b32 s5, s1
1004 ; GFX6-NEXT: v_mov_b32_e32 v1, -1
1005 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1006 ; GFX6-NEXT: v_or_b32_e32 v0, -8, v0
1007 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1008 ; GFX6-NEXT: s_endpgm
1010 ; GFX8-LABEL: vector_or_i64_neg_inline_imm:
1012 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1013 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
1014 ; GFX8-NEXT: s_mov_b32 s6, -1
1015 ; GFX8-NEXT: s_mov_b32 s10, s6
1016 ; GFX8-NEXT: s_mov_b32 s11, s7
1017 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1018 ; GFX8-NEXT: s_mov_b32 s8, s2
1019 ; GFX8-NEXT: s_mov_b32 s9, s3
1020 ; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
1021 ; GFX8-NEXT: s_mov_b32 s4, s0
1022 ; GFX8-NEXT: s_mov_b32 s5, s1
1023 ; GFX8-NEXT: v_mov_b32_e32 v1, -1
1024 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1025 ; GFX8-NEXT: v_or_b32_e32 v0, -8, v0
1026 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1027 ; GFX8-NEXT: s_endpgm
1029 ; EG-LABEL: vector_or_i64_neg_inline_imm:
1031 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1033 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
1034 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1037 ; EG-NEXT: Fetch clause starting at 6:
1038 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1039 ; EG-NEXT: ALU clause starting at 8:
1040 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1041 ; EG-NEXT: ALU clause starting at 9:
1042 ; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
1043 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1044 ; EG-NEXT: -8(nan), 2(2.802597e-45)
1045 ; EG-NEXT: MOV * T0.Y, literal.x,
1046 ; EG-NEXT: -1(nan), 0(0.000000e+00)
1047 %loada = load i64, ptr addrspace(1) %a, align 8
1048 %or = or i64 %loada, -8
1049 store i64 %or, ptr addrspace(1) %out
; Loads an i64 from %a, ORs it with -200 and stores the result to %out.
; %b is unused.
; The low dword of -200 is 0xffffff38 and its high dword is all ones, so the
; checks below expect a 32-bit load of the low dword, one OR with the
; 0xffffff38 literal, and the constant -1 written as the high result dword.
1053 define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
1054 ; GFX6-LABEL: vector_or_i64_neg_literal:
1056 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1057 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1058 ; GFX6-NEXT: s_mov_b32 s6, -1
1059 ; GFX6-NEXT: s_mov_b32 s10, s6
1060 ; GFX6-NEXT: s_mov_b32 s11, s7
1061 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1062 ; GFX6-NEXT: s_mov_b32 s8, s2
1063 ; GFX6-NEXT: s_mov_b32 s9, s3
1064 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
1065 ; GFX6-NEXT: s_mov_b32 s4, s0
1066 ; GFX6-NEXT: s_mov_b32 s5, s1
1067 ; GFX6-NEXT: v_mov_b32_e32 v1, -1
1068 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1069 ; GFX6-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
1070 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1071 ; GFX6-NEXT: s_endpgm
1073 ; GFX8-LABEL: vector_or_i64_neg_literal:
1075 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1076 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
1077 ; GFX8-NEXT: s_mov_b32 s6, -1
1078 ; GFX8-NEXT: s_mov_b32 s10, s6
1079 ; GFX8-NEXT: s_mov_b32 s11, s7
1080 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1081 ; GFX8-NEXT: s_mov_b32 s8, s2
1082 ; GFX8-NEXT: s_mov_b32 s9, s3
1083 ; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
1084 ; GFX8-NEXT: s_mov_b32 s4, s0
1085 ; GFX8-NEXT: s_mov_b32 s5, s1
1086 ; GFX8-NEXT: v_mov_b32_e32 v1, -1
1087 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1088 ; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
1089 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1090 ; GFX8-NEXT: s_endpgm
1092 ; EG-LABEL: vector_or_i64_neg_literal:
1094 ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
1096 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
1097 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1100 ; EG-NEXT: Fetch clause starting at 6:
1101 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1102 ; EG-NEXT: ALU clause starting at 8:
1103 ; EG-NEXT: MOV * T0.X, KC0[2].Z,
1104 ; EG-NEXT: ALU clause starting at 9:
1105 ; EG-NEXT: OR_INT T0.X, T0.X, literal.x,
1106 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
1107 ; EG-NEXT: -200(nan), 2(2.802597e-45)
1108 ; EG-NEXT: MOV * T0.Y, literal.x,
1109 ; EG-NEXT: -1(nan), 0(0.000000e+00)
1110 %loada = load i64, ptr addrspace(1) %a, align 8
1111 %or = or i64 %loada, -200
1112 store i64 %or, ptr addrspace(1) %out
; ORs the two i64 kernel arguments %a and %b, truncates the result to i32 and
; stores it to %out. The unnamed [8 x i32] parameters sit between the other
; arguments in the signature.
; Only the low 32 bits of the OR are observable after the trunc; the checks
; below expect each operand to be read with a single dword load and combined
; with one 32-bit OR.
; Note: despite its name, %add holds the result of an `or`, not an add.
1116 define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
1117 ; GFX6-LABEL: trunc_i64_or_to_i32:
1119 ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x13
1120 ; GFX6-NEXT: s_load_dword s7, s[4:5], 0x1d
1121 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
1122 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
1123 ; GFX6-NEXT: s_mov_b32 s2, -1
1124 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1125 ; GFX6-NEXT: s_or_b32 s4, s7, s6
1126 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
1127 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
1128 ; GFX6-NEXT: s_endpgm
1130 ; GFX8-LABEL: trunc_i64_or_to_i32:
1132 ; GFX8-NEXT: s_load_dword s6, s[4:5], 0x4c
1133 ; GFX8-NEXT: s_load_dword s7, s[4:5], 0x74
1134 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1135 ; GFX8-NEXT: s_mov_b32 s3, 0xf000
1136 ; GFX8-NEXT: s_mov_b32 s2, -1
1137 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1138 ; GFX8-NEXT: s_or_b32 s4, s7, s6
1139 ; GFX8-NEXT: v_mov_b32_e32 v0, s4
1140 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
1141 ; GFX8-NEXT: s_endpgm
1143 ; EG-LABEL: trunc_i64_or_to_i32:
1145 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1146 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1149 ; EG-NEXT: ALU clause starting at 4:
1150 ; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
1151 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1152 ; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W,
1153 %add = or i64 %b, %a
1154 %trunc = trunc i64 %add to i32
1155 store i32 %trunc, ptr addrspace(1) %out, align 8
; Loads one float from each of %in0 and %in1, compares each against 0.0 with
; an ordered >= (fcmp oge), ORs the two i1 results, zero-extends the OR to i32
; and stores it to %out.
; The checks below expect the two compares plus the OR to be emitted as a
; float max of the two inputs followed by a single compare against 0.
1159 define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
1160 ; GFX6-LABEL: or_i1:
1162 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
1163 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
1164 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1165 ; GFX6-NEXT: s_mov_b32 s6, -1
1166 ; GFX6-NEXT: s_mov_b32 s10, s6
1167 ; GFX6-NEXT: s_mov_b32 s11, s7
1168 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1169 ; GFX6-NEXT: s_mov_b32 s12, s2
1170 ; GFX6-NEXT: s_mov_b32 s13, s3
1171 ; GFX6-NEXT: s_mov_b32 s14, s6
1172 ; GFX6-NEXT: s_mov_b32 s15, s7
1173 ; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
1174 ; GFX6-NEXT: buffer_load_dword v1, off, s[12:15], 0
1175 ; GFX6-NEXT: s_mov_b32 s4, s0
1176 ; GFX6-NEXT: s_mov_b32 s5, s1
1177 ; GFX6-NEXT: s_waitcnt vmcnt(1)
1178 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
1179 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1180 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
1181 ; GFX6-NEXT: v_max_f32_e32 v0, v1, v0
1182 ; GFX6-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
1183 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1184 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
1185 ; GFX6-NEXT: s_endpgm
1187 ; GFX8-LABEL: or_i1:
1189 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1190 ; GFX8-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
1191 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
1192 ; GFX8-NEXT: s_mov_b32 s6, -1
1193 ; GFX8-NEXT: s_mov_b32 s10, s6
1194 ; GFX8-NEXT: s_mov_b32 s11, s7
1195 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1196 ; GFX8-NEXT: s_mov_b32 s12, s2
1197 ; GFX8-NEXT: s_mov_b32 s13, s3
1198 ; GFX8-NEXT: s_mov_b32 s14, s6
1199 ; GFX8-NEXT: s_mov_b32 s15, s7
1200 ; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
1201 ; GFX8-NEXT: buffer_load_dword v1, off, s[12:15], 0
1202 ; GFX8-NEXT: s_mov_b32 s4, s0
1203 ; GFX8-NEXT: s_mov_b32 s5, s1
1204 ; GFX8-NEXT: s_waitcnt vmcnt(1)
1205 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0
1206 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1
1208 ; GFX8-NEXT: v_max_f32_e32 v0, v1, v0
1209 ; GFX8-NEXT: v_cmp_le_f32_e32 vcc, 0, v0
1210 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
1211 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
1212 ; GFX8-NEXT: s_endpgm
1216 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
1218 ; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[]
1219 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1222 ; EG-NEXT: Fetch clause starting at 6:
1223 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
1224 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1225 ; EG-NEXT: ALU clause starting at 10:
1226 ; EG-NEXT: MOV T0.X, KC0[2].Z,
1227 ; EG-NEXT: MOV * T1.X, KC0[2].W,
1228 ; EG-NEXT: ALU clause starting at 12:
1229 ; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X,
1230 ; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0,
1231 ; EG-NEXT: AND_INT T0.X, PV.W, 1,
1232 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1233 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1234 %a = load float, ptr addrspace(1) %in0
1235 %b = load float, ptr addrspace(1) %in1
1236 %acmp = fcmp oge float %a, 0.000000e+00
1237 %bcmp = fcmp oge float %b, 0.000000e+00
1238 %or = or i1 %acmp, %bcmp
1239 %result = zext i1 %or to i32
1240 store i32 %result, ptr addrspace(1) %out
; ORs two i1 values produced by integer equality compares of the i32 kernel
; arguments (%a == %b, %c == %d) and stores the resulting i1 directly to %out.
; On GFX6/GFX8 the checks below expect scalar compares, a 64-bit scalar OR of
; the two select masks, and a byte store of the 0/1 result. On EG the i1
; store is expected as a MEM_RAT MSKOR preceded by shift/mask ALU ops.
1244 define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
1245 ; GFX6-LABEL: s_or_i1:
1247 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
1248 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
1249 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
1250 ; GFX6-NEXT: s_mov_b32 s6, -1
1251 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1252 ; GFX6-NEXT: s_cmp_eq_u32 s0, s1
1253 ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
1254 ; GFX6-NEXT: s_cmp_eq_u32 s2, s3
1255 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
1256 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1257 ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1258 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0
1259 ; GFX6-NEXT: s_endpgm
1261 ; GFX8-LABEL: s_or_i1:
1263 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
1264 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
1265 ; GFX8-NEXT: s_mov_b32 s7, 0xf000
1266 ; GFX8-NEXT: s_mov_b32 s6, -1
1267 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1268 ; GFX8-NEXT: s_cmp_eq_u32 s0, s1
1269 ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
1270 ; GFX8-NEXT: s_cmp_eq_u32 s2, s3
1271 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0
1272 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1273 ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1274 ; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0
1275 ; GFX8-NEXT: s_endpgm
1277 ; EG-LABEL: s_or_i1:
1279 ; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[]
1280 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
1283 ; EG-NEXT: ALU clause starting at 4:
1284 ; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y,
1285 ; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W,
1286 ; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x,
1287 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
1288 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1289 ; EG-NEXT: AND_INT T0.W, PS, 1,
1290 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
1291 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1292 ; EG-NEXT: LSHL T0.X, PV.W, PS,
1293 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
1294 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
1295 ; EG-NEXT: MOV T0.Y, 0.0,
1296 ; EG-NEXT: MOV * T0.Z, 0.0,
1297 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1298 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1299 %cmp0 = icmp eq i32 %a, %b
1300 %cmp1 = icmp eq i32 %c, %d
1301 %or = or i1 %cmp0, %cmp1
1302 store i1 %or, ptr addrspace(1) %out