1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s
3 ; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM %s
5 ; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll,
6 ; but with all 64-bit tests and all tests involving loads dropped.
9 ; a) x & ((1 << nbits) - 1)
10 ; b) x & ~(-1 << nbits)
11 ; c) x & (-1 >> (32 - nbits))
12 ; d) (x << (32 - nbits)) >> (32 - nbits)
15 ; ---------------------------------------------------------------------------- ;
17 ; ---------------------------------------------------------------------------- ;
; Pattern a, base form: builds mask = (1 << %numlowbits) - 1, ANDs it with %val
; and stores the result through %out.
; The EG (cypress) and CM (cayman) checks both expect a single BFE_UINT for the
; masking, plus an LSHR by the literal 2 whose result (T0.X) is the second
; operand of the store.
; NOTE(review): presumably KC0[2].Y/.Z/.W carry %val/%numlowbits/%out -- confirm
; against the kernel argument layout.
19 define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
20 ; EG-LABEL: bzhi32_a0:
22 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
23 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
26 ; EG-NEXT: ALU clause starting at 4:
27 ; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
28 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
29 ; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
31 ; CM-LABEL: bzhi32_a0:
33 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
34 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
37 ; CM-NEXT: ALU clause starting at 4:
38 ; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
39 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
40 ; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
; IR under test: mask = (1 << n) - 1, applied as "mask & val".
41 %onebit = shl i32 1, %numlowbits
42 %mask = add nsw i32 %onebit, -1
43 %masked = and i32 %mask, %val
44 store i32 %masked, i32 addrspace(1)* %out
; Pattern a with the bit count passed as "i8 zeroext" and widened to i32 with
; zext before it is used as the shift amount.
; Expected output on both prefixes: a byte is fetched with VTX_READ_8, passed
; through BFE_INT with the literal 8, and the BFE_INT result (PV.W) is the last
; operand of the BFE_UINT that performs the masking.
48 define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
49 ; EG-LABEL: bzhi32_a1_indexzext:
51 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
53 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
54 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
57 ; EG-NEXT: Fetch clause starting at 6:
58 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
59 ; EG-NEXT: ALU clause starting at 8:
60 ; EG-NEXT: MOV * T0.X, 0.0,
61 ; EG-NEXT: ALU clause starting at 9:
62 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
63 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
64 ; EG-NEXT: BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
65 ; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
66 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
68 ; CM-LABEL: bzhi32_a1_indexzext:
70 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
72 ; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
73 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
76 ; CM-NEXT: Fetch clause starting at 6:
77 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
78 ; CM-NEXT: ALU clause starting at 8:
79 ; CM-NEXT: MOV * T0.X, 0.0,
80 ; CM-NEXT: ALU clause starting at 9:
81 ; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
82 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
83 ; CM-NEXT: BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
84 ; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
85 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: mask = (1 << zext(n)) - 1, applied as "mask & val".
86 %conv = zext i8 %numlowbits to i32
87 %onebit = shl i32 1, %conv
88 %mask = add nsw i32 %onebit, -1
89 %masked = and i32 %mask, %val
90 store i32 %masked, i32 addrspace(1)* %out
; Pattern a with the operands of the final 'and' swapped (%val first, %mask
; second), to check that matching does not depend on operand order.
; The checks expect the same LSHR + BFE_UINT instructions as for @bzhi32_a0.
94 define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
95 ; EG-LABEL: bzhi32_a4_commutative:
97 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
98 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
101 ; EG-NEXT: ALU clause starting at 4:
102 ; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
103 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
104 ; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
106 ; CM-LABEL: bzhi32_a4_commutative:
108 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
109 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
112 ; CM-NEXT: ALU clause starting at 4:
113 ; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
114 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
115 ; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
; IR under test: mask = (1 << n) - 1, applied as "val & mask".
116 %onebit = shl i32 1, %numlowbits
117 %mask = add nsw i32 %onebit, -1
118 %masked = and i32 %val, %mask ; swapped order
119 store i32 %masked, i32 addrspace(1)* %out
123 ; ---------------------------------------------------------------------------- ;
125 ; ---------------------------------------------------------------------------- ;
; Pattern b, base form: builds mask = ~(-1 << %numlowbits) (shl of -1 followed by
; xor with -1), ANDs it with %val and stores the result through %out.
; The EG (cypress) and CM (cayman) checks both expect a single BFE_UINT for the
; masking, plus an LSHR by the literal 2 whose result (T0.X) is the second
; operand of the store.
127 define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
128 ; EG-LABEL: bzhi32_b0:
130 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
131 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
134 ; EG-NEXT: ALU clause starting at 4:
135 ; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
136 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
137 ; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
139 ; CM-LABEL: bzhi32_b0:
141 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
142 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
145 ; CM-NEXT: ALU clause starting at 4:
146 ; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
147 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
148 ; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
; IR under test: mask = (-1 << n) ^ -1, applied as "mask & val".
149 %notmask = shl i32 -1, %numlowbits
150 %mask = xor i32 %notmask, -1
151 %masked = and i32 %mask, %val
152 store i32 %masked, i32 addrspace(1)* %out
; Pattern b with the bit count passed as "i8 zeroext" and widened to i32 with
; zext before it is used as the shift amount.
; Expected output on both prefixes: a byte is fetched with VTX_READ_8, passed
; through BFE_INT with the literal 8, and the BFE_INT result (PV.W) is the last
; operand of the BFE_UINT that performs the masking.
156 define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
157 ; EG-LABEL: bzhi32_b1_indexzext:
159 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
161 ; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
162 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
165 ; EG-NEXT: Fetch clause starting at 6:
166 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
167 ; EG-NEXT: ALU clause starting at 8:
168 ; EG-NEXT: MOV * T0.X, 0.0,
169 ; EG-NEXT: ALU clause starting at 9:
170 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
171 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
172 ; EG-NEXT: BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W,
173 ; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
174 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
176 ; CM-LABEL: bzhi32_b1_indexzext:
178 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
180 ; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
181 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
184 ; CM-NEXT: Fetch clause starting at 6:
185 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
186 ; CM-NEXT: ALU clause starting at 8:
187 ; CM-NEXT: MOV * T0.X, 0.0,
188 ; CM-NEXT: ALU clause starting at 9:
189 ; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x,
190 ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
191 ; CM-NEXT: BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W,
192 ; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
193 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: mask = (-1 << zext(n)) ^ -1, applied as "mask & val".
194 %conv = zext i8 %numlowbits to i32
195 %notmask = shl i32 -1, %conv
196 %mask = xor i32 %notmask, -1
197 %masked = and i32 %mask, %val
198 store i32 %masked, i32 addrspace(1)* %out
; Pattern b with the operands of the final 'and' swapped (%val first, %mask
; second), to check that matching does not depend on operand order.
; The checks expect the same LSHR + BFE_UINT instructions as for @bzhi32_b0.
202 define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
203 ; EG-LABEL: bzhi32_b4_commutative:
205 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
206 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
209 ; EG-NEXT: ALU clause starting at 4:
210 ; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
211 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
212 ; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
214 ; CM-LABEL: bzhi32_b4_commutative:
216 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
217 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
220 ; CM-NEXT: ALU clause starting at 4:
221 ; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
222 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
223 ; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
; IR under test: mask = (-1 << n) ^ -1, applied as "val & mask".
224 %notmask = shl i32 -1, %numlowbits
225 %mask = xor i32 %notmask, -1
226 %masked = and i32 %val, %mask ; swapped order
227 store i32 %masked, i32 addrspace(1)* %out
231 ; ---------------------------------------------------------------------------- ;
233 ; ---------------------------------------------------------------------------- ;
; Pattern c, base form: computes %numhighbits = 32 - %numlowbits, builds
; mask = -1 >> %numhighbits with a logical shift (lshr), ANDs it with %val and
; stores the result through %out.
; The EG (cypress) and CM (cayman) checks both expect a single BFE_UINT for the
; masking, plus an LSHR by the literal 2 whose result (T0.X) is the second
; operand of the store.
235 define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
236 ; EG-LABEL: bzhi32_c0:
238 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
239 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
242 ; EG-NEXT: ALU clause starting at 4:
243 ; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
244 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
245 ; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
247 ; CM-LABEL: bzhi32_c0:
249 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
250 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
253 ; CM-NEXT: ALU clause starting at 4:
254 ; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
255 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
256 ; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
; IR under test: mask = -1 >>u (32 - n), applied as "mask & val".
257 %numhighbits = sub i32 32, %numlowbits
258 %mask = lshr i32 -1, %numhighbits
259 %masked = and i32 %mask, %val
260 store i32 %masked, i32 addrspace(1)* %out
; Pattern c with the bit count passed as a plain i8 (no zeroext attribute here):
; the subtraction 32 - %numlowbits is done in i8 and only the difference is
; zero-extended to i32 for use as the shift amount.
; Unlike @bzhi32_c0, the checks below contain no BFE_UINT. Both prefixes expect
; the unfolded sequence: SUB_INT from the literal 32, AND_INT with the literal
; 255, LSHR of the literal -1 by that amount, then AND_INT with KC0[2].Y.
264 define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
265 ; EG-LABEL: bzhi32_c1_indexzext:
267 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
269 ; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
270 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
273 ; EG-NEXT: Fetch clause starting at 6:
274 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
275 ; EG-NEXT: ALU clause starting at 8:
276 ; EG-NEXT: MOV * T0.X, 0.0,
277 ; EG-NEXT: ALU clause starting at 9:
278 ; EG-NEXT: SUB_INT * T0.W, literal.x, T0.X,
279 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
280 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
281 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
282 ; EG-NEXT: LSHR * T0.W, literal.x, PV.W,
283 ; EG-NEXT: -1(nan), 0(0.000000e+00)
284 ; EG-NEXT: AND_INT T0.X, PV.W, KC0[2].Y,
285 ; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
286 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
288 ; CM-LABEL: bzhi32_c1_indexzext:
290 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
292 ; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
293 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
296 ; CM-NEXT: Fetch clause starting at 6:
297 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
298 ; CM-NEXT: ALU clause starting at 8:
299 ; CM-NEXT: MOV * T0.X, 0.0,
300 ; CM-NEXT: ALU clause starting at 9:
301 ; CM-NEXT: SUB_INT * T0.W, literal.x, T0.X,
302 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
303 ; CM-NEXT: AND_INT * T0.W, PV.W, literal.x,
304 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
305 ; CM-NEXT: LSHR * T0.W, literal.x, PV.W,
306 ; CM-NEXT: -1(nan), 0(0.000000e+00)
307 ; CM-NEXT: AND_INT * T0.X, PV.W, KC0[2].Y,
308 ; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
309 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: mask = -1 >>u zext(i8 (32 - n)), applied as "mask & val".
310 %numhighbits = sub i8 32, %numlowbits
311 %sh_prom = zext i8 %numhighbits to i32
312 %mask = lshr i32 -1, %sh_prom
313 %masked = and i32 %mask, %val
314 store i32 %masked, i32 addrspace(1)* %out
; Pattern c with the operands of the final 'and' swapped (%val first, %mask
; second), to check that matching does not depend on operand order.
; The checks expect the same LSHR + BFE_UINT instructions as for @bzhi32_c0.
318 define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
319 ; EG-LABEL: bzhi32_c4_commutative:
321 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
322 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
325 ; EG-NEXT: ALU clause starting at 4:
326 ; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
327 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
328 ; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
330 ; CM-LABEL: bzhi32_c4_commutative:
332 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
333 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
336 ; CM-NEXT: ALU clause starting at 4:
337 ; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
338 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
339 ; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
; IR under test: mask = -1 >>u (32 - n), applied as "val & mask".
340 %numhighbits = sub i32 32, %numlowbits
341 %mask = lshr i32 -1, %numhighbits
342 %masked = and i32 %val, %mask ; swapped order
343 store i32 %masked, i32 addrspace(1)* %out
347 ; ---------------------------------------------------------------------------- ;
349 ; ---------------------------------------------------------------------------- ;
; Pattern d, base form: no explicit mask. %val is shifted left by
; %numhighbits = 32 - %numlowbits and then logically shifted right (lshr) by the
; same amount; the result is stored through %out.
; The EG (cypress) and CM (cayman) checks both expect a single BFE_UINT for the
; shift pair, plus an LSHR by the literal 2 whose result (T0.X) is the second
; operand of the store.
351 define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
352 ; EG-LABEL: bzhi32_d0:
354 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
355 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
358 ; EG-NEXT: ALU clause starting at 4:
359 ; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
360 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
361 ; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
363 ; CM-LABEL: bzhi32_d0:
365 ; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
366 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
369 ; CM-NEXT: ALU clause starting at 4:
370 ; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x,
371 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
372 ; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z,
; IR under test: masked = (val << (32 - n)) >>u (32 - n).
373 %numhighbits = sub i32 32, %numlowbits
374 %highbitscleared = shl i32 %val, %numhighbits
375 %masked = lshr i32 %highbitscleared, %numhighbits
376 store i32 %masked, i32 addrspace(1)* %out
; Pattern d with the bit count passed as a plain i8 (no zeroext attribute here):
; the subtraction 32 - %numlowbits is done in i8 and only the difference is
; zero-extended to i32 for use as the shift amount of both shifts.
; Unlike @bzhi32_d0, the checks below contain no BFE_UINT. Both prefixes expect
; the unfolded sequence: SUB_INT from the literal 32, AND_INT with the literal
; 255, LSHL of KC0[2].Y by that amount, then LSHR by the same amount (T0.W).
380 define amdgpu_kernel void @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) {
381 ; EG-LABEL: bzhi32_d1_indexzext:
383 ; EG-NEXT: ALU 0, @8, KC0[], KC1[]
385 ; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
386 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
389 ; EG-NEXT: Fetch clause starting at 6:
390 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
391 ; EG-NEXT: ALU clause starting at 8:
392 ; EG-NEXT: MOV * T0.X, 0.0,
393 ; EG-NEXT: ALU clause starting at 9:
394 ; EG-NEXT: SUB_INT * T0.W, literal.x, T0.X,
395 ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
396 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
397 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
398 ; EG-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
399 ; EG-NEXT: LSHR T0.X, PV.W, T0.W,
400 ; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
401 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
403 ; CM-LABEL: bzhi32_d1_indexzext:
405 ; CM-NEXT: ALU 0, @8, KC0[], KC1[]
407 ; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
408 ; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
411 ; CM-NEXT: Fetch clause starting at 6:
412 ; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3
413 ; CM-NEXT: ALU clause starting at 8:
414 ; CM-NEXT: MOV * T0.X, 0.0,
415 ; CM-NEXT: ALU clause starting at 9:
416 ; CM-NEXT: SUB_INT * T0.W, literal.x, T0.X,
417 ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
418 ; CM-NEXT: AND_INT * T0.W, PV.W, literal.x,
419 ; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
420 ; CM-NEXT: LSHL * T1.W, KC0[2].Y, PV.W,
421 ; CM-NEXT: LSHR * T0.X, PV.W, T0.W,
422 ; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x,
423 ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: masked = (val << s) >>u s, with s = zext(i8 (32 - n)).
424 %numhighbits = sub i8 32, %numlowbits
425 %sh_prom = zext i8 %numhighbits to i32
426 %highbitscleared = shl i32 %val, %sh_prom
427 %masked = lshr i32 %highbitscleared, %sh_prom
428 store i32 %masked, i32 addrspace(1)* %out