1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; ## Support i8x4 instructions
3 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \
4 ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
5 ; RUN: | FileCheck -allow-deprecated-dag-overlap %s
7 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \
8 ; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
9 ; RUN: | %ptxas-verify -arch=sm_90 \
12 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
; Returns the constant vector <-1, 2, 3, -4>.
; The expected PTX materializes it as a single 32-bit immediate: the bytes
; 0xFF, 0x02, 0x03, 0xFC with element 0 in the lowest byte form 0xFC0302FF,
; which is -66911489 when read as a signed 32-bit value.
14 define <4 x i8> @test_ret_const() #0 {
15 ; CHECK-LABEL: test_ret_const(
17 ; CHECK-NEXT: .reg .b32 %r<2>;
19 ; CHECK-NEXT: // %bb.0:
20 ; CHECK-NEXT: mov.b32 %r1, -66911489;
21 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
23 ret <4 x i8> <i8 -1, i8 2, i8 3, i8 -4>
; Extracts element 0 of a <4 x i8> argument.
; The expected PTX loads the vector as one 32-bit value and pulls the lane out
; with an unsigned bit-field extract at bit offset 0, width 8.
26 define i8 @test_extract_0(<4 x i8> %a) #0 {
27 ; CHECK-LABEL: test_extract_0(
29 ; CHECK-NEXT: .reg .b32 %r<3>;
31 ; CHECK-NEXT: // %bb.0:
32 ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_0_param_0];
33 ; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
34 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
36 %e = extractelement <4 x i8> %a, i32 0
; Extracts element 1 of a <4 x i8> argument.
; The expected PTX uses an unsigned bit-field extract at bit offset 8, width 8.
40 define i8 @test_extract_1(<4 x i8> %a) #0 {
41 ; CHECK-LABEL: test_extract_1(
43 ; CHECK-NEXT: .reg .b32 %r<3>;
45 ; CHECK-NEXT: // %bb.0:
46 ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_1_param_0];
47 ; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8;
48 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
50 %e = extractelement <4 x i8> %a, i32 1
; Extracts element 2 of a <4 x i8> argument.
; The expected PTX uses an unsigned bit-field extract at bit offset 16, width 8.
54 define i8 @test_extract_2(<4 x i8> %a) #0 {
55 ; CHECK-LABEL: test_extract_2(
57 ; CHECK-NEXT: .reg .b32 %r<3>;
59 ; CHECK-NEXT: // %bb.0:
60 ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_2_param_0];
61 ; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8;
62 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
64 %e = extractelement <4 x i8> %a, i32 2
; Extracts element 3 of a <4 x i8> argument.
; The expected PTX uses an unsigned bit-field extract at bit offset 24, width 8.
68 define i8 @test_extract_3(<4 x i8> %a) #0 {
69 ; CHECK-LABEL: test_extract_3(
71 ; CHECK-NEXT: .reg .b32 %r<3>;
73 ; CHECK-NEXT: // %bb.0:
74 ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_3_param_0];
75 ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
76 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
78 %e = extractelement <4 x i8> %a, i32 3
; Extracts an element of a <4 x i8> argument at a run-time i64 index.
; The expected PTX narrows the index to 32 bits, shifts it left by 3 to turn
; the lane number into a bit offset, and feeds that offset to an 8-bit-wide
; unsigned bit-field extract.
82 define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 {
83 ; CHECK-LABEL: test_extract_i(
85 ; CHECK-NEXT: .reg .b32 %r<5>;
86 ; CHECK-NEXT: .reg .b64 %rd<2>;
88 ; CHECK-NEXT: // %bb.0:
89 ; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
90 ; CHECK-NEXT: ld.param.u32 %r1, [test_extract_i_param_0];
91 ; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
92 ; CHECK-NEXT: shl.b32 %r3, %r2, 3;
93 ; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8;
94 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
96 %e = extractelement <4 x i8> %a, i64 %idx
; Lane-wise add of two <4 x i8> arguments.
; The expected PTX handles each lane separately: both operands' bytes are
; extracted with bfe.s32 (offsets 0, 8, 16, 24), converted to 16-bit registers,
; added with add.s16, widened back to 32 bits, and the four results are merged
; into one 32-bit register with bfi.b32.
100 define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
101 ; CHECK-LABEL: test_add(
103 ; CHECK-NEXT: .reg .b16 %rs<13>;
104 ; CHECK-NEXT: .reg .b32 %r<19>;
106 ; CHECK-NEXT: // %bb.0:
107 ; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1];
108 ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0];
109 ; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
110 ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
111 ; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
112 ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
113 ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1;
114 ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
115 ; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
116 ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
117 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
118 ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
119 ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4;
120 ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
121 ; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
122 ; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
123 ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
124 ; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
125 ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
126 ; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7;
127 ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
128 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
129 ; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
130 ; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
131 ; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
132 ; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
133 ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10;
134 ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
135 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
136 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
138 %r = add <4 x i8> %a, %b
; Lane-wise add with the constant vector <1, 2, 3, 4> as the LEFT operand.
; The expected PTX extracts each byte of %a, adds the matching immediate
; (1, 2, 3, 4) with add.s16, and reassembles the lanes with bfi.b32. The
; expected instruction sequence matches test_add_imm_1, where the constant is
; the right operand.
142 define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
143 ; CHECK-LABEL: test_add_imm_0(
145 ; CHECK-NEXT: .reg .b16 %rs<9>;
146 ; CHECK-NEXT: .reg .b32 %r<14>;
148 ; CHECK-NEXT: // %bb.0:
149 ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
150 ; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
151 ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
152 ; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
153 ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
154 ; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
155 ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
156 ; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
157 ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
158 ; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
159 ; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
160 ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
161 ; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
162 ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
163 ; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
164 ; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
165 ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
166 ; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
167 ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
168 ; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
169 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
171 %r = add <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
; Lane-wise add with the constant vector <1, 2, 3, 4> as the RIGHT operand.
; The expected PTX extracts each byte of %a, adds the matching immediate
; (1, 2, 3, 4) with add.s16, and reassembles the lanes with bfi.b32. The
; expected instruction sequence matches test_add_imm_0, where the constant is
; the left operand.
175 define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
176 ; CHECK-LABEL: test_add_imm_1(
178 ; CHECK-NEXT: .reg .b16 %rs<9>;
179 ; CHECK-NEXT: .reg .b32 %r<14>;
181 ; CHECK-NEXT: // %bb.0:
182 ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
183 ; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
184 ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
185 ; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
186 ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
187 ; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
188 ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
189 ; CHECK-NEXT: add.s16 %rs4, %rs3, 2;
190 ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
191 ; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8;
192 ; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
193 ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
194 ; CHECK-NEXT: add.s16 %rs6, %rs5, 3;
195 ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
196 ; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8;
197 ; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8;
198 ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
199 ; CHECK-NEXT: add.s16 %rs8, %rs7, 4;
200 ; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
201 ; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8;
202 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12;
204 %r = add <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
; Lane-wise subtract (%a - %b) of two <4 x i8> arguments.
; The expected PTX extracts each byte pair with bfe.s32, converts to 16-bit
; registers, subtracts with sub.s16 (the %a lane is the first source operand),
; and merges the four results into one 32-bit register with bfi.b32.
208 define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
209 ; CHECK-LABEL: test_sub(
211 ; CHECK-NEXT: .reg .b16 %rs<13>;
212 ; CHECK-NEXT: .reg .b32 %r<19>;
214 ; CHECK-NEXT: // %bb.0:
215 ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1];
216 ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0];
217 ; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
218 ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
219 ; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
220 ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
221 ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1;
222 ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
223 ; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
224 ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
225 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
226 ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
227 ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4;
228 ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
229 ; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
230 ; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
231 ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
232 ; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
233 ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
234 ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7;
235 ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
236 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
237 ; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
238 ; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
239 ; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
240 ; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
241 ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10;
242 ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
243 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
244 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
246 %r = sub <4 x i8> %a, %b
; Signed maximum written as icmp sgt + select on <4 x i8>.
; The expected PTX sign-extends each lane of both operands with bfe.s32,
; compares them with setp.gt.s32, picks the winner with selp.b32, and merges
; the four selected lanes with bfi.b32.
250 define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
251 ; CHECK-LABEL: test_smax(
253 ; CHECK-NEXT: .reg .pred %p<5>;
254 ; CHECK-NEXT: .reg .b32 %r<19>;
256 ; CHECK-NEXT: // %bb.0:
257 ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1];
258 ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0];
259 ; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
260 ; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
261 ; CHECK-NEXT: setp.gt.s32 %p1, %r3, %r4;
262 ; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
263 ; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
264 ; CHECK-NEXT: setp.gt.s32 %p2, %r5, %r6;
265 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
266 ; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
267 ; CHECK-NEXT: setp.gt.s32 %p3, %r7, %r8;
268 ; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
269 ; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
270 ; CHECK-NEXT: setp.gt.s32 %p4, %r9, %r10;
271 ; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
272 ; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
273 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
274 ; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
275 ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
276 ; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
277 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
278 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
280 %cmp = icmp sgt <4 x i8> %a, %b
281 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
; Unsigned maximum written as icmp ugt + select on <4 x i8>.
; The expected PTX extracts each lane with bfe.s32, compares with the unsigned
; predicate setp.hi.u32, picks the winner with selp.b32, and merges the four
; selected lanes with bfi.b32.
285 define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
286 ; CHECK-LABEL: test_umax(
288 ; CHECK-NEXT: .reg .pred %p<5>;
289 ; CHECK-NEXT: .reg .b32 %r<19>;
291 ; CHECK-NEXT: // %bb.0:
292 ; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1];
293 ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0];
294 ; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
295 ; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
296 ; CHECK-NEXT: setp.hi.u32 %p1, %r3, %r4;
297 ; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
298 ; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
299 ; CHECK-NEXT: setp.hi.u32 %p2, %r5, %r6;
300 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
301 ; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
302 ; CHECK-NEXT: setp.hi.u32 %p3, %r7, %r8;
303 ; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
304 ; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
305 ; CHECK-NEXT: setp.hi.u32 %p4, %r9, %r10;
306 ; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
307 ; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
308 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
309 ; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
310 ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
311 ; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
312 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
313 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
315 %cmp = icmp ugt <4 x i8> %a, %b
316 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
; Signed minimum written as icmp sle + select on <4 x i8>.
; The expected PTX sign-extends each lane of both operands with bfe.s32,
; compares them with setp.le.s32, picks the winner with selp.b32, and merges
; the four selected lanes with bfi.b32.
320 define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
321 ; CHECK-LABEL: test_smin(
323 ; CHECK-NEXT: .reg .pred %p<5>;
324 ; CHECK-NEXT: .reg .b32 %r<19>;
326 ; CHECK-NEXT: // %bb.0:
327 ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1];
328 ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0];
329 ; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
330 ; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
331 ; CHECK-NEXT: setp.le.s32 %p1, %r3, %r4;
332 ; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
333 ; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
334 ; CHECK-NEXT: setp.le.s32 %p2, %r5, %r6;
335 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
336 ; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
337 ; CHECK-NEXT: setp.le.s32 %p3, %r7, %r8;
338 ; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
339 ; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
340 ; CHECK-NEXT: setp.le.s32 %p4, %r9, %r10;
341 ; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
342 ; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
343 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
344 ; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
345 ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
346 ; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
347 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
348 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
350 %cmp = icmp sle <4 x i8> %a, %b
351 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
; Unsigned minimum written as icmp ule + select on <4 x i8>.
; The expected PTX extracts each lane with bfe.s32, compares with the unsigned
; predicate setp.ls.u32, picks the winner with selp.b32, and merges the four
; selected lanes with bfi.b32.
355 define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
356 ; CHECK-LABEL: test_umin(
358 ; CHECK-NEXT: .reg .pred %p<5>;
359 ; CHECK-NEXT: .reg .b32 %r<19>;
361 ; CHECK-NEXT: // %bb.0:
362 ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1];
363 ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0];
364 ; CHECK-NEXT: bfe.s32 %r3, %r1, 24, 8;
365 ; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
366 ; CHECK-NEXT: setp.ls.u32 %p1, %r3, %r4;
367 ; CHECK-NEXT: bfe.s32 %r5, %r1, 16, 8;
368 ; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
369 ; CHECK-NEXT: setp.ls.u32 %p2, %r5, %r6;
370 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
371 ; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
372 ; CHECK-NEXT: setp.ls.u32 %p3, %r7, %r8;
373 ; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8;
374 ; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
375 ; CHECK-NEXT: setp.ls.u32 %p4, %r9, %r10;
376 ; CHECK-NEXT: selp.b32 %r11, %r9, %r10, %p4;
377 ; CHECK-NEXT: selp.b32 %r12, %r7, %r8, %p3;
378 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
379 ; CHECK-NEXT: selp.b32 %r14, %r5, %r6, %p2;
380 ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
381 ; CHECK-NEXT: selp.b32 %r16, %r3, %r4, %p1;
382 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
383 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
385 %cmp = icmp ule <4 x i8> %a, %b
386 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
; Lane-wise equality compare of %a and %b, used to select between %a and %c.
; The expected PTX extracts each lane of %a and %b, compares with setp.eq.u32,
; then extracts the matching lane of %c and chooses between the %a lane and the
; %c lane with selp.b32; the four results are merged with bfi.b32.
390 define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
391 ; CHECK-LABEL: test_eq(
393 ; CHECK-NEXT: .reg .pred %p<5>;
394 ; CHECK-NEXT: .reg .b32 %r<24>;
396 ; CHECK-NEXT: // %bb.0:
397 ; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2];
398 ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1];
399 ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0];
400 ; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
401 ; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
402 ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4;
403 ; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
404 ; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
405 ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6;
406 ; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
407 ; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8;
408 ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8;
409 ; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
410 ; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8;
411 ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10;
412 ; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
413 ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
414 ; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8;
415 ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
416 ; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
417 ; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8;
418 ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
419 ; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
420 ; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8;
421 ; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1;
422 ; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8;
423 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22;
425 %cmp = icmp eq <4 x i8> %a, %b
426 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c
; Lane-wise inequality compare of %a and %b, used to select between %a and %c.
; The expected PTX extracts each lane of %a and %b, compares with setp.ne.u32,
; then extracts the matching lane of %c and chooses between the %a lane and the
; %c lane with selp.b32; the four results are merged with bfi.b32.
430 define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
431 ; CHECK-LABEL: test_ne(
433 ; CHECK-NEXT: .reg .pred %p<5>;
434 ; CHECK-NEXT: .reg .b32 %r<24>;
436 ; CHECK-NEXT: // %bb.0:
437 ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2];
438 ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1];
439 ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0];
440 ; CHECK-NEXT: bfe.s32 %r4, %r2, 24, 8;
441 ; CHECK-NEXT: bfe.s32 %r5, %r1, 24, 8;
442 ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4;
443 ; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8;
444 ; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8;
445 ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6;
446 ; CHECK-NEXT: bfe.s32 %r8, %r2, 8, 8;
447 ; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8;
448 ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8;
449 ; CHECK-NEXT: bfe.s32 %r10, %r2, 0, 8;
450 ; CHECK-NEXT: bfe.s32 %r11, %r1, 0, 8;
451 ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10;
452 ; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
453 ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4;
454 ; CHECK-NEXT: bfe.s32 %r14, %r3, 8, 8;
455 ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3;
456 ; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8;
457 ; CHECK-NEXT: bfe.s32 %r17, %r3, 16, 8;
458 ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2;
459 ; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8;
460 ; CHECK-NEXT: bfe.s32 %r20, %r3, 24, 8;
461 ; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1;
462 ; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8;
463 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22;
465 %cmp = icmp ne <4 x i8> %a, %b
466 %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c
; Lane-wise multiply of two <4 x i8> arguments.
; The expected PTX extracts each byte pair with bfe.s32, converts to 16-bit
; registers, multiplies with mul.lo.s16, widens back to 32 bits, and merges the
; four results into one 32-bit register with bfi.b32.
470 define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
471 ; CHECK-LABEL: test_mul(
473 ; CHECK-NEXT: .reg .b16 %rs<13>;
474 ; CHECK-NEXT: .reg .b32 %r<19>;
476 ; CHECK-NEXT: // %bb.0:
477 ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1];
478 ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0];
479 ; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
480 ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
481 ; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
482 ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4;
483 ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1;
484 ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
485 ; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
486 ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6;
487 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
488 ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
489 ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4;
490 ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
491 ; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
492 ; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
493 ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
494 ; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
495 ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11;
496 ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7;
497 ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
498 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
499 ; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
500 ; CHECK-NEXT: cvt.u16.u32 %rs10, %r14;
501 ; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
502 ; CHECK-NEXT: cvt.u16.u32 %rs11, %r15;
503 ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10;
504 ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
505 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
506 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17;
508 %r = mul <4 x i8> %a, %b
; Bitwise OR of two <4 x i8> arguments.
; The expected PTX needs no per-lane work: both vectors are loaded as 32-bit
; values and combined with a single or.b32.
512 define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 {
513 ; CHECK-LABEL: test_or(
515 ; CHECK-NEXT: .reg .b32 %r<7>;
517 ; CHECK-NEXT: // %bb.0:
518 ; CHECK-NEXT: ld.param.u32 %r3, [test_or_param_1];
519 ; CHECK-NEXT: ld.param.u32 %r4, [test_or_param_0];
520 ; CHECK-NEXT: or.b32 %r5, %r4, %r3;
521 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
523 %r = or <4 x i8> %a, %b
; Bitwise OR of two vectors built inside the function rather than passed in.
; %ins.0 is a zero vector with %a in lane 0; %ins.1 additionally has 5 in
; lane 1. The expected PTX builds both vectors with bfi.b32 inserts and then
; ORs the two 32-bit registers with a single or.b32.
527 define <4 x i8> @test_or_computed(i8 %a) {
528 ; CHECK-LABEL: test_or_computed(
530 ; CHECK-NEXT: .reg .b16 %rs<2>;
531 ; CHECK-NEXT: .reg .b32 %r<9>;
533 ; CHECK-NEXT: // %bb.0:
534 ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0];
535 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
536 ; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
537 ; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
538 ; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
539 ; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
540 ; CHECK-NEXT: or.b32 %r8, %r6, %r4;
541 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
543 %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
544 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
545 %r = or <4 x i8> %ins.1, %ins.0
; Bitwise OR with the constant vector <1, 2, 3, 4> as the LEFT operand.
; The expected PTX folds the constant into one 32-bit immediate, 67305985
; (0x04030201, element 0 in the lowest byte), and emits a single or.b32.
549 define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 {
550 ; CHECK-LABEL: test_or_imm_0(
552 ; CHECK-NEXT: .reg .b32 %r<3>;
554 ; CHECK-NEXT: // %bb.0:
555 ; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0];
556 ; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
557 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
559 %r = or <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
; Bitwise OR with the constant vector <1, 2, 3, 4> as the RIGHT operand.
; The expected PTX folds the constant into one 32-bit immediate, 67305985
; (0x04030201, element 0 in the lowest byte), and emits a single or.b32.
563 define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 {
564 ; CHECK-LABEL: test_or_imm_1(
566 ; CHECK-NEXT: .reg .b32 %r<3>;
568 ; CHECK-NEXT: // %bb.0:
569 ; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0];
570 ; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
571 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
573 %r = or <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
; Bitwise XOR of two <4 x i8> arguments.
; The expected PTX needs no per-lane work: both vectors are loaded as 32-bit
; values and combined with a single xor.b32.
577 define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 {
578 ; CHECK-LABEL: test_xor(
580 ; CHECK-NEXT: .reg .b32 %r<7>;
582 ; CHECK-NEXT: // %bb.0:
583 ; CHECK-NEXT: ld.param.u32 %r3, [test_xor_param_1];
584 ; CHECK-NEXT: ld.param.u32 %r4, [test_xor_param_0];
585 ; CHECK-NEXT: xor.b32 %r5, %r4, %r3;
586 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
588 %r = xor <4 x i8> %a, %b
; Bitwise XOR of two vectors built inside the function rather than passed in.
; %ins.0 is a zero vector with %a in lane 0; %ins.1 additionally has 5 in
; lane 1. The expected PTX builds both vectors with bfi.b32 inserts and then
; XORs the two 32-bit registers with a single xor.b32.
592 define <4 x i8> @test_xor_computed(i8 %a) {
593 ; CHECK-LABEL: test_xor_computed(
595 ; CHECK-NEXT: .reg .b16 %rs<2>;
596 ; CHECK-NEXT: .reg .b32 %r<9>;
598 ; CHECK-NEXT: // %bb.0:
599 ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0];
600 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
601 ; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
602 ; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
603 ; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
604 ; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
605 ; CHECK-NEXT: xor.b32 %r8, %r6, %r4;
606 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
608 %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
609 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
610 %r = xor <4 x i8> %ins.1, %ins.0
; Bitwise XOR with the constant vector <1, 2, 3, 4> as the LEFT operand.
; The expected PTX folds the constant into one 32-bit immediate, 67305985
; (0x04030201, element 0 in the lowest byte), and emits a single xor.b32.
614 define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 {
615 ; CHECK-LABEL: test_xor_imm_0(
617 ; CHECK-NEXT: .reg .b32 %r<3>;
619 ; CHECK-NEXT: // %bb.0:
620 ; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0];
621 ; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
622 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
624 %r = xor <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
; Bitwise XOR with the constant vector <1, 2, 3, 4> as the RIGHT operand.
; The expected PTX folds the constant into one 32-bit immediate, 67305985
; (0x04030201, element 0 in the lowest byte), and emits a single xor.b32.
628 define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 {
629 ; CHECK-LABEL: test_xor_imm_1(
631 ; CHECK-NEXT: .reg .b32 %r<3>;
633 ; CHECK-NEXT: // %bb.0:
634 ; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0];
635 ; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
636 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
638 %r = xor <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
; Bitwise AND of two <4 x i8> arguments.
; The expected PTX needs no per-lane work: both vectors are loaded as 32-bit
; values and combined with a single and.b32.
642 define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 {
643 ; CHECK-LABEL: test_and(
645 ; CHECK-NEXT: .reg .b32 %r<7>;
647 ; CHECK-NEXT: // %bb.0:
648 ; CHECK-NEXT: ld.param.u32 %r3, [test_and_param_1];
649 ; CHECK-NEXT: ld.param.u32 %r4, [test_and_param_0];
650 ; CHECK-NEXT: and.b32 %r5, %r4, %r3;
651 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5;
653 %r = and <4 x i8> %a, %b
; Bitwise AND of two vectors built inside the function rather than passed in.
; %ins.0 is a zero vector with %a in lane 0; %ins.1 additionally has 5 in
; lane 1. The expected PTX builds both vectors with bfi.b32 inserts and then
; ANDs the two 32-bit registers with a single and.b32.
657 define <4 x i8> @test_and_computed(i8 %a) {
658 ; CHECK-LABEL: test_and_computed(
660 ; CHECK-NEXT: .reg .b16 %rs<2>;
661 ; CHECK-NEXT: .reg .b32 %r<9>;
663 ; CHECK-NEXT: // %bb.0:
664 ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0];
665 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
666 ; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8;
667 ; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8;
668 ; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8;
669 ; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8;
670 ; CHECK-NEXT: and.b32 %r8, %r6, %r4;
671 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8;
673 %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0
674 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
675 %r = and <4 x i8> %ins.1, %ins.0
; Bitwise AND with the constant vector <1, 2, 3, 4> as the LEFT operand.
; The expected PTX folds the constant into one 32-bit immediate, 67305985
; (0x04030201, element 0 in the lowest byte), and emits a single and.b32.
679 define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 {
680 ; CHECK-LABEL: test_and_imm_0(
682 ; CHECK-NEXT: .reg .b32 %r<3>;
684 ; CHECK-NEXT: // %bb.0:
685 ; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0];
686 ; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
687 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
689 %r = and <4 x i8> <i8 1, i8 2, i8 3, i8 4>, %a
; Bitwise AND with the constant vector <1, 2, 3, 4> as the RIGHT operand.
; The expected PTX folds the constant into one 32-bit immediate, 67305985
; (0x04030201, element 0 in the lowest byte), and emits a single and.b32.
693 define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 {
694 ; CHECK-LABEL: test_and_imm_1(
696 ; CHECK-NEXT: .reg .b32 %r<3>;
698 ; CHECK-NEXT: // %bb.0:
699 ; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0];
700 ; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
701 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
703 %r = and <4 x i8> %a, <i8 1, i8 2, i8 3, i8 4>
; Loads a vector from %a and stores it to %b (store aligned to 16).
; The expected PTX is a single 32-bit load followed by a single 32-bit store.
; NOTE(review): despite the "v2i8" name, the IR below loads and stores
; <4 x i8>, the same body as test_ldst_v4i8, so <2 x i8> memory access does not
; appear to be exercised here. Confirm whether <2 x i8> was intended; switching
; the type would require regenerating the assertions.
707 define void @test_ldst_v2i8(ptr %a, ptr %b) {
708 ; CHECK-LABEL: test_ldst_v2i8(
710 ; CHECK-NEXT: .reg .b32 %r<2>;
711 ; CHECK-NEXT: .reg .b64 %rd<3>;
713 ; CHECK-NEXT: // %bb.0:
714 ; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2i8_param_1];
715 ; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2i8_param_0];
716 ; CHECK-NEXT: ld.u32 %r1, [%rd1];
717 ; CHECK-NEXT: st.u32 [%rd2], %r1;
719 %t1 = load <4 x i8>, ptr %a
720 store <4 x i8> %t1, ptr %b, align 16
; Loads a <3 x i8> from %a and stores it to %b (store aligned to 16).
; The expected PTX reads the value with one 32-bit load, then writes it back in
; two pieces: a 16-bit store at offset 0, and an 8-bit store at offset 2 of the
; byte extracted from bit offset 16.
724 define void @test_ldst_v3i8(ptr %a, ptr %b) {
725 ; CHECK-LABEL: test_ldst_v3i8(
727 ; CHECK-NEXT: .reg .b32 %r<4>;
728 ; CHECK-NEXT: .reg .b64 %rd<3>;
730 ; CHECK-NEXT: // %bb.0:
731 ; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3i8_param_1];
732 ; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
733 ; CHECK-NEXT: ld.u32 %r1, [%rd1];
734 ; CHECK-NEXT: st.u16 [%rd2], %r1;
735 ; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
736 ; CHECK-NEXT: st.u8 [%rd2+2], %r3;
738 %t1 = load <3 x i8>, ptr %a
739 store <3 x i8> %t1, ptr %b, align 16
; Loads a <4 x i8> from %a and stores it to %b (store aligned to 16).
; The expected PTX moves the whole vector with one 32-bit load and one 32-bit
; store.
743 define void @test_ldst_v4i8(ptr %a, ptr %b) {
744 ; CHECK-LABEL: test_ldst_v4i8(
746 ; CHECK-NEXT: .reg .b32 %r<2>;
747 ; CHECK-NEXT: .reg .b64 %rd<3>;
749 ; CHECK-NEXT: // %bb.0:
750 ; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_param_1];
751 ; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_param_0];
752 ; CHECK-NEXT: ld.u32 %r1, [%rd1];
753 ; CHECK-NEXT: st.u32 [%rd2], %r1;
755 %t1 = load <4 x i8>, ptr %a
756 store <4 x i8> %t1, ptr %b, align 16
; Loads and stores a <4 x i8> with both accesses marked align 1.
; With only byte alignment available, the expected PTX does not use a 32-bit
; access: it issues four ld.u8 loads (offsets 0..3) and four st.u8 stores.
760 define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) {
761 ; CHECK-LABEL: test_ldst_v4i8_unaligned(
763 ; CHECK-NEXT: .reg .b32 %r<5>;
764 ; CHECK-NEXT: .reg .b64 %rd<3>;
766 ; CHECK-NEXT: // %bb.0:
767 ; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1];
768 ; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0];
769 ; CHECK-NEXT: ld.u8 %r1, [%rd1];
770 ; CHECK-NEXT: ld.u8 %r2, [%rd1+1];
771 ; CHECK-NEXT: ld.u8 %r3, [%rd1+2];
772 ; CHECK-NEXT: ld.u8 %r4, [%rd1+3];
773 ; CHECK-NEXT: st.u8 [%rd2+3], %r4;
774 ; CHECK-NEXT: st.u8 [%rd2+2], %r3;
775 ; CHECK-NEXT: st.u8 [%rd2+1], %r2;
776 ; CHECK-NEXT: st.u8 [%rd2], %r1;
778 %t1 = load <4 x i8>, ptr %a, align 1
779 store <4 x i8> %t1, ptr %b, align 1
; Loads an <8 x i8> from %a and stores it to %b (store aligned to 16).
; The expected PTX splits the vector into two 32-bit halves: two ld.u32 loads
; at offsets 0 and 4, and two st.u32 stores at the same offsets.
784 define void @test_ldst_v8i8(ptr %a, ptr %b) {
785 ; CHECK-LABEL: test_ldst_v8i8(
787 ; CHECK-NEXT: .reg .b32 %r<3>;
788 ; CHECK-NEXT: .reg .b64 %rd<3>;
790 ; CHECK-NEXT: // %bb.0:
791 ; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
792 ; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
793 ; CHECK-NEXT: ld.u32 %r1, [%rd1];
794 ; CHECK-NEXT: ld.u32 %r2, [%rd1+4];
795 ; CHECK-NEXT: st.u32 [%rd2+4], %r2;
796 ; CHECK-NEXT: st.u32 [%rd2], %r1;
798 %t1 = load <8 x i8>, ptr %a
799 store <8 x i8> %t1, ptr %b, align 16
; External callee taking and returning <4 x i8>; it is the call target of
; test_call, test_call_flipped and test_tailcall_flipped.
803 declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0
; Calls test_callee(%a, %b) and returns its result.
; The expected PTX passes each <4 x i8> argument as a 4-byte, 4-aligned .param
; written with one st.param.b32 (%a into param0, %b into param1), and reads the
; result back from retval0 with one ld.param.b32.
805 define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
806 ; CHECK-LABEL: test_call(
808 ; CHECK-NEXT: .reg .b32 %r<5>;
810 ; CHECK-NEXT: // %bb.0:
811 ; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1];
812 ; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0];
813 ; CHECK-NEXT: { // callseq 0, 0
814 ; CHECK-NEXT: .reg .b32 temp_param_reg;
815 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
816 ; CHECK-NEXT: st.param.b32 [param0+0], %r1;
817 ; CHECK-NEXT: .param .align 4 .b8 param1[4];
818 ; CHECK-NEXT: st.param.b32 [param1+0], %r2;
819 ; CHECK-NEXT: .param .align 4 .b8 retval0[4];
820 ; CHECK-NEXT: call.uni (retval0),
821 ; CHECK-NEXT: test_callee,
823 ; CHECK-NEXT: param0,
826 ; CHECK-NEXT: ld.param.b32 %r3, [retval0+0];
827 ; CHECK-NEXT: } // callseq 0
828 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
830 %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b)
; Calls test_callee with the arguments swapped: test_callee(%b, %a).
; The expected PTX therefore stores %r2 (loaded from param_1, i.e. %b) into
; param0 and %r1 (loaded from param_0, i.e. %a) into param1.
834 define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
835 ; CHECK-LABEL: test_call_flipped(
837 ; CHECK-NEXT: .reg .b32 %r<5>;
839 ; CHECK-NEXT: // %bb.0:
840 ; CHECK-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1];
841 ; CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0];
842 ; CHECK-NEXT: { // callseq 1, 0
843 ; CHECK-NEXT: .reg .b32 temp_param_reg;
844 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
845 ; CHECK-NEXT: st.param.b32 [param0+0], %r2;
846 ; CHECK-NEXT: .param .align 4 .b8 param1[4];
847 ; CHECK-NEXT: st.param.b32 [param1+0], %r1;
848 ; CHECK-NEXT: .param .align 4 .b8 retval0[4];
849 ; CHECK-NEXT: call.uni (retval0),
850 ; CHECK-NEXT: test_callee,
852 ; CHECK-NEXT: param0,
855 ; CHECK-NEXT: ld.param.b32 %r3, [retval0+0];
856 ; CHECK-NEXT: } // callseq 1
857 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
859 %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
; Same as test_call_flipped, but the IR call is marked `tail`.
; The expected PTX is still an ordinary call sequence (param setup, call.uni,
; result read back from retval0), matching the non-tail variant apart from the
; callseq number.
863 define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
864 ; CHECK-LABEL: test_tailcall_flipped(
866 ; CHECK-NEXT: .reg .b32 %r<5>;
868 ; CHECK-NEXT: // %bb.0:
869 ; CHECK-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1];
870 ; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0];
871 ; CHECK-NEXT: { // callseq 2, 0
872 ; CHECK-NEXT: .reg .b32 temp_param_reg;
873 ; CHECK-NEXT: .param .align 4 .b8 param0[4];
874 ; CHECK-NEXT: st.param.b32 [param0+0], %r2;
875 ; CHECK-NEXT: .param .align 4 .b8 param1[4];
876 ; CHECK-NEXT: st.param.b32 [param1+0], %r1;
877 ; CHECK-NEXT: .param .align 4 .b8 retval0[4];
878 ; CHECK-NEXT: call.uni (retval0),
879 ; CHECK-NEXT: test_callee,
881 ; CHECK-NEXT: param0,
884 ; CHECK-NEXT: ld.param.b32 %r3, [retval0+0];
885 ; CHECK-NEXT: } // callseq 2
886 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
888 %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a)
; Whole-vector select driven by a scalar i1 condition.
; The expected PTX loads the condition as a byte, masks it with 1, turns it
; into a predicate with setp.eq.b16, and picks between the two packed 32-bit
; vectors with a single selp.b32 (no per-lane work).
892 define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
893 ; CHECK-LABEL: test_select(
895 ; CHECK-NEXT: .reg .pred %p<2>;
896 ; CHECK-NEXT: .reg .b16 %rs<3>;
897 ; CHECK-NEXT: .reg .b32 %r<4>;
899 ; CHECK-NEXT: // %bb.0:
900 ; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
901 ; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
902 ; CHECK-NEXT: setp.eq.b16 %p1, %rs2, 1;
903 ; CHECK-NEXT: ld.param.u32 %r2, [test_select_param_1];
904 ; CHECK-NEXT: ld.param.u32 %r1, [test_select_param_0];
905 ; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1;
906 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
908 %r = select i1 %c, <4 x i8> %a, <4 x i8> %b
; Per-lane select between %a and %b, with the condition computed from a
; lane-wise `icmp ne` of %c and %d.
; The expected PTX extracts each lane of %c and %d and compares them with
; setp.ne.u32, then extracts the matching lanes of %a and %b, selects with
; selp.b32, and merges the four results with bfi.b32.
912 define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) #0 {
913 ; CHECK-LABEL: test_select_cc(
915 ; CHECK-NEXT: .reg .pred %p<5>;
916 ; CHECK-NEXT: .reg .b32 %r<29>;
918 ; CHECK-NEXT: // %bb.0:
919 ; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3];
920 ; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2];
921 ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1];
922 ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0];
923 ; CHECK-NEXT: bfe.s32 %r5, %r4, 24, 8;
924 ; CHECK-NEXT: bfe.s32 %r6, %r3, 24, 8;
925 ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5;
926 ; CHECK-NEXT: bfe.s32 %r7, %r4, 16, 8;
927 ; CHECK-NEXT: bfe.s32 %r8, %r3, 16, 8;
928 ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7;
929 ; CHECK-NEXT: bfe.s32 %r9, %r4, 8, 8;
930 ; CHECK-NEXT: bfe.s32 %r10, %r3, 8, 8;
931 ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9;
932 ; CHECK-NEXT: bfe.s32 %r11, %r4, 0, 8;
933 ; CHECK-NEXT: bfe.s32 %r12, %r3, 0, 8;
934 ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11;
935 ; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8;
936 ; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8;
937 ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4;
938 ; CHECK-NEXT: bfe.s32 %r16, %r2, 8, 8;
939 ; CHECK-NEXT: bfe.s32 %r17, %r1, 8, 8;
940 ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3;
941 ; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8;
942 ; CHECK-NEXT: bfe.s32 %r20, %r2, 16, 8;
943 ; CHECK-NEXT: bfe.s32 %r21, %r1, 16, 8;
944 ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2;
945 ; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8;
946 ; CHECK-NEXT: bfe.s32 %r24, %r2, 24, 8;
947 ; CHECK-NEXT: bfe.s32 %r25, %r1, 24, 8;
948 ; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1;
949 ; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8;
950 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27;
952 %cc = icmp ne <4 x i8> %c, %d
953 %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
; Mixed-width select: the condition comes from a lane-wise `icmp ne` on
; <4 x i8> (%c, %d), while the selected values are <4 x i32> (%a, %b).
; The expected PTX loads %a and %b with v4 parameter loads, extracts and
; compares each i8 lane with setp.ne.u32, selects each 32-bit element with
; selp.b32, and returns the result with one v4 store.
; The parameter list continues on the line after the assertion block.
957 define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
958 ; CHECK-LABEL: test_select_cc_i32_i8(
960 ; CHECK-NEXT: .reg .pred %p<5>;
961 ; CHECK-NEXT: .reg .b32 %r<23>;
963 ; CHECK-NEXT: // %bb.0:
964 ; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
965 ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
966 ; CHECK-NEXT: ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
967 ; CHECK-NEXT: ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
968 ; CHECK-NEXT: bfe.s32 %r11, %r10, 0, 8;
969 ; CHECK-NEXT: bfe.s32 %r12, %r9, 0, 8;
970 ; CHECK-NEXT: setp.ne.u32 %p1, %r12, %r11;
971 ; CHECK-NEXT: bfe.s32 %r13, %r10, 8, 8;
972 ; CHECK-NEXT: bfe.s32 %r14, %r9, 8, 8;
973 ; CHECK-NEXT: setp.ne.u32 %p2, %r14, %r13;
974 ; CHECK-NEXT: bfe.s32 %r15, %r10, 16, 8;
975 ; CHECK-NEXT: bfe.s32 %r16, %r9, 16, 8;
976 ; CHECK-NEXT: setp.ne.u32 %p3, %r16, %r15;
977 ; CHECK-NEXT: bfe.s32 %r17, %r10, 24, 8;
978 ; CHECK-NEXT: bfe.s32 %r18, %r9, 24, 8;
979 ; CHECK-NEXT: setp.ne.u32 %p4, %r18, %r17;
980 ; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4;
981 ; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3;
982 ; CHECK-NEXT: selp.b32 %r21, %r2, %r6, %p2;
983 ; CHECK-NEXT: selp.b32 %r22, %r1, %r5, %p1;
984 ; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r22, %r21, %r20, %r19};
986 <4 x i8> %c, <4 x i8> %d) #0 {
987 %cc = icmp ne <4 x i8> %c, %d
988 %r = select <4 x i1> %cc, <4 x i32> %a, <4 x i32> %b
; test_select_cc_i8_i32: the mirror case of a <4 x i32> condition with
; <4 x i8> values. The `icmp ne` is performed on <4 x i32> operands (%c, %d)
; and the select picks between the <4 x i8> vectors %a and %b.
; Expected PTX: four setp.ne.s32 on the i32 lanes, then for each byte lane a
; pair of bfe.s32 extracts, one selp.b32, and bfi.b32 inserts at bit offsets
; 8, 16 and 24 to rebuild the packed 32-bit result.
; Note: the parameter list of the `define` continues after the autogenerated
; assertion lines.
992 define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
993 ; CHECK-LABEL: test_select_cc_i8_i32(
995 ; CHECK-NEXT: .reg .pred %p<5>;
996 ; CHECK-NEXT: .reg .b32 %r<27>;
998 ; CHECK-NEXT: // %bb.0:
999 ; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
1000 ; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
1001 ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1];
1002 ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0];
1003 ; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10;
1004 ; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9;
1005 ; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8;
1006 ; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7;
1007 ; CHECK-NEXT: bfe.s32 %r11, %r2, 0, 8;
1008 ; CHECK-NEXT: bfe.s32 %r12, %r1, 0, 8;
1009 ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4;
1010 ; CHECK-NEXT: bfe.s32 %r14, %r2, 8, 8;
1011 ; CHECK-NEXT: bfe.s32 %r15, %r1, 8, 8;
1012 ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3;
1013 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8;
1014 ; CHECK-NEXT: bfe.s32 %r18, %r2, 16, 8;
1015 ; CHECK-NEXT: bfe.s32 %r19, %r1, 16, 8;
1016 ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2;
1017 ; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8;
1018 ; CHECK-NEXT: bfe.s32 %r22, %r2, 24, 8;
1019 ; CHECK-NEXT: bfe.s32 %r23, %r1, 24, 8;
1020 ; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1;
1021 ; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8;
1022 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25;
1024 <4 x i32> %c, <4 x i32> %d) #0 {
1025 %cc = icmp ne <4 x i32> %c, %d
1026 %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b
; test_trunc_2xi32: truncates <4 x i32> to <4 x i8>. Despite the "2x" in the
; name, the vector operated on here has four elements.
; Expected PTX: one v4 parameter load, then three bfi.b32 inserts (bit offsets
; 8, 16, 24; width 8) that pack the low byte of each lane into a single
; 32-bit return value.
1031 define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 {
1032 ; CHECK-LABEL: test_trunc_2xi32(
1034 ; CHECK-NEXT: .reg .b32 %r<9>;
1036 ; CHECK-NEXT: // %bb.0:
1037 ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
1038 ; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8;
1039 ; CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8;
1040 ; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8;
1041 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7;
1043 %r = trunc <4 x i32> %a to <4 x i8>
; test_trunc_2xi64: truncates <4 x i64> to <4 x i8>. Despite the "2x" in the
; name, the vector operated on here has four elements.
; Expected PTX: the parameter is read as two v2.u64 loads (offsets 0 and 16),
; each lane is narrowed with cvt.u32.u64, and the bytes are packed with
; bfi.b32 inserts at bit offsets 8, 16 and 24.
1047 define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 {
1048 ; CHECK-LABEL: test_trunc_2xi64(
1050 ; CHECK-NEXT: .reg .b32 %r<9>;
1051 ; CHECK-NEXT: .reg .b64 %rd<5>;
1053 ; CHECK-NEXT: // %bb.0:
1054 ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
1055 ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
1056 ; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
1057 ; CHECK-NEXT: cvt.u32.u64 %r2, %rd2;
1058 ; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8;
1059 ; CHECK-NEXT: cvt.u32.u64 %r4, %rd3;
1060 ; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8;
1061 ; CHECK-NEXT: cvt.u32.u64 %r6, %rd4;
1062 ; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8;
1063 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7;
1065 %r = trunc <4 x i64> %a to <4 x i8>
; test_zext_2xi32: zero-extends <4 x i8> to <4 x i32>. Despite the "2x" in the
; name, the vector operated on here has four elements.
; Expected PTX: four unsigned bit-field extracts (bfe.u32, width 8) from the
; packed 32-bit parameter, returned with one st.param.v4.b32.
1069 define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
1070 ; CHECK-LABEL: test_zext_2xi32(
1072 ; CHECK-NEXT: .reg .b32 %r<6>;
1074 ; CHECK-NEXT: // %bb.0:
1075 ; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0];
1076 ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
1077 ; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
1078 ; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
1079 ; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
1080 ; CHECK-NEXT: st.param.v4.b32 [func_retval0+0], {%r5, %r4, %r3, %r2};
1082 %r = zext <4 x i8> %a to <4 x i32>
; test_zext_2xi64: zero-extends <4 x i8> to <4 x i64>. Despite the "2x" in the
; name, the vector operated on here has four elements.
; Expected PTX: each byte is extracted with a signed bfe.s32, widened with
; cvt.u64.u32, and then masked with `and.b64 ..., 255` so that only the low
; eight bits remain. The result is returned as two st.param.v2.b64 stores
; (offsets 0 and 16).
1086 define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
1087 ; CHECK-LABEL: test_zext_2xi64(
1089 ; CHECK-NEXT: .reg .b32 %r<6>;
1090 ; CHECK-NEXT: .reg .b64 %rd<9>;
1092 ; CHECK-NEXT: // %bb.0:
1093 ; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0];
1094 ; CHECK-NEXT: bfe.s32 %r2, %r1, 24, 8;
1095 ; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
1096 ; CHECK-NEXT: and.b64 %rd2, %rd1, 255;
1097 ; CHECK-NEXT: bfe.s32 %r3, %r1, 16, 8;
1098 ; CHECK-NEXT: cvt.u64.u32 %rd3, %r3;
1099 ; CHECK-NEXT: and.b64 %rd4, %rd3, 255;
1100 ; CHECK-NEXT: bfe.s32 %r4, %r1, 8, 8;
1101 ; CHECK-NEXT: cvt.u64.u32 %rd5, %r4;
1102 ; CHECK-NEXT: and.b64 %rd6, %rd5, 255;
1103 ; CHECK-NEXT: bfe.s32 %r5, %r1, 0, 8;
1104 ; CHECK-NEXT: cvt.u64.u32 %rd7, %r5;
1105 ; CHECK-NEXT: and.b64 %rd8, %rd7, 255;
1106 ; CHECK-NEXT: st.param.v2.b64 [func_retval0+0], {%rd8, %rd6};
1107 ; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd4, %rd2};
1109 %r = zext <4 x i8> %a to <4 x i64>
; test_bitcast_i32_to_4xi8: bitcasts a scalar i32 to <4 x i8>.
; Expected PTX: the 32-bit parameter is loaded and stored straight to the
; return slot; no conversion instruction appears between the load and store.
1113 define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 {
1114 ; CHECK-LABEL: test_bitcast_i32_to_4xi8(
1116 ; CHECK-NEXT: .reg .b32 %r<3>;
1118 ; CHECK-NEXT: // %bb.0:
1119 ; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0];
1120 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
1122 %r = bitcast i32 %a to <4 x i8>
; test_bitcast_float_to_4xi8: bitcasts a scalar float to <4 x i8>.
; Expected PTX: the parameter is loaded into an .f32 register and moved to a
; .b32 register with mov.b32 before being stored as the return value.
1126 define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 {
1127 ; CHECK-LABEL: test_bitcast_float_to_4xi8(
1129 ; CHECK-NEXT: .reg .b32 %r<2>;
1130 ; CHECK-NEXT: .reg .f32 %f<2>;
1132 ; CHECK-NEXT: // %bb.0:
1133 ; CHECK-NEXT: ld.param.f32 %f1, [test_bitcast_float_to_4xi8_param_0];
1134 ; CHECK-NEXT: mov.b32 %r1, %f1;
1135 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
1137 %r = bitcast float %a to <4 x i8>
; test_bitcast_4xi8_to_i32: bitcasts <4 x i8> to a scalar i32.
; Expected PTX: the packed 32-bit parameter is loaded and stored straight to
; the return slot; no conversion instruction appears in between.
1141 define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 {
1142 ; CHECK-LABEL: test_bitcast_4xi8_to_i32(
1144 ; CHECK-NEXT: .reg .b32 %r<3>;
1146 ; CHECK-NEXT: // %bb.0:
1147 ; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_4xi8_to_i32_param_0];
1148 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
1150 %r = bitcast <4 x i8> %a to i32
; test_bitcast_4xi8_to_float: bitcasts <4 x i8> to a scalar float.
; Expected PTX: the packed 32-bit parameter is loaded into a .b32 register,
; moved to an .f32 register with mov.b32, and returned with st.param.f32.
1154 define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 {
1155 ; CHECK-LABEL: test_bitcast_4xi8_to_float(
1157 ; CHECK-NEXT: .reg .b32 %r<3>;
1158 ; CHECK-NEXT: .reg .f32 %f<2>;
1160 ; CHECK-NEXT: // %bb.0:
1161 ; CHECK-NEXT: ld.param.u32 %r2, [test_bitcast_4xi8_to_float_param_0];
1162 ; CHECK-NEXT: mov.b32 %f1, %r2;
1163 ; CHECK-NEXT: st.param.f32 [func_retval0+0], %f1;
1165 %r = bitcast <4 x i8> %a to float
; test_bitcast_4xi8_to_2xhalf: builds a <4 x i8> and bitcasts it to
; <2 x half>. Note that the function parameter is a scalar i8, not a vector:
; %a is inserted into lane 0 and the constants 5, 6 and 7 fill lanes 1..3
; before the bitcast.
; Expected PTX: the byte is loaded with ld.param.u8, widened with cvt.u32.u16,
; and the three constants are inserted as immediates with bfi.b32 at bit
; offsets 8, 16 and 24.
1170 define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 {
1171 ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf(
1173 ; CHECK-NEXT: .reg .b16 %rs<2>;
1174 ; CHECK-NEXT: .reg .b32 %r<6>;
1176 ; CHECK-NEXT: // %bb.0:
1177 ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0];
1178 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
1179 ; CHECK-NEXT: bfi.b32 %r2, 5, %r1, 8, 8;
1180 ; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8;
1181 ; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8;
1182 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4;
1184 %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0
1185 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1
1186 %ins.2 = insertelement <4 x i8> %ins.1, i8 6, i32 2
1187 %ins.3 = insertelement <4 x i8> %ins.2, i8 7, i32 3
1188 %r = bitcast <4 x i8> %ins.3 to <2 x half>
; test_shufflevector: single-input shuffle that reverses the four bytes
; (mask <3, 2, 1, 0>, second operand undef).
; Expected PTX: one prmt.b32 whose selector is 291 (0x0123); the second source
; register is an implicit-def because the second shuffle operand is undef.
1193 define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
1194 ; CHECK-LABEL: test_shufflevector(
1196 ; CHECK-NEXT: .reg .b32 %r<4>;
1198 ; CHECK-NEXT: // %bb.0:
1199 ; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0];
1200 ; CHECK-NEXT: // implicit-def: %r3
1201 ; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 291;
1202 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
1204 %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; test_shufflevector_2: two-input shuffle with mask <7, 3, 5, 2>, i.e. result
; lanes are taken alternately from %b (indices >= 4) and %a.
; Expected PTX: one prmt.b32 with selector 9527 (0x2537), whose hex digits
; read from least to most significant are the mask indices 7, 3, 5, 2.
1208 define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 {
1209 ; CHECK-LABEL: test_shufflevector_2(
1211 ; CHECK-NEXT: .reg .b32 %r<4>;
1213 ; CHECK-NEXT: // %bb.0:
1214 ; CHECK-NEXT: ld.param.u32 %r2, [test_shufflevector_2_param_1];
1215 ; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_2_param_0];
1216 ; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 9527;
1217 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
1219 %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 7, i32 3, i32 5, i32 2>
; test_insertelement: inserts the scalar i8 %x into lane 1 of <4 x i8> %a.
; Expected PTX: %x is loaded with ld.param.u8, widened with cvt.u32.u16, and
; written into the packed vector with one bfi.b32 at bit offset 8, width 8.
1224 define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
1225 ; CHECK-LABEL: test_insertelement(
1227 ; CHECK-NEXT: .reg .b16 %rs<2>;
1228 ; CHECK-NEXT: .reg .b32 %r<5>;
1230 ; CHECK-NEXT: // %bb.0:
1231 ; CHECK-NEXT: ld.param.u8 %rs1, [test_insertelement_param_1];
1232 ; CHECK-NEXT: ld.param.u32 %r1, [test_insertelement_param_0];
1233 ; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
1234 ; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8;
1235 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r3;
1237 %i = insertelement <4 x i8> %a, i8 %x, i64 1
; test_fptosi_4xhalf_to_4xi8: signed float-to-int conversion of <4 x half>
; to <4 x i8>.
; Expected PTX: the input arrives as two 32-bit registers, each unpacked into
; a pair of 16-bit halves. Every half is converted with cvt.rzi.s16.f16, the
; pair is repacked and unpacked again through a 32-bit register, and the
; resulting 16-bit values are widened with cvt.u32.u16 and packed into one
; 32-bit result with bfi.b32 at bit offsets 8, 16 and 24.
1241 define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 {
1242 ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8(
1244 ; CHECK-NEXT: .reg .b16 %rs<13>;
1245 ; CHECK-NEXT: .reg .b32 %r<15>;
1247 ; CHECK-NEXT: // %bb.0:
1248 ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0];
1249 ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
1250 ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2;
1251 ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1;
1252 ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
1253 ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5;
1254 ; CHECK-NEXT: cvt.u32.u16 %r6, %rs5;
1255 ; CHECK-NEXT: cvt.u32.u16 %r7, %rs6;
1256 ; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8;
1257 ; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4;
1258 ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8;
1259 ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7;
1260 ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9};
1261 ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9;
1262 ; CHECK-NEXT: cvt.u32.u16 %r10, %rs11;
1263 ; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8;
1264 ; CHECK-NEXT: cvt.u32.u16 %r12, %rs12;
1265 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8;
1266 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
1268 %r = fptosi <4 x half> %a to <4 x i8>
; test_fptoui_4xhalf_to_4xi8: unsigned float-to-int conversion of <4 x half>
; to <4 x i8>.
; Expected PTX: same instruction sequence as the signed variant of this test,
; except that the per-element conversion is cvt.rzi.u16.f16. The converted
; 16-bit values are widened with cvt.u32.u16 and packed into one 32-bit
; result with bfi.b32 at bit offsets 8, 16 and 24.
1272 define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 {
1273 ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8(
1275 ; CHECK-NEXT: .reg .b16 %rs<13>;
1276 ; CHECK-NEXT: .reg .b32 %r<15>;
1278 ; CHECK-NEXT: // %bb.0:
1279 ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0];
1280 ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3;
1281 ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2;
1282 ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1;
1283 ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
1284 ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5;
1285 ; CHECK-NEXT: cvt.u32.u16 %r6, %rs5;
1286 ; CHECK-NEXT: cvt.u32.u16 %r7, %rs6;
1287 ; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8;
1288 ; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4;
1289 ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8;
1290 ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7;
1291 ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9};
1292 ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9;
1293 ; CHECK-NEXT: cvt.u32.u16 %r10, %rs11;
1294 ; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8;
1295 ; CHECK-NEXT: cvt.u32.u16 %r12, %rs12;
1296 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8;
1297 ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13;
1299 %r = fptoui <4 x half> %a to <4 x i8>
; test_srem_v4i8: loads two <4 x i8> vectors through %a and %b (align 4),
; computes their signed remainder, and stores the result through %c.
; Unlike most functions in this file, this one carries no `#0` attribute
; reference.
; Expected PTX: both vectors are loaded as single 32-bit words. For each byte
; lane the operands are extracted with bfe.s32, narrowed with cvt.s8.s32, and
; divided with rem.s16; the remainders are widened with cvt.u32.u16 and
; packed with bfi.b32 (bit offsets 8, 16, 24) before one st.u32.
1303 define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
1304 ; CHECK-LABEL: test_srem_v4i8(
1306 ; CHECK-NEXT: .reg .b16 %rs<13>;
1307 ; CHECK-NEXT: .reg .b32 %r<18>;
1308 ; CHECK-NEXT: .reg .b64 %rd<4>;
1310 ; CHECK-NEXT: // %bb.0: // %entry
1311 ; CHECK-NEXT: ld.param.u64 %rd3, [test_srem_v4i8_param_2];
1312 ; CHECK-NEXT: ld.param.u64 %rd2, [test_srem_v4i8_param_1];
1313 ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0];
1314 ; CHECK-NEXT: ld.u32 %r1, [%rd1];
1315 ; CHECK-NEXT: ld.u32 %r2, [%rd2];
1316 ; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
1317 ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3;
1318 ; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
1319 ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4;
1320 ; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1;
1321 ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3;
1322 ; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8;
1323 ; CHECK-NEXT: cvt.s8.s32 %rs4, %r6;
1324 ; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8;
1325 ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7;
1326 ; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4;
1327 ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
1328 ; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8;
1329 ; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8;
1330 ; CHECK-NEXT: cvt.s8.s32 %rs7, %r10;
1331 ; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8;
1332 ; CHECK-NEXT: cvt.s8.s32 %rs8, %r11;
1333 ; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7;
1334 ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9;
1335 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8;
1336 ; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8;
1337 ; CHECK-NEXT: cvt.s8.s32 %rs10, %r14;
1338 ; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8;
1339 ; CHECK-NEXT: cvt.s8.s32 %rs11, %r15;
1340 ; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10;
1341 ; CHECK-NEXT: cvt.u32.u16 %r16, %rs12;
1342 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8;
1343 ; CHECK-NEXT: st.u32 [%rd3], %r17;
1346 %t57 = load <4 x i8>, ptr %a, align 4
1347 %t59 = load <4 x i8>, ptr %b, align 4
1348 %x = srem <4 x i8> %t57, %t59
1349 store <4 x i8> %x, ptr %c, align 4
1353 ;; v3i8 lowering, especially for unaligned loads, is terrible. We end up doing
1354 ;; tons of pointless scalar_to_vector/bitcast/extract_elt on v2i16/v4i8, which
1355 ;; is further complicated by LLVM trying to use i16 as an intermediate type,
1356 ;; because we don't have i8 registers. It's a mess.
1357 ;; Ideally we want to split it into element-wise ops, but the legalizer can't handle
1358 ;; odd-sized vectors. TL;DR: don't use odd-sized vectors of i8.
; test_srem_v3i8: signed remainder on <3 x i8> vectors loaded and stored with
; byte alignment (align 1).
; Expected PTX: for each input the first two bytes are loaded with ld.u8 and
; combined into a 16-bit value with shl.b16/or.b16, while the third byte is
; loaded separately with ld.s8. Lanes 0 and 1 go through the
; bfe.s32 / cvt.s8.s32 / rem.s16 sequence and are packed with bfi.b32; the
; bfi.b32 inserts at bit offsets 16 and 24 take implicit-def sources. Lane 2
; is computed by a rem.s16 directly on the two ld.s8 results. The result is
; written back with three st.u8 stores.
1359 define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
1360 ; CHECK-LABEL: test_srem_v3i8(
1362 ; CHECK-NEXT: .reg .b16 %rs<20>;
1363 ; CHECK-NEXT: .reg .b32 %r<16>;
1364 ; CHECK-NEXT: .reg .b64 %rd<4>;
1366 ; CHECK-NEXT: // %bb.0: // %entry
1367 ; CHECK-NEXT: ld.param.u64 %rd3, [test_srem_v3i8_param_2];
1368 ; CHECK-NEXT: ld.param.u64 %rd2, [test_srem_v3i8_param_1];
1369 ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v3i8_param_0];
1370 ; CHECK-NEXT: ld.u8 %rs1, [%rd1];
1371 ; CHECK-NEXT: ld.u8 %rs2, [%rd1+1];
1372 ; CHECK-NEXT: shl.b16 %rs3, %rs2, 8;
1373 ; CHECK-NEXT: or.b16 %rs4, %rs3, %rs1;
1374 ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
1375 ; CHECK-NEXT: ld.s8 %rs5, [%rd1+2];
1376 ; CHECK-NEXT: ld.u8 %rs6, [%rd2];
1377 ; CHECK-NEXT: ld.u8 %rs7, [%rd2+1];
1378 ; CHECK-NEXT: shl.b16 %rs8, %rs7, 8;
1379 ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6;
1380 ; CHECK-NEXT: cvt.u32.u16 %r3, %rs9;
1381 ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2];
1382 ; CHECK-NEXT: bfe.s32 %r5, %r3, 0, 8;
1383 ; CHECK-NEXT: cvt.s8.s32 %rs11, %r5;
1384 ; CHECK-NEXT: bfe.s32 %r6, %r1, 0, 8;
1385 ; CHECK-NEXT: cvt.s8.s32 %rs12, %r6;
1386 ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11;
1387 ; CHECK-NEXT: cvt.u32.u16 %r7, %rs13;
1388 ; CHECK-NEXT: bfe.s32 %r8, %r3, 8, 8;
1389 ; CHECK-NEXT: cvt.s8.s32 %rs14, %r8;
1390 ; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8;
1391 ; CHECK-NEXT: cvt.s8.s32 %rs15, %r9;
1392 ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14;
1393 ; CHECK-NEXT: cvt.u32.u16 %r10, %rs16;
1394 ; CHECK-NEXT: bfi.b32 %r11, %r10, %r7, 8, 8;
1395 ; CHECK-NEXT: // implicit-def: %r13
1396 ; CHECK-NEXT: bfi.b32 %r12, %r13, %r11, 16, 8;
1397 ; CHECK-NEXT: // implicit-def: %r15
1398 ; CHECK-NEXT: bfi.b32 %r14, %r15, %r12, 24, 8;
1399 ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10;
1400 ; CHECK-NEXT: cvt.u16.u32 %rs18, %r14;
1401 ; CHECK-NEXT: st.u8 [%rd3], %rs18;
1402 ; CHECK-NEXT: shr.u16 %rs19, %rs18, 8;
1403 ; CHECK-NEXT: st.u8 [%rd3+1], %rs19;
1404 ; CHECK-NEXT: st.u8 [%rd3+2], %rs17;
1407 %t57 = load <3 x i8>, ptr %a, align 1
1408 %t59 = load <3 x i8>, ptr %b, align 1
1409 %x = srem <3 x i8> %t57, %t59
1410 store <3 x i8> %x, ptr %c, align 1
; test_sext_v4i1_to_v4i8: loads two <4 x i8> vectors through %a and %b,
; compares them with an unsigned greater-than (`icmp ugt`), sign-extends the
; resulting <4 x i1> mask to <4 x i8>, and stores it through %c.
; Expected PTX: each byte lane is extracted with bfe.s32 and compared with
; setp.hi.u32; every predicate is materialised as -1 or 0 with selp.s32, and
; the four values are packed with bfi.b32 (bit offsets 8, 16, 24) before one
; st.u32.
1414 define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
1415 ; CHECK-LABEL: test_sext_v4i1_to_v4i8(
1417 ; CHECK-NEXT: .reg .pred %p<5>;
1418 ; CHECK-NEXT: .reg .b32 %r<18>;
1419 ; CHECK-NEXT: .reg .b64 %rd<4>;
1421 ; CHECK-NEXT: // %bb.0: // %entry
1422 ; CHECK-NEXT: ld.param.u64 %rd3, [test_sext_v4i1_to_v4i8_param_2];
1423 ; CHECK-NEXT: ld.param.u64 %rd2, [test_sext_v4i1_to_v4i8_param_1];
1424 ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
1425 ; CHECK-NEXT: ld.u32 %r1, [%rd1];
1426 ; CHECK-NEXT: ld.u32 %r2, [%rd2];
1427 ; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
1428 ; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
1429 ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3;
1430 ; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8;
1431 ; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8;
1432 ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5;
1433 ; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8;
1434 ; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8;
1435 ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7;
1436 ; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8;
1437 ; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8;
1438 ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9;
1439 ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4;
1440 ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3;
1441 ; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8;
1442 ; CHECK-NEXT: selp.s32 %r14, -1, 0, %p2;
1443 ; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8;
1444 ; CHECK-NEXT: selp.s32 %r16, -1, 0, %p1;
1445 ; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8;
1446 ; CHECK-NEXT: st.u32 [%rd3], %r17;
1449 %t1 = load <4 x i8>, ptr %a, align 4
1450 %t2 = load <4 x i8>, ptr %b, align 4
1451 %t5 = icmp ugt <4 x i8> %t1, %t2
1452 %t6 = sext <4 x i1> %t5 to <4 x i8>
1453 store <4 x i8> %t6, ptr %c, align 4
; Attribute group #0, referenced by the `#0` suffix on function definitions
; in this file; it contains only `nounwind`.
1457 attributes #0 = { nounwind }