1 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
3 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
5 declare i16 @llvm.ctlz.i16(i16, i1) readnone ; count-leading-zeros intrinsic, 16-bit overload
6 declare i32 @llvm.ctlz.i32(i32, i1) readnone ; 32-bit overload
7 declare i64 @llvm.ctlz.i64(i64, i1) readnone ; 64-bit overload; the tests below pass both true and false for the i1 operand
9 ; There should be no difference between llvm.ctlz.i32(%a, true) and
10 ; llvm.ctlz.i32(%a, false), as ptx's clz.b32(0) is defined to return 32.
12 ; CHECK-LABEL: myctlz(
13 define i32 @myctlz(i32 %a) { ; 32-bit ctlz with the i1 flag set to false
16 ; CHECK-NEXT: st.param.
18 %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
21 ; CHECK-LABEL: myctlz_2(
22 define i32 @myctlz_2(i32 %a) { ; same as myctlz, but with the i1 flag set to true
25 ; CHECK-NEXT: st.param.
27 %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
31 ; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
32 ; value, so here we have to zero-extend it.
33 ; CHECK-LABEL: myctlz64(
34 define i64 @myctlz64(i64 %a) { ; 64-bit ctlz, i1 flag false; the checks expect a cvt.u64.u32 ahead of st.param
37 ; CHECK-NEXT: cvt.u64.u32
38 ; CHECK-NEXT: st.param.
40 %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
43 ; CHECK-LABEL: myctlz64_2(
44 define i64 @myctlz64_2(i64 %a) { ; same as myctlz64, but with the i1 flag set to true
47 ; CHECK-NEXT: cvt.u64.u32
48 ; CHECK-NEXT: st.param.
50 %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
54 ; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
55 ; natural return width of ptx's clz.b64 instruction. No conversions should be
56 ; necessary in the PTX.
57 ; CHECK-LABEL: myctlz64_as_32(
58 define i32 @myctlz64_as_32(i64 %a) { ; 64-bit ctlz (i1 flag false) whose result is truncated to i32
61 ; CHECK-NEXT: st.param.
63 %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
64 %trunc = trunc i64 %val to i32 ; narrow the i64 intrinsic result to i32
67 ; CHECK-LABEL: myctlz64_as_32_2(
68 define i32 @myctlz64_as_32_2(i64 %a) { ; same as myctlz64_as_32, but with the i1 flag set to true
71 ; CHECK-NEXT: st.param.
73 %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone ; flag must be true here, otherwise this test duplicates myctlz64_as_32
74 %trunc = trunc i64 %val to i32 ; narrow the i64 intrinsic result to i32
78 ; ctlz.i16 is implemented by extending the input to i32, computing the result,
79 ; and then truncating the result back down to i16. But the NVPTX ABI
80 ; zero-extends i16 return values to i32, so the final truncation doesn't appear
82 ; CHECK-LABEL: myctlz_ret16(
83 define i16 @myctlz_ret16(i16 %a) { ; 16-bit ctlz (i1 flag false) returned as i16; the checks expect the input widened by cvt.u32.u16
85 ; CHECK-NEXT: cvt.u32.u16
88 ; CHECK-NEXT: st.param.
90 %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
93 ; CHECK-LABEL: myctlz_ret16_2(
94 define i16 @myctlz_ret16_2(i16 %a) { ; same as myctlz_ret16, but with the i1 flag set to true
96 ; CHECK-NEXT: cvt.u32.u16
99 ; CHECK-NEXT: st.param.
101 %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
105 ; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
107 ; CHECK-LABEL: myctlz_store16(
108 define void @myctlz_store16(i16 %a, i16* %b) { ; 16-bit ctlz (i1 flag false) whose result is stored through %b
110 ; CHECK-NEXT: cvt.u32.u16
111 ; CHECK-NEXT: clz.b32
112 ; CHECK-DAG: cvt.u16.u32
114 ; CHECK: st.{{[a-z]}}16
116 %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
117 store i16 %val, i16* %b ; the checks above expect a cvt.u16.u32 and a 16-bit store for this
120 ; CHECK-LABEL: myctlz_store16_2(
121 define void @myctlz_store16_2(i16 %a, i16* %b) { ; same as myctlz_store16, but with the i1 flag set to true
123 ; CHECK-NEXT: cvt.u32.u16
124 ; CHECK-NEXT: clz.b32
125 ; CHECK-DAG: cvt.u16.u32
127 ; CHECK: st.{{[a-z]}}16
129 %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone ; flag must be true here, otherwise this test duplicates myctlz_store16
130 store i16 %val, i16* %b ; the checks above expect a cvt.u16.u32 and a 16-bit store for this