1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s
3 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca -disable-promote-alloca-to-lds< %s | FileCheck -check-prefix=NOLDS %s
5 ; This would normally be folded by instcombine into a compare of the GEP indices.
; Test: icmp eq between two pointers derived from the same alloca. Both GEPs
; index %alloca (one by %a, one by %b); the compare result is zero-extended
; and written to %out with a volatile store.
; Default run: the checks expect the alloca to be replaced by the addrspace(3)
; global @lds_promoted_alloca_icmp_same_derived_pointer.alloca, indexed by a
; value computed from the dispatch packet and the workitem ids, with both GEPs
; and the icmp rewritten to addrspace(3).
; Run with -disable-promote-alloca-to-lds (NOLDS prefix): the checks expect
; the addrspace(5) alloca, GEPs and icmp to be left as written.
8 define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
9 ; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
10 ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
11 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
12 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0:![0-9]+]]
13 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
14 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
15 ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
16 ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
17 ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
18 ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
19 ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
20 ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
21 ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
22 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
23 ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
24 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 [[TMP14]]
25 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
26 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[B:%.*]]
27 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], [[PTR1]]
28 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
29 ; CHECK-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
30 ; CHECK-NEXT: ret void
32 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
33 ; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
34 ; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
35 ; NOLDS-NEXT: [[PTR1:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[B:%.*]]
36 ; NOLDS-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
37 ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
38 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
39 ; NOLDS-NEXT: ret void
; Input IR: both icmp operands are GEPs off the same alloca.
41 %alloca = alloca [16 x i32], align 4, addrspace(5)
42 %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
43 %ptr1 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
44 %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
45 %zext = zext i1 %cmp to i32
46 store volatile i32 %zext, ptr addrspace(1) %out
; Test: icmp eq of an alloca-derived GEP against null, with null as the
; right-hand operand. The compare result is zero-extended and written to %out
; with a volatile store. The %b argument is not used by the body.
; Default run: the checks expect the alloca to be replaced by the addrspace(3)
; global @lds_promoted_alloca_icmp_null_rhs.alloca and the icmp to compare the
; addrspace(3) GEP against null.
; Run with -disable-promote-alloca-to-lds (NOLDS prefix): the checks expect
; the addrspace(5) alloca, GEP and icmp to be left as written.
50 define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
51 ; CHECK-LABEL: @lds_promoted_alloca_icmp_null_rhs(
52 ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
53 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
54 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
55 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
56 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
57 ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
58 ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
59 ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
60 ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
61 ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
62 ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
63 ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
64 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
65 ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
66 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 [[TMP14]]
67 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
68 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) [[PTR0]], null
69 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
70 ; CHECK-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
71 ; CHECK-NEXT: ret void
73 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_rhs(
74 ; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
75 ; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
76 ; NOLDS-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], null
77 ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
78 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
79 ; NOLDS-NEXT: ret void
; Input IR: the alloca-derived pointer is the left operand, null the right.
81 %alloca = alloca [16 x i32], align 4, addrspace(5)
82 %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
83 %cmp = icmp eq ptr addrspace(5) %ptr0, null
84 %zext = zext i1 %cmp to i32
85 store volatile i32 %zext, ptr addrspace(1) %out
; Test: same as the null_rhs case but with the operands swapped, so null is
; the left-hand operand of the icmp eq and the alloca-derived GEP is on the
; right. The compare result is zero-extended and written to %out with a
; volatile store. The %b argument is not used by the body.
; Default run: the checks expect the alloca to be replaced by the addrspace(3)
; global @lds_promoted_alloca_icmp_null_lhs.alloca and the icmp to compare
; null against the addrspace(3) GEP, keeping the operand order.
; Run with -disable-promote-alloca-to-lds (NOLDS prefix): the checks expect
; the addrspace(5) alloca, GEP and icmp to be left as written.
89 define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
90 ; CHECK-LABEL: @lds_promoted_alloca_icmp_null_lhs(
91 ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
92 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
93 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
94 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
95 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
96 ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
97 ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.x()
98 ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.y()
99 ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 256) i32 @llvm.amdgcn.workitem.id.z()
100 ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
101 ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
102 ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
103 ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
104 ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
105 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [256 x [16 x i32]], ptr addrspace(3) @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 [[TMP14]]
106 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(3) [[TMP15]], i32 0, i32 [[A:%.*]]
107 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(3) null, [[PTR0]]
108 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
109 ; CHECK-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
110 ; CHECK-NEXT: ret void
112 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_null_lhs(
113 ; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
114 ; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
115 ; NOLDS-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(5) null, [[PTR0]]
116 ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
117 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
118 ; NOLDS-NEXT: ret void
; Input IR: null is the left operand, the alloca-derived pointer the right.
120 %alloca = alloca [16 x i32], align 4, addrspace(5)
121 %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
122 %cmp = icmp eq ptr addrspace(5) null, %ptr0
123 %zext = zext i1 %cmp to i32
124 store volatile i32 %zext, ptr addrspace(1) %out
; Negative test: icmp eq of an alloca-derived GEP against a pointer returned
; by a call to @get_unknown_pointer, i.e. a pointer that is not derived from
; the alloca. The compare result is zero-extended and written to %out with a
; volatile store. The %b argument is not used by the body.
; Both runs (default and NOLDS prefix) expect identical output: the alloca
; stays in addrspace(5) and no addrspace(3) global is introduced.
128 define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
129 ; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
130 ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
131 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
132 ; CHECK-NEXT: [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
133 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
134 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
135 ; CHECK-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
136 ; CHECK-NEXT: ret void
138 ; NOLDS-LABEL: @lds_promoted_alloca_icmp_unknown_ptr(
139 ; NOLDS-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4, addrspace(5)
140 ; NOLDS-NEXT: [[PTR0:%.*]] = getelementptr inbounds [16 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[A:%.*]]
141 ; NOLDS-NEXT: [[PTR1:%.*]] = call ptr addrspace(5) @get_unknown_pointer()
142 ; NOLDS-NEXT: [[CMP:%.*]] = icmp eq ptr addrspace(5) [[PTR0]], [[PTR1]]
143 ; NOLDS-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
144 ; NOLDS-NEXT: store volatile i32 [[ZEXT]], ptr addrspace(1) [[OUT:%.*]], align 4
145 ; NOLDS-NEXT: ret void
; Input IR: %ptr1 comes from an external call rather than from %alloca.
147 %alloca = alloca [16 x i32], align 4, addrspace(5)
148 %ptr0 = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
149 %ptr1 = call ptr addrspace(5) @get_unknown_pointer()
150 %cmp = icmp eq ptr addrspace(5) %ptr0, %ptr1
151 %zext = zext i1 %cmp to i32
152 store volatile i32 %zext, ptr addrspace(1) %out
; External declaration (no body) returning an addrspace(5) pointer; used by
; @lds_promoted_alloca_icmp_unknown_ptr as the other icmp operand.
156 declare ptr addrspace(5) @get_unknown_pointer() #0
; Attribute group shared by every function in this file: nounwind, exactly one
; wave per EU, and a flat work-group size between 1 and 256.
; NOTE(review): the 256 upper bound presumably determines the outer dimension
; of the [256 x [16 x i32]] arrays in the promoted checks -- confirm.
158 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }