; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-aa-wrapper -amdgpu-aa -instcombine -o - %s | FileCheck %s

; Make sure the optimization from memcpy-from-global.ll happens, but
; the constant source is not a global variable.

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; Simple memcpy to alloca from constant address space argument.
define i8 @memcpy_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

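; Make sure the !noalias metadata on the load survives the fold to the constant source.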
define i8 @memcpy_constant_arg_ptr_to_alloca_load_metadata(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_metadata(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1, !noalias [[META0:![0-9]+]]
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep, !noalias !0
  ret i8 %load
}

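; Make sure the explicit align 16 on the load is preserved after the fold.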
define i64 @memcpy_constant_arg_ptr_to_alloca_load_alignment(ptr addrspace(4) noalias readonly align 4 dereferenceable(256) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_alignment(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr addrspace(4) [[GEP]], align 16
; CHECK-NEXT:    ret i64 [[LOAD]]
;
  %alloca = alloca [32 x i64], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 256, i1 false)
  %gep = getelementptr inbounds [32 x i64], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i64, ptr addrspace(5) %gep, align 16
  ret i64 %load
}

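; Atomic loads are not folded to the constant source; the alloca and memcpy must stay.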
define i64 @memcpy_constant_arg_ptr_to_alloca_load_atomic(ptr addrspace(4) noalias readonly align 8 dereferenceable(256) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_load_atomic(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i64], align 8, addrspace(5)
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 8 dereferenceable(256) [[ALLOCA]], ptr addrspace(4) noundef align 8 dereferenceable(256) [[ARG:%.*]], i64 256, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load atomic i64, ptr addrspace(5) [[GEP]] syncscope("somescope") acquire, align 8
; CHECK-NEXT:    ret i64 [[LOAD]]
;
  %alloca = alloca [32 x i64], align 8, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 256, i1 false)
  %gep = getelementptr inbounds [32 x i64], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load atomic i64, ptr addrspace(5) %gep syncscope("somescope") acquire, align 8
  ret i64 %load
}

; Simple memmove to alloca from constant address space argument.
define i8 @memmove_constant_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memmove_constant_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memmove.p5.p4.i32(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i32 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

; Simple memcpy to alloca from byref constant address space argument.
define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca(ptr addrspace(4) noalias readonly align 4 byref([32 x i8]) %arg, ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Simple memcpy to alloca from byref constant address space argument, but not enough bytes are dereferenceable.
define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca_too_many_bytes(ptr addrspace(4) noalias readonly align 4 byref([31 x i8]) %arg, ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca_too_many_bytes(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(31) [[ALLOCA]], ptr addrspace(4) noundef align 4 dereferenceable(31) [[ARG:%.*]], i64 31, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(5) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %arg, i64 31, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Simple memcpy to alloca from constant address space intrinsic call.
define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(ptr addrspace(1) %out, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_intrinsic_ptr_to_alloca(
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5)
; CHECK-NEXT:    [[KERNARG_SEGMENT_PTR:%.*]] = call align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
; CHECK-NEXT:    call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(32) [[ALLOCA]], ptr addrspace(4) noundef align 16 dereferenceable(32) [[KERNARG_SEGMENT_PTR]], i64 32, i1 false)
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[IDX:%.*]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(5) [[GEP]], align 1
; CHECK-NEXT:    store i8 [[LOAD]], ptr addrspace(1) [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %kernarg.segment.ptr = call dereferenceable(32) align 16 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %alloca, ptr addrspace(4) %kernarg.segment.ptr, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  store i8 %load, ptr addrspace(1) %out
  ret void
}

; Alloca is written through a flat pointer.
define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(4) [[ARG:%.*]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr addrspace(4) [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %alloca.cast.asc = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr %alloca.cast.asc, ptr addrspace(4) %arg, i64 31, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr addrspace(5) %alloca, i32 0, i32 %idx
  %load = load i8, ptr addrspace(5) %gep
  ret i8 %load
}

; Alloca is only addressed through flat pointer.
define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(ptr addrspace(4) noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) {
; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2(
; CHECK-NEXT:    [[ALLOCA_CAST_ASC:%.*]] = addrspacecast ptr addrspace(4) [[ARG:%.*]] to ptr
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [32 x i8], ptr [[ALLOCA_CAST_ASC]], i64 0, i64 [[TMP1]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
; CHECK-NEXT:    ret i8 [[LOAD]]
;
  %alloca = alloca [32 x i8], align 4, addrspace(5)
  %alloca.cast.asc = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr %alloca.cast.asc, ptr addrspace(4) %arg, i64 32, i1 false)
  %gep = getelementptr inbounds [32 x i8], ptr %alloca.cast.asc, i32 0, i32 %idx
  %load = load i8, ptr %gep
  ret i8 %load
}

%struct.ty = type { [4 x i32] }

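; Make sure the copy in/copy out through the alloca collapses to a single
; memcpy without sending InstCombine into an infinite loop.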
define amdgpu_kernel void @byref_infloop(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memcpy.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false)
  call void @llvm.memcpy.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false)
  ret void
}

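; Same as above, but with !noalias metadata on the copies.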
define amdgpu_kernel void @byref_infloop_metadata(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_metadata(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memcpy.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false), !noalias [[META0]]
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false), !noalias !0
  call void @llvm.memcpy.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false), !noalias !0
  ret void
}

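; Same as above, but the alloca is copied through a flat pointer.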
define amdgpu_kernel void @byref_infloop_addrspacecast(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_addrspacecast(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[ADDRSPACECAST_ALLOCA:%.*]] = addrspacecast ptr addrspace(4) [[ARG:%.*]] to ptr
; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr noundef nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i64 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  %addrspacecast.alloca = addrspacecast ptr addrspace(5) %alloca to ptr
  call void @llvm.memcpy.p0.p4.i64(ptr nonnull align 4 dereferenceable(16) %addrspacecast.alloca, ptr addrspace(4) align 4 dereferenceable(16) %arg, i64 16, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr nonnull align 4 dereferenceable(16) %scratch, ptr nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i64 16, i1 false)
  ret void
}

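; Same as above, but with memmove instead of memcpy.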
define amdgpu_kernel void @byref_infloop_memmove(ptr %scratch, ptr addrspace(4) byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 {
; CHECK-LABEL: @byref_infloop_memmove(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    call void @llvm.memmove.p0.p4.i32(ptr noundef nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(16) [[ARG:%.*]], i32 16, i1 false)
; CHECK-NEXT:    ret void
;
bb:
  %alloca = alloca [4 x i32], align 4, addrspace(5)
  call void @llvm.memmove.p5.p4.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(4) align 4 %arg, i32 16, i1 false)
  call void @llvm.memmove.p0.p5.i32(ptr align 4 %scratch, ptr addrspace(5) align 4 %alloca, i32 16, i1 false)
  ret void
}

declare void @llvm.memcpy.p0.p5.i32(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i32, i1 immarg) #0
declare void @llvm.memcpy.p5.p4.i32(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
declare void @llvm.memcpy.p0.p4.i64(ptr nocapture, ptr addrspace(4) nocapture, i64, i1) #0
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i64, i1) #0
declare void @llvm.memmove.p5.p4.i32(ptr addrspace(5) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
declare void @llvm.memmove.p0.p5.i32(ptr nocapture, ptr addrspace(5) nocapture, i32, i1) #0
declare ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1

attributes #0 = { argmemonly nounwind willreturn }
attributes #1 = { nounwind readnone speculatable }

!0 = !{!0}