; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Check the default works
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering %s | FileCheck -check-prefixes=OPT,MAX1024 %s

; Check that explicitly setting the default value works
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefixes=OPT,ALL %s

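; Prefix summary: MAX1024 runs use the 1024-byte expansion threshold (the
; default), so fixed-size intrinsics of up to 1024 bytes are left as calls;
; ALL runs use -mem-intrinsic-expand-size=0 and expand every intrinsic. OPT
; checks are shared by both configurations.
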
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) nocapture, ptr addrspace(1) nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1

declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture, ptr addrspace(1) nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1.p3.i32(ptr addrspace(1) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p3.p0.i32(ptr addrspace(3) nocapture writeonly, ptr nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p3.p3.i32(ptr addrspace(3) nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1 immarg) #1
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p3.p5.i32(ptr addrspace(3) nocapture, ptr addrspace(5) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5.p3.i32(ptr addrspace(5) nocapture, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p0.p1.i64(ptr nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p1.p0.i64(ptr addrspace(1) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p5.p1.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p0.p5.i64(ptr nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p5.p0.i64(ptr addrspace(5) nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p1.p999.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(999) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p999.p1.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg) #1
declare void @llvm.memmove.p999.p998.i64(ptr addrspace(999) nocapture writeonly, ptr addrspace(998) nocapture readonly, i64, i1 immarg) #1

declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture, i8, i64, i1) #1

; Test the upper bound for sizes to leave
define amdgpu_kernel void @max_size_small_static_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memcpy_caller0(
; MAX1024-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memcpy_caller0(
; ALL-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; ALL:       load-store-loop:
; ALL-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; ALL-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; ALL-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; ALL-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL:       memcpy-split:
; ALL-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1024, i1 false)
  ret void
}

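; Under MAX1024 the 1024-byte copy above sits exactly at the threshold and is
; kept as a call; under ALL it becomes a 4-iteration loop moving 256 bytes
; (<64 x i32>) per iteration.
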
; Smallest static size which will be expanded
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @min_size_large_static_memcpy_caller0(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 1
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1025, i1 false)
  ret void
}

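; 1025 bytes is the smallest size expanded even at the default threshold: four
; 256-byte vector iterations plus a single residual i8 copy at offset 1024.
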
define amdgpu_kernel void @max_size_small_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memmove_caller0(
; MAX1024-NEXT:    call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memmove_caller0(
; ALL-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL:       memmove_bwd_loop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[TMP0:%.*]] ]
; ALL-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP1]], 256
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; ALL-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1
; ALL-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; ALL-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; ALL-NEXT:    br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL:       memmove_fwd_loop:
; ALL-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; ALL-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
; ALL-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; ALL-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
; ALL-NEXT:    [[TMP7]] = add i64 [[FWD_INDEX]], 256
; ALL-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1024
; ALL-NEXT:    br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL:       memmove_done:
; ALL-NEXT:    ret void
;
  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1024, i1 false)
  ret void
}

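; memmove cannot assume the ranges are disjoint, so the expansion compares src
; and dst and branches to a backward copy loop when src < dst (where a forward
; walk could clobber unread source bytes) and a forward loop otherwise.
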
define amdgpu_kernel void @min_size_large_static_memmove_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NEXT:    [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT:    br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; OPT:       memmove_bwd_residual:
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; OPT-NEXT:    br label [[MEMMOVE_BWD_LOOP:%.*]]
; OPT:       memmove_bwd_loop:
; OPT-NEXT:    [[TMP4:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ]
; OPT-NEXT:    [[BWD_INDEX]] = sub i64 [[TMP4]], 256
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; OPT-NEXT:    [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; OPT-NEXT:    store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP6]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; OPT-NEXT:    br i1 [[TMP7]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; OPT:       memmove_fwd_loop:
; OPT-NEXT:    [[FWD_INDEX:%.*]] = phi i64 [ [[TMP10:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; OPT-NEXT:    [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP8]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; OPT-NEXT:    store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP9]], align 1
; OPT-NEXT:    [[TMP10]] = add i64 [[FWD_INDEX]], 256
; OPT-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 1024
; OPT-NEXT:    br i1 [[TMP11]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
; OPT:       memmove_fwd_residual:
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 1
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 1
; OPT-NEXT:    br label [[MEMMOVE_DONE]]
; OPT:       memmove_done:
; OPT-NEXT:    ret void
;
  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1025, i1 false)
  ret void
}

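; For the 1025-byte memmove, the 1-byte residual at offset 1024 is copied
; before the backward loop but after the forward loop, which keeps the copy
; overlap-safe in both directions.
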
define amdgpu_kernel void @max_size_small_static_memset_caller0(ptr addrspace(1) %dst, i8 %val) #0 {
; MAX1024-LABEL: @max_size_small_static_memset_caller0(
; MAX1024-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 1024, i1 false)
; MAX1024-NEXT:    ret void
;
; ALL-LABEL: @max_size_small_static_memset_caller0(
; ALL-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; ALL:       loadstoreloop:
; ALL-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; ALL-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; ALL-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; ALL-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; ALL-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024
; ALL-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; ALL:       split:
; ALL-NEXT:    ret void
;
  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1024, i1 false)
  ret void
}

define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1) %dst, i8 %val) #0 {
; OPT-LABEL: @min_size_large_static_memset_caller0(
; OPT-NEXT:    br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT:       loadstoreloop:
; OPT-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
; OPT-NEXT:    store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
; OPT-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
; OPT-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
; OPT-NEXT:    br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT:       split:
; OPT-NEXT:    ret void
;
  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1025, i1 false)
  ret void
}

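; memset expands to a simple one-byte store loop. The leading 'br i1 false'
; appears to be the expansion's zero-length guard, emitted already folded to
; false because the size here is a known nonzero constant.
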
define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false)
  ret void
}

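; For a runtime size the expansion splits n into a main loop over n & ~15
; (copying <4 x i32>, i.e. 16 bytes, per iteration) and a residual byte loop
; over n & 15, each guarded so that zero-trip cases skip the loop entirely.
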
define amdgpu_kernel void @variable_memcpy_caller1(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller1(
; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_multi_use_one_function(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n, i64 %m) #0 {
; OPT-LABEL: @memcpy_multi_use_one_function(
; OPT-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
; OPT:       loop-memcpy-expansion2:
; OPT-NEXT:    [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX3]]
; OPT-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX3]], 16
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]]
; OPT:       loop-memcpy-residual4:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX6]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0]], i64 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX6]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
; OPT:       post-loop-memcpy-expansion1:
; OPT-NEXT:    [[TMP17:%.*]] = and i64 [[M:%.*]], 15
; OPT-NEXT:    [[TMP18:%.*]] = sub i64 [[M]], [[TMP17]]
; OPT-NEXT:    [[TMP19:%.*]] = icmp ne i64 [[TMP18]], 0
; OPT-NEXT:    br i1 [[TMP19]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP23:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP20]], align 1
; OPT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <4 x i32> [[TMP21]], ptr addrspace(1) [[TMP22]], align 1
; OPT-NEXT:    [[TMP23]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT:    [[TMP24:%.*]] = icmp ult i64 [[TMP23]], [[TMP18]]
; OPT-NEXT:    br i1 [[TMP24]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP25:%.*]] = add i64 [[TMP18]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP25]]
; OPT-NEXT:    [[TMP27:%.*]] = load i8, ptr addrspace(1) [[TMP26]], align 1
; OPT-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 [[TMP25]]
; OPT-NEXT:    store i8 [[TMP27]], ptr addrspace(1) [[TMP28]], align 1
; OPT-NEXT:    [[TMP29]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP30:%.*]] = icmp ult i64 [[TMP29]], [[TMP17]]
; OPT-NEXT:    br i1 [[TMP30]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP17]], 0
; OPT-NEXT:    br i1 [[TMP31]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
; OPT:       loop-memcpy-residual-header5:
; OPT-NEXT:    [[TMP32:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP32]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %m, i1 false)
  ret void
}

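; Two dynamically sized copies in one function get two independent expansions;
; the second set of blocks is uniqued (loop-memcpy-expansion2,
; loop-memcpy-residual4, loop-memcpy-residual-header5) rather than shared.
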
define amdgpu_kernel void @memcpy_alt_type(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
; OPT-NEXT:    [[TMP2:%.*]] = and i32 [[N:%.*]], 7
; OPT-NEXT:    [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT:       loop-memcpy-expansion:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT:    [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; OPT-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT:       loop-memcpy-residual:
; OPT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT:    [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]]
; OPT-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT:    [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT:    [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT:       post-loop-memcpy-expansion:
; OPT-NEXT:    ret void
; OPT:       loop-memcpy-residual-header:
; OPT-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; OPT-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n, i1 false)
  ret void
}

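; With an i32 length and an addrspace(3) (LDS) source, the index type is i32
; and the main loop copies <2 x i32> (8 bytes) per iteration, so the residual
; mask here is n & 7 instead of n & 15.
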
; One of the uses in the function should be expanded, the other left alone.
define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(ptr addrspace(1) %dst0, ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 %n) #0 {
; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
; MAX1024-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; MAX1024-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; MAX1024-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; MAX1024-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024:       loop-memcpy-expansion:
; MAX1024-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; MAX1024-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; MAX1024-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; MAX1024-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; MAX1024-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024:       loop-memcpy-residual:
; MAX1024-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; MAX1024-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; MAX1024-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0]], i64 [[TMP10]]
; MAX1024-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; MAX1024-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; MAX1024-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; MAX1024:       post-loop-memcpy-expansion:
; MAX1024-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST1:%.*]], ptr addrspace(1) [[SRC]], i64 102, i1 false)
; MAX1024-NEXT:    ret void
; MAX1024:       loop-memcpy-residual-header:
; MAX1024-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; MAX1024-NEXT:    br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
; ALL-NEXT:    [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; ALL-NEXT:    [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; ALL-NEXT:    [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; ALL-NEXT:    br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL:       loop-memcpy-expansion:
; ALL-NEXT:    [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; ALL-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0:%.*]], i64 [[LOOP_INDEX1]]
; ALL-NEXT:    store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; ALL-NEXT:    [[TMP8]] = add i64 [[LOOP_INDEX1]], 16
; ALL-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; ALL-NEXT:    br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL:       loop-memcpy-residual:
; ALL-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT:    [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; ALL-NEXT:    [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; ALL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST0]], i64 [[TMP10]]
; ALL-NEXT:    store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; ALL-NEXT:    [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT:    [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; ALL-NEXT:    br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; ALL:       post-loop-memcpy-expansion:
; ALL-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 0
; ALL-NEXT:    [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP16]], align 1
; ALL-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1:%.*]], i64 0
; ALL-NEXT:    store <4 x i32> [[TMP17]], ptr addrspace(1) [[TMP18]], align 1
; ALL-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 16
; ALL-NEXT:    [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP33]], align 1
; ALL-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 16
; ALL-NEXT:    store <4 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 1
; ALL-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 32
; ALL-NEXT:    [[TMP35:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP34]], align 1
; ALL-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 32
; ALL-NEXT:    store <4 x i32> [[TMP35]], ptr addrspace(1) [[TMP36]], align 1
; ALL-NEXT:    [[TMP37:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 48
; ALL-NEXT:    [[TMP38:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP37]], align 1
; ALL-NEXT:    [[TMP39:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 48
; ALL-NEXT:    store <4 x i32> [[TMP38]], ptr addrspace(1) [[TMP39]], align 1
; ALL-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 64
; ALL-NEXT:    [[TMP28:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP40]], align 1
; ALL-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 64
; ALL-NEXT:    store <4 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
; ALL-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 80
; ALL-NEXT:    [[TMP31:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP30]], align 1
; ALL-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 80
; ALL-NEXT:    store <4 x i32> [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
; ALL-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 96
; ALL-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 1
; ALL-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 96
; ALL-NEXT:    store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
; ALL-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 100
; ALL-NEXT:    [[TMP25:%.*]] = load i16, ptr addrspace(1) [[TMP24]], align 1
; ALL-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST1]], i64 100
; ALL-NEXT:    store i16 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
; ALL-NEXT:    ret void
; ALL:       loop-memcpy-residual-header:
; ALL-NEXT:    [[TMP27:%.*]] = icmp ne i64 [[TMP2]], 0
; ALL-NEXT:    br i1 [[TMP27]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst0, ptr addrspace(1) %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst1, ptr addrspace(1) %src, i64 102, i1 false)
  ret void
}

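; MAX1024 expands only the variable-size copy and keeps the 102-byte one as a
; call; ALL also unrolls the 102-byte copy completely: six <4 x i32> pieces,
; then an i32 and an i16, covering 102 = 96 + 4 + 2 bytes.
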
define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1028(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i32 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1028, i1 false)
  ret void
}

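; Here and in the align 4 tests that follow, the bytes left over after the
; 1024-byte vector loop are copied with the widest integer types that the
; remaining size and alignment allow (i64, then i32, i16, i8).
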
define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1025(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1025, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1026(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1026, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1032(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1032, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1034(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
; OPT-NEXT:    [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
; OPT-NEXT:    store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1034, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1035(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
; OPT-NEXT:    [[TMP10:%.*]] = load i16, ptr addrspace(1) [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
; OPT-NEXT:    store i16 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1034
; OPT-NEXT:    [[TMP13:%.*]] = load i8, ptr addrspace(1) [[TMP12]], align 2
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1034
; OPT-NEXT:    store i8 [[TMP13]], ptr addrspace(1) [[TMP14]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1035, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1036(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
; OPT-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
; OPT-NEXT:    store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1036, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1039(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i64, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i64 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
; OPT-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP9]], align 4
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
; OPT-NEXT:    store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
; OPT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
; OPT-NEXT:    [[TMP13:%.*]] = load i16, ptr addrspace(1) [[TMP12]], align 4
; OPT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
; OPT-NEXT:    store i16 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4
; OPT-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT:    [[TMP16:%.*]] = load i8, ptr addrspace(1) [[TMP15]], align 2
; OPT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
; OPT-NEXT:    store i8 [[TMP16]], ptr addrspace(1) [[TMP17]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1039, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_1039(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 2
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1038
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 %src, i64 1039, i1 false)
  ret void
}

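; With only 2-byte alignment the loop element narrows to i16: 1038 bytes are
; copied two at a time and a final i8 handles the odd byte.
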
define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1027(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(1) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(1) [[TMP8]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, ptr addrspace(1) [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
; OPT-NEXT:    store i8 [[TMP10]], ptr addrspace(1) [[TMP11]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align4_1027(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 2
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 4 %src, i64 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align2_1027(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 2
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
; OPT-NEXT:    [[TMP4]] = add i64 [[LOOP_INDEX]], 2
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1026
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1026
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 2
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1026
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(1) [[TMP8]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 2 %src, i64 1027, i1 false)
  ret void
}

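; When src and dst alignments differ, the expansion appears to honor the
; smaller of the two: either side at align 2 drops the whole loop to i16
; elements, as the two mixed-alignment tests above show.
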
define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align4_1027(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 4
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
; OPT-NEXT:    store i8 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %dst, ptr addrspace(5) align 4 %src, i32 1027, i1 false)
  ret void
}

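; addrspace(5) (private) pointers are 32-bit, so the expansions in the tests
; below use an i32 loop index and offsets instead of i64.
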
define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align4_1027(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 2
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT:    [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
; OPT-NEXT:    store i8 [[TMP7]], ptr addrspace(5) [[TMP8]], align 2
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 {
; OPT-LABEL: @memcpy_private_align1_private_align4_1027(
; OPT-NEXT:    br label [[LOAD_STORE_LOOP:%.*]]
; OPT:       load-store-loop:
; OPT-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 4
; OPT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT:    store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1
; OPT-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; OPT-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT:    br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT:       memcpy-split:
; OPT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
; OPT-NEXT:    [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 4
; OPT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
; OPT-NEXT:    store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 1
; OPT-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT:    [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 2
; OPT-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
; OPT-NEXT:    store i8 [[TMP10]], ptr addrspace(5) [[TMP11]], align 1
; OPT-NEXT:    ret void
;
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 1 %dst, ptr addrspace(5) align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align2_1027(
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(5) [[TMP8]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %dst, ptr addrspace(5) align 2 %src, i32 1027, i1 false)
  ret void
}

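; The same expansion with the underaligned side swapped: loads at align 1, stores at align 4.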
define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align1_1027(
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 4
; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256
; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1024
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1024
; OPT-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(5) [[TMP6]], align 1
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1024
; OPT-NEXT: store i16 [[TMP7]], ptr addrspace(5) [[TMP8]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, ptr addrspace(5) [[TMP9]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
; OPT-NEXT: store i8 [[TMP10]], ptr addrspace(5) [[TMP11]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %dst, ptr addrspace(5) align 1 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(ptr addrspace(5) %dst, ptr addrspace(5) %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align2_1027(
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[TMP1]], align 2
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP2]], ptr addrspace(5) [[TMP3]], align 2
; OPT-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 2
; OPT-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 1026
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 1026
; OPT-NEXT: [[TMP7:%.*]] = load i8, ptr addrspace(5) [[TMP6]], align 2
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 1026
; OPT-NEXT: store i8 [[TMP7]], ptr addrspace(5) [[TMP8]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 2 %dst, ptr addrspace(5) align 2 %src, i32 1027, i1 false)
  ret void
}

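; For a variable size the expansion splits %n into a main count (rounded down to the access width) and a residual: a wide main loop, then a byte-wise residual loop behind a residual header.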
define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 1
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(1) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(1) [[TMP7]], align 2
; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 2
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 2 %dst, ptr addrspace(1) align 2 %src, i64 %n, i1 false)
  ret void
}

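; As in the fixed-size cases, align 1 does not narrow the global access width; the main loop still moves <4 x i32> per iteration, just with align 1 accesses.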
define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 1 %dst, ptr addrspace(1) align 1 %src, i64 %n, i1 false)
  ret void
}

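; LDS (addrspace(3)) copies use a <2 x i32> (8-byte) access width and an i32 loop index.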
define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 1
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(3) [[TMP5]], align 2
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP6]], ptr addrspace(3) [[TMP7]], align 2
; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 2
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 2 %dst, ptr addrspace(3) align 2 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 1
; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3.p3.i32(ptr addrspace(3) align 1 %dst, ptr addrspace(3) align 1 %src, i32 %n, i1 false)
  ret void
}

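; Mixed local/global copies inherit the narrower LDS parameters in both directions: <2 x i32> chunks and an i32 index.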
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(ptr addrspace(3) %dst, ptr addrspace(1) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(3) [[TMP7]], align 4
; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 %dst, ptr addrspace(1) align 4 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[N:%.*]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[N]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 4
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
; OPT-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i32 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %dst, ptr addrspace(3) align 4 %src, i32 %n, i1 false)
  ret void
}

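; Small constant sizes stay below the 1024-byte threshold, so MAX1024 keeps the intrinsic call while ALL (-mem-intrinsic-expand-size=0) unrolls into straight-line loads and stores of the widest type the size permits.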
define amdgpu_kernel void @memcpy_global_align4_global_align4_16(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_16(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 16, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_16(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; ALL-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 16, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_12(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_12(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 12, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_12(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; ALL-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 8
; ALL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(1) [[TMP4]], align 4
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 8
; ALL-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 12, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_8(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_8(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 8, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_8(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; ALL-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 8, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_10(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_10(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 10, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_10(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; ALL-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 8
; ALL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 4
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 8
; ALL-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 10, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_4(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_4(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 4, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_4(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; ALL-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 4, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_2(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_2(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 2, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_2(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(1) [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; ALL-NEXT: store i16 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 2, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_1(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 [[DST:%.*]], ptr addrspace(1) align 4 [[SRC:%.*]], i64 1, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_1(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; ALL-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 4 %dst, ptr addrspace(1) align 4 %src, i64 1, i1 false)
  ret void
}

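; When one memmove operand is flat it may alias the other, so the expansion addrspacecasts the destination to make the pointers comparable and branches on a runtime src < dst test to either a backward or a forward copy loop.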
define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrspace(1) %src) {
; MAX1024-LABEL: @memmove_flat_align1_global_align1(
; MAX1024-NEXT: call void @llvm.memmove.p0.p1.i64(ptr [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_flat_align1_global_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(1)
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP1]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p0.p1.i64(ptr %dst, ptr addrspace(1) %src, i64 256, i1 false)
  ret void
}

define amdgpu_kernel void @memmove_global_align1_flat_align1(ptr addrspace(1) %dst, ptr %src) {
; MAX1024-LABEL: @memmove_global_align1_flat_align1(
; MAX1024-NEXT: call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_global_align1_flat_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP7]], align 1
; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) %dst, ptr %src, i64 256, i1 false)
  ret void
}

define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addrspace(5) %src) {
; MAX1024-LABEL: @memmove_flat_align1_private_align1(
; MAX1024-NEXT: call void @llvm.memmove.p0.p5.i64(ptr [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_flat_align1_private_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(5)
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[TMP1]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[BWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i64 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[FWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p0.p5.i64(ptr %dst, ptr addrspace(5) %src, i64 256, i1 false)
  ret void
}

define amdgpu_kernel void @memmove_private_align1_flat_align1(ptr addrspace(5) %dst, ptr %src) {
; MAX1024-LABEL: @memmove_private_align1_flat_align1(
; MAX1024-NEXT: call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) [[DST:%.*]], ptr [[SRC:%.*]], i64 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_private_align1_flat_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DST:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP2]], 256
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[BWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i64 [[FWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP7]], align 1
; ALL-NEXT: [[TMP8]] = add i64 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) %dst, ptr %src, i64 256, i1 false)
  ret void
}

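; Private and global cannot alias, so these memmoves degenerate to a single forward load/store loop; the disjointness is recorded with !alias.scope/!noalias metadata.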
define amdgpu_kernel void @memmove_private_align1_global_align1(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; MAX1024-LABEL: @memmove_private_align1_global_align1(
; MAX1024-NEXT: call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_private_align1_global_align1(
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP1]], align 1, !alias.scope [[META0:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META0]]
; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) %dst, ptr addrspace(1) %src, i64 256, i1 false)
  ret void
}

define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1) %dst, ptr addrspace(5) %src) {
; MAX1024-LABEL: @memmove_global_align1_private_align1(
; MAX1024-NEXT: call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i64 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_global_align1_private_align1(
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP2:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT: store <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1, !noalias [[META3]]
; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) %dst, ptr addrspace(5) %src, i64 256, i1 false)
  ret void
}

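; Variable-size memmove between castable address spaces computes the residual (size & 15) and main counts up front, then both the backward and the forward path run a <4 x i32> main loop and an i8 residual loop.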
define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size) {
; OPT-LABEL: @memmove_global_align1_p999_align1(
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr addrspace(999)
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(999) [[SRC:%.*]], [[TMP4]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
; OPT: memmove_copy_backwards:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_bwd_residual_loop:
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(999) [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
; OPT: memmove_bwd_middle:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP14]], align 1
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_fwd_residual_loop:
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(999) [[TMP17]], align 1
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(1) [[TMP18]], align 1
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size, i1 false)
  ret void
}

define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) {
; OPT-LABEL: @memmove_p999_align1_p1_align1(
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(1)
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP4]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
; OPT: memmove_copy_backwards:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_bwd_residual_loop:
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
; OPT: memmove_bwd_middle:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_fwd_residual_loop:
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(1) [[TMP17]], align 1
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size, i1 false)
  ret void
}

define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size) {
; OPT-LABEL: @memmove_p999_align1_p998_align1(
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP3]], 0
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(998)
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(998) [[SRC:%.*]], [[TMP4]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
; OPT: memmove_copy_backwards:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_bwd_residual_loop:
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(998) [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
; OPT: memmove_bwd_middle:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 16
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[BWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_fwd_residual_loop:
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(998) [[TMP17]], align 1
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size, i1 false)
  ret void
}

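; LDS and private cannot alias either, so a fixed-size memmove between them becomes a forward <2 x i32> loop with scoped-alias metadata.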
define amdgpu_kernel void @memmove_local_align1_private_align1(ptr addrspace(3) %dst, ptr addrspace(5) %src) {
; MAX1024-LABEL: @memmove_local_align1_private_align1(
; MAX1024-NEXT: call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_local_align1_private_align1(
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP1]], align 1, !alias.scope [[META6:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(3) [[TMP3]], align 1, !noalias [[META6]]
; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 256, i1 false)
  ret void
}
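
; Same pair with an unknown size: dynamically sized intrinsics are expanded
; regardless of -mem-intrinsic-expand-size, so MAX1024 and ALL both see the
; main-loop-plus-residual form.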
define amdgpu_kernel void @memmove_local_align1_private_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size) {
; MAX1024-LABEL: @memmove_local_align1_private_align1_unknown_size(
; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META0:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META0]]
; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META0]]
; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META0]]
; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; MAX1024: post-loop-memcpy-expansion:
; MAX1024-NEXT: ret void
; MAX1024: loop-memcpy-residual-header:
; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_local_align1_private_align1_unknown_size(
; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP7]], align 1, !alias.scope [[META9:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(3) [[TMP6]], align 1, !noalias [[META9]]
; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[TMP10]]
; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(5) [[TMP11]], align 1, !alias.scope [[META9]]
; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[TMP10]]
; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(3) [[TMP13]], align 1, !noalias [[META9]]
; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; ALL: post-loop-memcpy-expansion:
; ALL-NEXT: ret void
; ALL: loop-memcpy-residual-header:
; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memmove.p3.p5.i32(ptr addrspace(3) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
  ret void
}
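
; Opposite direction (private destination, local source): still no possible
; aliasing, still a simple forward copy loop.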
define amdgpu_kernel void @memmove_private_align1_local_align1(ptr addrspace(5) %dst, ptr addrspace(3) %src) {
; MAX1024-LABEL: @memmove_private_align1_local_align1(
; MAX1024-NEXT: call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_private_align1_local_align1(
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP1]], align 1, !alias.scope [[META12:![0-9]+]]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[TMP3]], align 1, !noalias [[META12]]
; ALL-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 8
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 256
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 256, i1 false)
  ret void
}
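
; The unknown-size private/local pair is likewise always expanded, under both
; run configurations.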
define amdgpu_kernel void @memmove_private_align1_local_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size) {
; MAX1024-LABEL: @memmove_private_align1_local_align1_unknown_size(
; MAX1024-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
; MAX1024-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; MAX1024-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; MAX1024-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META3:![0-9]+]]
; MAX1024-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; MAX1024-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META3]]
; MAX1024-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; MAX1024-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; MAX1024-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META3]]
; MAX1024-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
; MAX1024-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META3]]
; MAX1024-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; MAX1024-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; MAX1024: post-loop-memcpy-expansion:
; MAX1024-NEXT: ret void
; MAX1024: loop-memcpy-residual-header:
; MAX1024-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; MAX1024-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memmove_private_align1_local_align1_unknown_size(
; ALL-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
; ALL-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; ALL-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP7]], align 1, !alias.scope [[META15:![0-9]+]]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST:%.*]], i32 [[LOOP_INDEX]]
; ALL-NEXT: store <2 x i32> [[TMP5]], ptr addrspace(5) [[TMP6]], align 1, !noalias [[META15]]
; ALL-NEXT: [[TMP8]] = add i32 [[LOOP_INDEX]], 8
; ALL-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT: [[TMP10:%.*]] = add i32 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[TMP10]]
; ALL-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1, !alias.scope [[META15]]
; ALL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[TMP10]]
; ALL-NEXT: store i8 [[TMP12]], ptr addrspace(5) [[TMP13]], align 1, !noalias [[META15]]
; ALL-NEXT: [[TMP14]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT: [[TMP15:%.*]] = icmp ult i32 [[TMP14]], [[TMP2]]
; ALL-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; ALL: post-loop-memcpy-expansion:
; ALL-NEXT: ret void
; ALL: loop-memcpy-residual-header:
; ALL-NEXT: [[TMP16:%.*]] = icmp ne i32 [[TMP2]], 0
; ALL-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memmove.p5.p3.i32(ptr addrspace(5) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
  ret void
}
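
; A flat destination may alias the local source, so the expansion must pick a
; copy direction at run time: the destination is addrspacecast into the local
; address space and compared against the source pointer.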
define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %dst, ptr addrspace(3) %src) {
; MAX1024-LABEL: @memmove_flat_align1_local_align1(
; MAX1024-NEXT: call void @llvm.memmove.p0.p3.i32(ptr [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_flat_align1_local_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3)
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP1]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_INDEX]]
; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_INDEX]]
; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP7]], align 1
; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 256, i1 false)
  ret void
}
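
; Unknown-size variant of the flat/local pair: the residual/main split is
; layered on top of the same runtime direction check.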
define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_flat_align1_local_align1_unknown_size(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3)
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP4]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
; OPT: memmove_copy_backwards:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_bwd_residual_loop:
; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT]], ptr [[TMP7]], align 1
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
; OPT: memmove_bwd_middle:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr [[TMP14]], align 1
; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_fwd_residual_loop:
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP17]], align 1
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT3]], ptr [[TMP18]], align 1
; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p0.p3.i32(ptr addrspace(0) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
  ret void
}
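
; Local destination, flat source: here the local destination is addrspacecast
; to a flat pointer for the direction comparison.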
define amdgpu_kernel void @memmove_local_align1_flat_align1(ptr addrspace(3) %dst, ptr addrspace(0) %src) {
; MAX1024-LABEL: @memmove_local_align1_flat_align1(
; MAX1024-NEXT: call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) [[DST:%.*]], ptr [[SRC:%.*]], i32 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_local_align1_flat_align1(
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP1]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP2]], 8
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP4]], align 1
; ALL-NEXT: [[TMP5:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP5]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP8:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP6]], align 1
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP7]], align 1
; ALL-NEXT: [[TMP8]] = add i32 [[FWD_INDEX]], 8
; ALL-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 256
; ALL-NEXT: br i1 [[TMP9]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 256, i1 false)
  ret void
}
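
; Unknown-size variant of the same local/flat pair.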
define amdgpu_kernel void @memmove_local_align1_flat_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_flat_align1_unknown_size(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[DST:%.*]] to ptr
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[SRC:%.*]], [[TMP4]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
; OPT: memmove_copy_backwards:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_bwd_residual_loop:
; OPT-NEXT: [[TMP5:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP5]], 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP7]], align 1
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
; OPT: memmove_bwd_middle:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP9:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP9]], 8
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP11]], align 1
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr [[TMP13]], align 1
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP14]], align 1
; OPT-NEXT: [[TMP15]] = add i32 [[FWD_MAIN_INDEX]], 8
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_fwd_residual_loop:
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr [[TMP17]], align 1
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP18]], align 1
; OPT-NEXT: [[TMP19]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i32 [[TMP19]], [[SIZE]]
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p3.p0.i32(ptr addrspace(3) %dst, ptr addrspace(0) %src, i32 %size, i1 false)
  ret void
}
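
; Within a single address space the two pointers are compared directly, with
; no addrspacecast.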
define amdgpu_kernel void @memmove_local_align1_local_align1(ptr addrspace(3) %dst, ptr addrspace(3) %src) {
; MAX1024-LABEL: @memmove_local_align1_local_align1(
; MAX1024-NEXT: call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) [[DST:%.*]], ptr addrspace(3) [[SRC:%.*]], i32 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_local_align1_local_align1(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 8
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP2]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_INDEX]]
; ALL-NEXT: store <2 x i32> [[ELEMENT]], ptr addrspace(3) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP5]], align 1
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_INDEX]]
; ALL-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP6]], align 1
; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 8
; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 256, i1 false)
  ret void
}
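
; Unknown-size local-to-local memmove: a byte residual loop wraps the
; <2 x i32> main loop in each copy direction.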
define amdgpu_kernel void @memmove_local_align1_local_align1_unknown_size(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size) {
; OPT-LABEL: @memmove_local_align1_local_align1_unknown_size(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 7
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
; OPT: memmove_copy_backwards:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_bwd_residual_loop:
; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(3) [[TMP5]], align 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(3) [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
; OPT: memmove_bwd_middle:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 8
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP9]], align 1
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: store <2 x i32> [[ELEMENT1]], ptr addrspace(3) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load <2 x i32>, ptr addrspace(3) [[TMP12]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: store <2 x i32> [[ELEMENT2]], ptr addrspace(3) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 8
; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_fwd_residual_loop:
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(3) [[TMP16]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(3) [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p3.p3.i32(ptr addrspace(3) %dst, ptr addrspace(3) %src, i32 %size, i1 false)
  ret void
}
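
; Private-to-private copies use the widest 256-byte <64 x i32> element, so the
; 256-byte case is a single iteration in either direction.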
define amdgpu_kernel void @memmove_private_align1_private_align1(ptr addrspace(5) %dst, ptr addrspace(5) %src) {
; MAX1024-LABEL: @memmove_private_align1_private_align1(
; MAX1024-NEXT: call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) [[DST:%.*]], ptr addrspace(5) [[SRC:%.*]], i32 256, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_private_align1_private_align1(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP1:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 256, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i32 [[TMP1]], 256
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP2]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(5) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = icmp eq i32 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i32 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(5) [[TMP5]], align 1
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_INDEX]]
; ALL-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP6]], align 1
; ALL-NEXT: [[TMP7]] = add i32 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 256
; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 256, i1 false)
  ret void
}
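
; With an unknown size, the private main loops fall back to 16-byte <4 x i32>
; elements plus a byte residual.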
define amdgpu_kernel void @memmove_private_align1_private_align1_unknown_size(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size) {
; OPT-LABEL: @memmove_private_align1_private_align1_unknown_size(
; OPT-NEXT: [[TMP2:%.*]] = and i32 [[SIZE:%.*]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP3]], 0
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
; OPT: memmove_copy_backwards:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_bwd_residual_loop:
; OPT-NEXT: [[TMP4:%.*]] = phi i32 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i32 [[TMP4]], 1
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(5) [[TMP5]], align 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(5) [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = icmp eq i32 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
; OPT: memmove_bwd_middle:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
; OPT: memmove_bwd_main_loop:
; OPT-NEXT: [[TMP8:%.*]] = phi i32 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP3]], [[MEMMOVE_BWD_MIDDLE]] ]
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i32 [[TMP8]], 16
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP9]], align 1
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[BWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(5) [[TMP10]], align 1
; OPT-NEXT: [[TMP11:%.*]] = icmp eq i32 [[BWD_MAIN_INDEX]], 0
; OPT-NEXT: br i1 [[TMP11]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
; OPT: memmove_copy_forward:
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
; OPT: memmove_fwd_main_loop:
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i32 [ [[TMP14:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(5) [[TMP12]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_MAIN_INDEX]]
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(5) [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i32 [[FWD_MAIN_INDEX]], 16
; OPT-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP14]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP15]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
; OPT: memmove_fwd_middle:
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
; OPT: memmove_fwd_residual_loop:
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i32 [ [[TMP18:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
; OPT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[SRC]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(5) [[TMP16]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[DST]], i32 [[FWD_RESIDUAL_INDEX]]
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(5) [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i32 [[FWD_RESIDUAL_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp eq i32 [[TMP18]], [[SIZE]]
; OPT-NEXT: br i1 [[TMP19]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) %dst, ptr addrspace(5) %src, i32 %size, i1 false)
  ret void
}
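
; 1280 bytes is an exact multiple of the 256-byte element, so the static
; expansion needs no residual copies.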
define amdgpu_kernel void @memmove_global_align4_static_residual_empty(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
; OPT-LABEL: @memmove_global_align4_static_residual_empty(
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; OPT: memmove_bwd_loop:
; OPT-NEXT: [[TMP11:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1280, [[TMP0:%.*]] ]
; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP11]], 256
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP2]], align 1
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; OPT-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; OPT: memmove_fwd_loop:
; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; OPT-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
; OPT-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 1280
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1280, i1 false)
  ret void
}
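
; 1039 bytes leaves a 15-byte residual (8 + 4 + 2 + 1), emitted as unrolled
; i64/i32/i16/i8 pieces: before the main loop (highest bytes first) when
; copying backwards, after it when copying forwards.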
define amdgpu_kernel void @memmove_global_align4_static_residual_full(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
; OPT-LABEL: @memmove_global_align4_static_residual_full(
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; OPT: memmove_bwd_residual:
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[TMP1]], align 1
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
; OPT-NEXT: store i8 [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
; OPT-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[TMP4]], align 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
; OPT-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[TMP6]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
; OPT-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
; OPT-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 1
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(1) [[TMP10]], align 1
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT: store i64 [[TMP11]], ptr addrspace(1) [[TMP12]], align 1
; OPT-NEXT: br label [[MEMMOVE_BWD_LOOP:%.*]]
; OPT: memmove_bwd_loop:
; OPT-NEXT: [[TMP13:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 1024, [[MEMMOVE_BWD_RESIDUAL]] ]
; OPT-NEXT: [[BWD_INDEX]] = sub i64 [[TMP13]], 256
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; OPT-NEXT: [[ELEMENT:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP14]], align 1
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; OPT-NEXT: store <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP15]], align 1
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; OPT: memmove_fwd_loop:
; OPT-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0:%.*]] ]
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; OPT-NEXT: [[ELEMENT1:%.*]] = load <64 x i32>, ptr addrspace(1) [[TMP17]], align 1
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; OPT-NEXT: store <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP18]], align 1
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_INDEX]], 256
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 1024
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_FWD_RESIDUAL:%.*]], label [[MEMMOVE_FWD_LOOP]]
; OPT: memmove_fwd_residual:
; OPT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1024
; OPT-NEXT: [[TMP22:%.*]] = load i64, ptr addrspace(1) [[TMP21]], align 1
; OPT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
; OPT-NEXT: store i64 [[TMP22]], ptr addrspace(1) [[TMP23]], align 1
; OPT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1032
; OPT-NEXT: [[TMP25:%.*]] = load i32, ptr addrspace(1) [[TMP24]], align 1
; OPT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1032
; OPT-NEXT: store i32 [[TMP25]], ptr addrspace(1) [[TMP26]], align 1
; OPT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1036
; OPT-NEXT: [[TMP28:%.*]] = load i16, ptr addrspace(1) [[TMP27]], align 1
; OPT-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1036
; OPT-NEXT: store i16 [[TMP28]], ptr addrspace(1) [[TMP29]], align 1
; OPT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 1038
; OPT-NEXT: [[TMP31:%.*]] = load i8, ptr addrspace(1) [[TMP30]], align 1
; OPT-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1038
; OPT-NEXT: store i8 [[TMP31]], ptr addrspace(1) [[TMP32]], align 1
; OPT-NEXT: br label [[MEMMOVE_DONE]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 1039, i1 false)
  ret void
}
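
; The memcpy length here is a umin-clamped constant expression rather than a
; plain ConstantInt, so the expansion must take the dynamic-size path.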
define void @test_umin(i64 %0, i64 %idxprom, ptr %x, ptr %y) {
; OPT-LABEL: @test_umin(
; OPT-NEXT: entry:
; OPT-NEXT: [[ARRAYIDX:%.*]] = getelementptr [32 x [8 x i64]], ptr [[Y:%.*]], i64 0, i64 [[IDXPROM:%.*]]
; OPT-NEXT: [[SPEC_SELECT:%.*]] = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56)
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SPEC_SELECT]], 15
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SPEC_SELECT]], [[TMP2]]
; OPT-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP7]], align 1
; OPT-NEXT: [[TMP8]] = add i64 [[LOOP_INDEX]], 16
; OPT-NEXT: [[TMP9:%.*]] = icmp ult i64 [[TMP8]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP9]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP14:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP10:%.*]] = add i64 [[TMP3]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP10]]
; OPT-NEXT: [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX]], i64 [[TMP10]]
; OPT-NEXT: store i8 [[TMP12]], ptr [[TMP13]], align 1
; OPT-NEXT: [[TMP14]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP15:%.*]] = icmp ult i64 [[TMP14]], [[TMP2]]
; OPT-NEXT: br i1 [[TMP15]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP2]], 0
; OPT-NEXT: br i1 [[TMP16]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
entry:
  %arrayidx = getelementptr [32 x [8 x i64]], ptr %y, i64 0, i64 %idxprom
  %spec.select = tail call i64 @llvm.umin.i64(i64 sub (i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) inttoptr (i64 32 to ptr addrspace(4)) to ptr) to i64), i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 56)
  tail call void @llvm.memcpy.p0.p0.i64(ptr %arrayidx, ptr %x, i64 %spec.select, i1 false)
  ret void
}
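
; Volatile memmove stays an intrinsic call under the 1024-byte threshold; when
; forced to expand, the loop loads and stores are marked volatile.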
define amdgpu_kernel void @memmove_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memmove_volatile(
; MAX1024-NEXT: call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memmove_volatile(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
; ALL: memmove_bwd_loop:
; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 512, [[TMP0:%.*]] ]
; ALL-NEXT: [[BWD_INDEX]] = sub i64 [[TMP1]], 256
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_INDEX]]
; ALL-NEXT: [[ELEMENT:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP2]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_INDEX]]
; ALL-NEXT: store volatile <64 x i32> [[ELEMENT]], ptr addrspace(1) [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[BWD_INDEX]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_LOOP]]
; ALL: memmove_fwd_loop:
; ALL-NEXT: [[FWD_INDEX:%.*]] = phi i64 [ [[TMP7:%.*]], [[MEMMOVE_FWD_LOOP]] ], [ 0, [[TMP0]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_INDEX]]
; ALL-NEXT: [[ELEMENT1:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP5]], align 1
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_INDEX]]
; ALL-NEXT: store volatile <64 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP6]], align 1
; ALL-NEXT: [[TMP7]] = add i64 [[FWD_INDEX]], 256
; ALL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP7]], 512
; ALL-NEXT: br i1 [[TMP8]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true)
  ret void
}
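
; Same check for volatile memcpy.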
define amdgpu_kernel void @memcpy_volatile(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
; MAX1024-LABEL: @memcpy_volatile(
; MAX1024-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 512, i1 true)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_volatile(
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP2:%.*]] = load volatile <64 x i32>, ptr addrspace(1) [[TMP1]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
; ALL-NEXT: store volatile <64 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 1
; ALL-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256
; ALL-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 512
; ALL-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 512, i1 true)
  ret void
}
declare i64 @llvm.umin.i64(i64, i64)

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }