; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=1024 %s | FileCheck -check-prefixes=OPT,MAX1024 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-lower-intrinsics -amdgpu-mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefixes=OPT,ALL %s
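
; The two RUN lines differ only in the expansion threshold: with 1024 (the
; MAX1024 prefix) fixed-size intrinsics of up to 1024 bytes are left as calls,
; while -1 appears to disable the threshold entirely so that even the
; 1024-byte cases are expanded (the ALL prefix).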

declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture readonly, i32, i1) #1
declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1

declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) #1
declare void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p5i8.p5i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture readonly, i32, i1) #1

declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i1) #1

; Test the upper bound for sizes to leave
define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memcpy_caller0(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 addrspace(1)* [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @max_size_small_static_memcpy_caller0(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 1
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; ALL-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 1
; ALL-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; ALL-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false)
  ret void
}

; Smallest static size which will be expanded
define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @min_size_large_static_memcpy_caller0(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 1
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 1
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1024
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 1
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1024
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 1
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i1 false)
  ret void
}
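
; memmove cannot assume the two ranges are disjoint, so instead of the
; vectorized memcpy loop the expansion below emits a runtime src/dst
; comparison and selects between a backward and a forward byte-copy loop.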
define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @max_size_small_static_memmove_caller0(
; MAX1024-NEXT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 addrspace(1)* [[SRC:%.*]], i64 1024, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @max_size_small_static_memmove_caller0(
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult i8 addrspace(1)* [[SRC:%.*]], [[DST:%.*]]
; ALL-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1024, 0
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
; ALL: copy_backwards:
; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
; ALL: copy_backwards_loop:
; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1024, [[COPY_BACKWARDS]] ]
; ALL-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
; ALL-NEXT: [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
; ALL-NEXT: store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
; ALL-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
; ALL-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
; ALL: copy_forward:
; ALL-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
; ALL: copy_forward_loop:
; ALL-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
; ALL-NEXT: [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
; ALL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
; ALL-NEXT: store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
; ALL-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
; ALL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1024
; ALL-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
; ALL: memmove_done:
; ALL-NEXT: ret void
;
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false)
  ret void
}

define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @min_size_large_static_memmove_caller0(
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult i8 addrspace(1)* [[SRC:%.*]], [[DST:%.*]]
; OPT-NEXT: [[COMPARE_N_TO_0:%.*]] = icmp eq i64 1025, 0
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[COPY_BACKWARDS:%.*]], label [[COPY_FORWARD:%.*]]
; OPT: copy_backwards:
; OPT-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE:%.*]], label [[COPY_BACKWARDS_LOOP:%.*]]
; OPT: copy_backwards_loop:
; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
; OPT-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
; OPT-NEXT: store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
; OPT: copy_forward:
; OPT-NEXT: br i1 [[COMPARE_N_TO_0]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP:%.*]]
; OPT: copy_forward_loop:
; OPT-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
; OPT-NEXT: store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
; OPT-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
; OPT: memmove_done:
; OPT-NEXT: ret void
;
  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i1 false)
  ret void
}
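
; For a constant size the n == 0 guard folds to a constant-false branch, so
; the memset expansion below is just a byte store loop with a static trip
; count.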
define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
; MAX1024-LABEL: @max_size_small_static_memset_caller0(
; MAX1024-NEXT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* [[DST:%.*]], i8 [[VAL:%.*]], i64 1024, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @max_size_small_static_memset_caller0(
; ALL-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; ALL: loadstoreloop:
; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
; ALL-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
; ALL-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; ALL-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024
; ALL-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; ALL: split:
; ALL-NEXT: ret void
;
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i1 false)
  ret void
}

define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
; OPT-LABEL: @min_size_large_static_memset_caller0(
; OPT-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
; OPT: loadstoreloop:
; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
; OPT-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
; OPT-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
; OPT: split:
; OPT-NEXT: ret void
;
  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i1 false)
  ret void
}
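
; With a runtime size the expansion splits n into n / 16 main-loop iterations
; over <4 x i32> and an n % 16 byte residual loop, each guarded by its own
; zero check.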
define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller1(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
; OPT-LABEL: @memcpy_multi_use_one_function(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION2:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5:%.*]]
; OPT: loop-memcpy-expansion2:
; OPT-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX3]]
; OPT-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX3]]
; OPT-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX3]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION2]], label [[LOOP_MEMCPY_RESIDUAL_HEADER5]]
; OPT: loop-memcpy-residual4:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX6:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER5]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL4:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX6]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX6]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1:%.*]]
; OPT: post-loop-memcpy-expansion1:
; OPT-NEXT: [[TMP20:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP21:%.*]] = bitcast i8 addrspace(1)* [[DST1:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP22:%.*]] = udiv i64 [[M:%.*]], 16
; OPT-NEXT: [[TMP23:%.*]] = urem i64 [[M]], 16
; OPT-NEXT: [[TMP24:%.*]] = sub i64 [[M]], [[TMP23]]
; OPT-NEXT: [[TMP25:%.*]] = icmp ne i64 [[TMP22]], 0
; OPT-NEXT: br i1 [[TMP25]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP29:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP26:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP20]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP27:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP26]], align 1
; OPT-NEXT: [[TMP28:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP21]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP27]], <4 x i32> addrspace(1)* [[TMP28]], align 1
; OPT-NEXT: [[TMP29]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP30:%.*]] = icmp ult i64 [[TMP29]], [[TMP22]]
; OPT-NEXT: br i1 [[TMP30]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP37:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP33:%.*]] = add i64 [[TMP24]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP31]], i64 [[TMP33]]
; OPT-NEXT: [[TMP35:%.*]] = load i8, i8 addrspace(1)* [[TMP34]], align 1
; OPT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP32]], i64 [[TMP33]]
; OPT-NEXT: store i8 [[TMP35]], i8 addrspace(1)* [[TMP36]], align 1
; OPT-NEXT: [[TMP37]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP38:%.*]] = icmp ult i64 [[TMP37]], [[TMP23]]
; OPT-NEXT: br i1 [[TMP38]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP39:%.*]] = icmp ne i64 [[TMP23]], 0
; OPT-NEXT: br i1 [[TMP39]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
; OPT: loop-memcpy-residual-header5:
; OPT-NEXT: [[TMP40:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP40]], label [[LOOP_MEMCPY_RESIDUAL4]], label [[POST_LOOP_MEMCPY_EXPANSION1]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_alt_type(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i1 false)
  ret void
}

; One of the uses in the function should be expanded, the other left alone.
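; Under the 1024 threshold only the variable-size copy is expanded and the
; 102 byte copy stays as an intrinsic call; with expansion forced, the 102
; bytes become a 6 iteration <4 x i32> loop (96 bytes) plus i32 and i16 tail
; accesses.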
define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
; MAX1024-LABEL: @memcpy_multi_use_one_function_keep_small(
; MAX1024-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; MAX1024-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; MAX1024-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; MAX1024-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; MAX1024-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; MAX1024-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; MAX1024-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; MAX1024: loop-memcpy-expansion:
; MAX1024-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; MAX1024-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; MAX1024-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; MAX1024-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; MAX1024-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; MAX1024-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; MAX1024: loop-memcpy-residual:
; MAX1024-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; MAX1024-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; MAX1024-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; MAX1024-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; MAX1024-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; MAX1024-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; MAX1024-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; MAX1024-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; MAX1024-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; MAX1024-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; MAX1024-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; MAX1024: post-loop-memcpy-expansion:
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* [[DST1:%.*]], i8 addrspace(1)* [[SRC]], i64 102, i1 false)
; MAX1024-NEXT: ret void
; MAX1024: loop-memcpy-residual-header:
; MAX1024-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; MAX1024-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
; ALL-LABEL: @memcpy_multi_use_one_function_keep_small(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST0:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; ALL-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; ALL-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; ALL-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; ALL-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; ALL: loop-memcpy-expansion:
; ALL-NEXT: [[LOOP_INDEX1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX1]]
; ALL-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; ALL-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX1]]
; ALL-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; ALL-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX1]], 1
; ALL-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; ALL-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; ALL: loop-memcpy-residual:
; ALL-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; ALL-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; ALL-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; ALL-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; ALL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; ALL-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; ALL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; ALL-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; ALL-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; ALL-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; ALL-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; ALL: post-loop-memcpy-expansion:
; ALL-NEXT: [[TMP20:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to <4 x i32> addrspace(1)*
; ALL-NEXT: [[TMP21:%.*]] = bitcast i8 addrspace(1)* [[DST1:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION]] ], [ [[TMP25:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP22:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP20]], i64 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP23:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP22]], align 1
; ALL-NEXT: [[TMP24:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP21]], i64 [[LOOP_INDEX]]
; ALL-NEXT: store <4 x i32> [[TMP23]], <4 x i32> addrspace(1)* [[TMP24]], align 1
; ALL-NEXT: [[TMP25]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP26:%.*]] = icmp ult i64 [[TMP25]], 6
; ALL-NEXT: br i1 [[TMP26]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: [[TMP27:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP27]], i64 24
; ALL-NEXT: [[TMP29:%.*]] = load i32, i32 addrspace(1)* [[TMP28]], align 1
; ALL-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP30]], i64 24
; ALL-NEXT: store i32 [[TMP29]], i32 addrspace(1)* [[TMP31]], align 1
; ALL-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP20]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP33:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP32]], i64 50
; ALL-NEXT: [[TMP34:%.*]] = load i16, i16 addrspace(1)* [[TMP33]], align 1
; ALL-NEXT: [[TMP35:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP21]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP36:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP35]], i64 50
; ALL-NEXT: store i16 [[TMP34]], i16 addrspace(1)* [[TMP36]], align 1
; ALL-NEXT: ret void
; ALL: loop-memcpy-residual-header:
; ALL-NEXT: [[TMP37:%.*]] = icmp ne i64 [[TMP4]], 0
; ALL-NEXT: br i1 [[TMP37]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i1 false)
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i1 false)
  ret void
}
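
; The static-size tests below all share the same 64 iteration <4 x i32> main
; loop; they differ only in how the 1-15 byte remainder past 1024 is covered,
; using the widest scalar types the 4 byte alignment allows (i64, i32, i16,
; then i8).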
define amdgpu_kernel void @memcpy_global_align4_global_align4_1028(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1028(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP8]], i64 256
; OPT-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP11]], i64 256
; OPT-NEXT: store i32 [[TMP10]], i32 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1028, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1025(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1025(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1024
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1024
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1025, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1026(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1026(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP8]], i64 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP11]], i64 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1026, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1032(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1032(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT: [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT: store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1032, i1 false)
  ret void
}
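
; Remainders larger than a single scalar are split into several accesses in
; decreasing width, e.g. the 10 extra bytes below become an i64 followed by
; an i16.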
define amdgpu_kernel void @memcpy_global_align4_global_align4_1034(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1034(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT: [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT: store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP13]], i64 516
; OPT-NEXT: [[TMP15:%.*]] = load i16, i16 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP16]], i64 516
; OPT-NEXT: store i16 [[TMP15]], i16 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1034, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1035(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1035(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT: [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT: store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP13]], i64 516
; OPT-NEXT: [[TMP15:%.*]] = load i16, i16 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP16]], i64 516
; OPT-NEXT: store i16 [[TMP15]], i16 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP18]], i64 1034
; OPT-NEXT: [[TMP20:%.*]] = load i8, i8 addrspace(1)* [[TMP19]], align 2
; OPT-NEXT: [[TMP21:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP21]], i64 1034
; OPT-NEXT: store i8 [[TMP20]], i8 addrspace(1)* [[TMP22]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1035, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1036(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1036(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT: [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT: store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP13]], i64 258
; OPT-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP16]], i64 258
; OPT-NEXT: store i32 [[TMP15]], i32 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1036, i1 false)
  ret void
}
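
; 15 bytes of remainder (1039) is the worst case: one access of each scalar
; width, i64 + i32 + i16 + i8.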
define amdgpu_kernel void @memcpy_global_align4_global_align4_1039(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1039(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP8]], i64 128
; OPT-NEXT: [[TMP10:%.*]] = load i64, i64 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i64 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP11]], i64 128
; OPT-NEXT: store i64 [[TMP10]], i64 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i32 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP13]], i64 258
; OPT-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[TMP14]], align 4
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i32 addrspace(1)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP16]], i64 258
; OPT-NEXT: store i32 [[TMP15]], i32 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP18]], i64 518
; OPT-NEXT: [[TMP20:%.*]] = load i16, i16 addrspace(1)* [[TMP19]], align 4
; OPT-NEXT: [[TMP21:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP21]], i64 518
; OPT-NEXT: store i16 [[TMP20]], i16 addrspace(1)* [[TMP22]], align 4
; OPT-NEXT: [[TMP23:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP23]], i64 1038
; OPT-NEXT: [[TMP25:%.*]] = load i8, i8 addrspace(1)* [[TMP24]], align 2
; OPT-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP26]], i64 1038
; OPT-NEXT: store i8 [[TMP25]], i8 addrspace(1)* [[TMP27]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1039, i1 false)
  ret void
}
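
; With only 2 byte alignment the main loop element type drops to i16: 519
; iterations cover 1038 bytes, leaving a single i8 tail.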
define amdgpu_kernel void @memcpy_global_align2_global_align2_1039(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_1039(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 519
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1038
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1038
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 2 %src, i64 1039, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP8]], i64 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP11]], i64 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(1)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 1026
; OPT-NEXT: [[TMP15:%.*]] = load i8, i8 addrspace(1)* [[TMP14]], align 2
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP16]], i64 1026
; OPT-NEXT: store i8 [[TMP15]], i8 addrspace(1)* [[TMP17]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1027, i1 false)
  ret void
}
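
; For mixed alignments the lower of the two governs the expansion, so both
; 2/4 combinations below degrade to the i16 loop used for align 2 (513
; iterations plus an i8 tail).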
define amdgpu_kernel void @memcpy_global_align2_global_align4_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 4 %src, i64 1027, i1 false)
  ret void
}
define amdgpu_kernel void @memcpy_global_align4_global_align2_1027(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align2_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(1)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(1)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP8]], i64 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP11]], i64 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(1)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 2 %src, i64 1027, i1 false)
  ret void
}

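; The same 1027-byte cases repeated for the private address space.
; addrspace(5) pointers are 32-bit, so the loop counters and GEP indices
; below are i32 rather than i64.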
define amdgpu_kernel void @memcpy_private_align4_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT: [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 2
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT: store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align2_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 2 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align1_private_align4_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align1_private_align4_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 4
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 1
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 4
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 1
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT: [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 2
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT: store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 1
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 1 %dst, i8 addrspace(5)* align 4 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align4_private_align2_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align2_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 2 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align4_private_align1_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align4_private_align1_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to <4 x i32> addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP3]], align 1
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(5)* [[TMP5]], align 4
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 64
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP8]], i32 512
; OPT-NEXT: [[TMP10:%.*]] = load i16, i16 addrspace(5)* [[TMP9]], align 1
; OPT-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP11]], i32 512
; OPT-NEXT: store i16 [[TMP10]], i16 addrspace(5)* [[TMP12]], align 4
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP13]], i32 1026
; OPT-NEXT: [[TMP15:%.*]] = load i8, i8 addrspace(5)* [[TMP14]], align 1
; OPT-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP16]], i32 1026
; OPT-NEXT: store i8 [[TMP15]], i8 addrspace(5)* [[TMP17]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 4 %dst, i8 addrspace(5)* align 1 %src, i32 1027, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_private_align2_private_align2_1027(i8 addrspace(5)* %dst, i8 addrspace(5)* %src) #0 {
; OPT-LABEL: @memcpy_private_align2_private_align2_1027(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[SRC:%.*]] to i16 addrspace(5)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(5)* [[DST:%.*]] to i16 addrspace(5)*
; OPT-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP4:%.*]] = load i16, i16 addrspace(5)* [[TMP3]], align 2
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP4]], i16 addrspace(5)* [[TMP5]], align 2
; OPT-NEXT: [[TMP6]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP6]], 513
; OPT-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; OPT: memcpy-split:
; OPT-NEXT: [[TMP8:%.*]] = bitcast i16 addrspace(5)* [[TMP1]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP8]], i32 1026
; OPT-NEXT: [[TMP10:%.*]] = load i8, i8 addrspace(5)* [[TMP9]], align 2
; OPT-NEXT: [[TMP11:%.*]] = bitcast i16 addrspace(5)* [[TMP2]] to i8 addrspace(5)*
; OPT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[TMP11]], i32 1026
; OPT-NEXT: store i8 [[TMP10]], i8 addrspace(5)* [[TMP12]], align 2
; OPT-NEXT: ret void
;
  call void @llvm.memcpy.p5i8.p5i8.i32(i8 addrspace(5)* align 2 %dst, i8 addrspace(5)* align 2 %src, i32 1027, i1 false)
  ret void
}

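; Variable-length copies cannot be fully unrolled at compile time. The
; expansion splits the count with udiv/urem into a main loop of wide copies
; and a byte-wise residual loop, each guarded so a zero trip count skips it.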
define amdgpu_kernel void @memcpy_global_align4_global_align4_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_global_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align2_global_align2_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align2_global_align2_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 2
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 2
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load i16, i16 addrspace(1)* [[TMP7]], align 2
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP8]], i16 addrspace(1)* [[TMP9]], align 2
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast i16 addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast i16 addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 2
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 2
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 2 %dst, i8 addrspace(1)* align 2 %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align1_global_align1_variable(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
; OPT-LABEL: @memcpy_global_align1_global_align1_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i64 [[N:%.*]], 16
; OPT-NEXT: [[TMP4:%.*]] = urem i64 [[N]], 16
; OPT-NEXT: [[TMP5:%.*]] = sub i64 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store <4 x i32> [[TMP8]], <4 x i32> addrspace(1)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i64 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i64 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i64 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i64 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i64 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 1 %dst, i8 addrspace(1)* align 1 %src, i64 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_local_align4_local_align4_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_local_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 4 %dst, i8 addrspace(3)* align 4 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_local_align2_local_align2_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align2_local_align2_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to i16 addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to i16 addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 2
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 2
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load i16, i16 addrspace(3)* [[TMP7]], align 2
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16 addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i16 [[TMP8]], i16 addrspace(3)* [[TMP9]], align 2
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast i16 addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast i16 addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 2
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 2
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 2 %dst, i8 addrspace(3)* align 2 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_local_align1_local_align1_variable(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align1_local_align1_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 1
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 1
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 1
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 1
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* align 1 %dst, i8 addrspace(3)* align 1 %src, i32 %n, i1 false)
  ret void
}

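; Copies between the local and global address spaces use the same loop
; structure; <2 x i32> remains the element type, and each pointer keeps its
; own address space throughout the expansion.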
define amdgpu_kernel void @memcpy_local_align4_global_align4_variable(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_local_align4_global_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(3)* [[DST:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(3)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP1]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP2]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(1)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(3)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %dst, i8 addrspace(1)* align 4 %src, i32 %n, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_local_align4_variable(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
; OPT-LABEL: @memcpy_global_align4_local_align4_variable(
; OPT-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[SRC:%.*]] to <2 x i32> addrspace(3)*
; OPT-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <2 x i32> addrspace(1)*
; OPT-NEXT: [[TMP3:%.*]] = udiv i32 [[N:%.*]], 8
; OPT-NEXT: [[TMP4:%.*]] = urem i32 [[N]], 8
; OPT-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[TMP4]]
; OPT-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP3]], 0
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION:%.*]], label [[LOOP_MEMCPY_RESIDUAL_HEADER:%.*]]
; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP10:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(3)* [[TMP1]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32> addrspace(3)* [[TMP7]], align 4
; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* [[TMP2]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store <2 x i32> [[TMP8]], <2 x i32> addrspace(1)* [[TMP9]], align 4
; OPT-NEXT: [[TMP10]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP3]]
; OPT-NEXT: br i1 [[TMP11]], label [[LOOP_MEMCPY_EXPANSION]], label [[LOOP_MEMCPY_RESIDUAL_HEADER]]
; OPT: loop-memcpy-residual:
; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, [[LOOP_MEMCPY_RESIDUAL_HEADER]] ], [ [[TMP18:%.*]], [[LOOP_MEMCPY_RESIDUAL:%.*]] ]
; OPT-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> addrspace(3)* [[TMP1]] to i8 addrspace(3)*
; OPT-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> addrspace(1)* [[TMP2]] to i8 addrspace(1)*
; OPT-NEXT: [[TMP14:%.*]] = add i32 [[TMP5]], [[RESIDUAL_LOOP_INDEX]]
; OPT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[TMP12]], i32 [[TMP14]]
; OPT-NEXT: [[TMP16:%.*]] = load i8, i8 addrspace(3)* [[TMP15]], align 4
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP13]], i32 [[TMP14]]
; OPT-NEXT: store i8 [[TMP16]], i8 addrspace(1)* [[TMP17]], align 4
; OPT-NEXT: [[TMP18]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; OPT-NEXT: [[TMP19:%.*]] = icmp ult i32 [[TMP18]], [[TMP4]]
; OPT-NEXT: br i1 [[TMP19]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION:%.*]]
; OPT: post-loop-memcpy-expansion:
; OPT-NEXT: ret void
; OPT: loop-memcpy-residual-header:
; OPT-NEXT: [[TMP20:%.*]] = icmp ne i32 [[TMP4]], 0
; OPT-NEXT: br i1 [[TMP20]], label [[LOOP_MEMCPY_RESIDUAL]], label [[POST_LOOP_MEMCPY_EXPANSION]]
;
  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %dst, i8 addrspace(3)* align 4 %src, i32 %n, i1 false)
  ret void
}

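; Small fixed sizes: below the 1024-byte threshold MAX1024 leaves the
; intrinsic in place, while ALL (threshold -1) still expands it. The
; remaining tests step the size down to show the static type selection.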
define amdgpu_kernel void @memcpy_global_align4_global_align4_16(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_16(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 16, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_16(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to <4 x i32> addrspace(1)*
; ALL-NEXT: br label [[LOAD_STORE_LOOP:%.*]]
; ALL: load-store-loop:
; ALL-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[LOAD_STORE_LOOP]] ]
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], i64 [[LOOP_INDEX]]
; ALL-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], i64 [[LOOP_INDEX]]
; ALL-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: [[TMP6]] = add i64 [[LOOP_INDEX]], 1
; ALL-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 1
; ALL-NEXT: br i1 [[TMP7]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
; ALL: memcpy-split:
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 16, i1 false)
  ret void
}

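; 12 bytes is not a power of two, so the expansion emits one i64 copy
; followed by one i32 copy; the sizes below (8, 10, 4, 2, 1) decompose
; in the same way.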
define amdgpu_kernel void @memcpy_global_align4_global_align4_12(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_12(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 12, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_12(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: [[TMP6:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i64 2
; ALL-NEXT: [[TMP8:%.*]] = load i32, i32 addrspace(1)* [[TMP7]], align 4
; ALL-NEXT: [[TMP9:%.*]] = bitcast i8 addrspace(1)* [[DST]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP9]], i64 2
; ALL-NEXT: store i32 [[TMP8]], i32 addrspace(1)* [[TMP10]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 12, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_8(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_8(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 8, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_8(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 8, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_10(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_10(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 10, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_10(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i64, i64 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i64 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i64 [[TMP3]], i64 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: [[TMP6:%.*]] = bitcast i8 addrspace(1)* [[SRC]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP6]], i64 4
; ALL-NEXT: [[TMP8:%.*]] = load i16, i16 addrspace(1)* [[TMP7]], align 4
; ALL-NEXT: [[TMP9:%.*]] = bitcast i8 addrspace(1)* [[DST]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP9]], i64 4
; ALL-NEXT: store i16 [[TMP8]], i16 addrspace(1)* [[TMP10]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 10, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_4(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_4(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 4, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_4(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i32 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i32 [[TMP3]], i32 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 4, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_2(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_2(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 2, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_2(
; ALL-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[SRC:%.*]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP1]], i64 0
; ALL-NEXT: [[TMP3:%.*]] = load i16, i16 addrspace(1)* [[TMP2]], align 4
; ALL-NEXT: [[TMP4:%.*]] = bitcast i8 addrspace(1)* [[DST:%.*]] to i16 addrspace(1)*
; ALL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16 addrspace(1)* [[TMP4]], i64 0
; ALL-NEXT: store i16 [[TMP3]], i16 addrspace(1)* [[TMP5]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 2, i1 false)
  ret void
}

define amdgpu_kernel void @memcpy_global_align4_global_align4_1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
; MAX1024-LABEL: @memcpy_global_align4_global_align4_1(
; MAX1024-NEXT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 [[DST:%.*]], i8 addrspace(1)* align 4 [[SRC:%.*]], i64 1, i1 false)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @memcpy_global_align4_global_align4_1(
; ALL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 0
; ALL-NEXT: [[TMP2:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 4
; ALL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 0
; ALL-NEXT: store i8 [[TMP2]], i8 addrspace(1)* [[TMP3]], align 4
; ALL-NEXT: ret void
;
  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* align 4 %dst, i8 addrspace(1)* align 4 %src, i64 1, i1 false)
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }