; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i686-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefixes=X86,X86-BWON %s
; RUN: llc -mtriple=i686-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefixes=X86,X86-BWOFF %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefixes=X64,X64-BWON %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefixes=X64,X64-BWOFF %s

%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
%struct.C = type { i8, i8, i8, i8, i32, i32, i32, i64 }
; Save the constant bytes 1,2,3,... as one big integer store.
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
; X86-LABEL: merge_const_store:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: jle .LBB0_3
; X86-NEXT: # %bb.1: # %.lr.ph.preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB0_2: # %.lr.ph
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
; X86-NEXT: movl $134678021, 4(%ecx) # imm = 0x8070605
; X86-NEXT: addl $8, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: jne .LBB0_2
; X86-NEXT: .LBB0_3: # %._crit_edge
; X86-NEXT: retl
;
; X64-LABEL: merge_const_store:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB0_3
; X64-NEXT: # %bb.1: # %.lr.ph.preheader
; X64-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_2: # %.lr.ph
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: addq $8, %rsi
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB0_2
; X64-NEXT: .LBB0_3: # %._crit_edge
; X64-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
  store i8 5, i8* %6, align 1
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
; No vectors because we use noimplicitfloat.
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat {
; X86-LABEL: merge_const_store_no_vec:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: jle .LBB1_3
; X86-NEXT: # %bb.1: # %.lr.ph.preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB1_2: # %.lr.ph
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: movl $0, (%ecx)
; X86-NEXT: movl $0, 4(%ecx)
; X86-NEXT: movl $0, 8(%ecx)
; X86-NEXT: movl $0, 12(%ecx)
; X86-NEXT: movl $0, 16(%ecx)
; X86-NEXT: movl $0, 20(%ecx)
; X86-NEXT: movl $0, 24(%ecx)
; X86-NEXT: movl $0, 28(%ecx)
; X86-NEXT: addl $32, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: jne .LBB1_2
; X86-NEXT: .LBB1_3: # %._crit_edge
; X86-NEXT: retl
;
; X64-LABEL: merge_const_store_no_vec:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB1_2
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB1_1: # %.lr.ph
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movq $0, (%rsi)
; X64-NEXT: movq $0, 8(%rsi)
; X64-NEXT: movq $0, 16(%rsi)
; X64-NEXT: movq $0, 24(%rsi)
; X64-NEXT: addq $32, %rsi
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB1_1
; X64-NEXT: .LBB1_2: # %._crit_edge
; X64-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
; Move the constants using a single vector store.
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
; X86-LABEL: merge_const_store_vec:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: jle .LBB2_3
; X86-NEXT: # %bb.1: # %.lr.ph.preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB2_2: # %.lr.ph
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: vmovups %ymm0, (%ecx)
; X86-NEXT: addl $32, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: jne .LBB2_2
; X86-NEXT: .LBB2_3: # %._crit_edge
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: merge_const_store_vec:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB2_3
; X64-NEXT: # %bb.1: # %.lr.ph.preheader
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB2_2: # %.lr.ph
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: vmovups %ymm0, (%rsi)
; X64-NEXT: addq $32, %rsi
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB2_2
; X64-NEXT: .LBB2_3: # %._crit_edge
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 0, i32* %2, align 4
  %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 0, i32* %3, align 4
  %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  store i32 0, i32* %4, align 4
  %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  store i32 0, i32* %5, align 4
  %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
  store i32 0, i32* %6, align 4
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
  store i32 0, i32* %7, align 4
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
  store i32 0, i32* %8, align 4
  %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
  store i32 0, i32* %9, align 4
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
; Move the first 4 constants as a single vector. Move the rest as scalars.
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
; X86-BWON-LABEL: merge_nonconst_store:
; X86-BWON: # %bb.0:
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BWON-NEXT: testl %eax, %eax
; X86-BWON-NEXT: jle .LBB3_3
; X86-BWON-NEXT: # %bb.1: # %.lr.ph.preheader
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWON-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; X86-BWON-NEXT: .p2align 4, 0x90
; X86-BWON-NEXT: .LBB3_2: # %.lr.ph
; X86-BWON-NEXT: # =>This Inner Loop Header: Depth=1
; X86-BWON-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
; X86-BWON-NEXT: movb %dl, 4(%ecx)
; X86-BWON-NEXT: movw $1798, 5(%ecx) # imm = 0x706
; X86-BWON-NEXT: movb $8, 7(%ecx)
; X86-BWON-NEXT: addl $8, %ecx
; X86-BWON-NEXT: decl %eax
; X86-BWON-NEXT: jne .LBB3_2
; X86-BWON-NEXT: .LBB3_3: # %._crit_edge
; X86-BWON-NEXT: retl
;
; X86-BWOFF-LABEL: merge_nonconst_store:
; X86-BWOFF: # %bb.0:
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BWOFF-NEXT: testl %eax, %eax
; X86-BWOFF-NEXT: jle .LBB3_3
; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph.preheader
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWOFF-NEXT: movb {{[0-9]+}}(%esp), %dl
; X86-BWOFF-NEXT: .p2align 4, 0x90
; X86-BWOFF-NEXT: .LBB3_2: # %.lr.ph
; X86-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
; X86-BWOFF-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
; X86-BWOFF-NEXT: movb %dl, 4(%ecx)
; X86-BWOFF-NEXT: movw $1798, 5(%ecx) # imm = 0x706
; X86-BWOFF-NEXT: movb $8, 7(%ecx)
; X86-BWOFF-NEXT: addl $8, %ecx
; X86-BWOFF-NEXT: decl %eax
; X86-BWOFF-NEXT: jne .LBB3_2
; X86-BWOFF-NEXT: .LBB3_3: # %._crit_edge
; X86-BWOFF-NEXT: retl
;
; X64-LABEL: merge_nonconst_store:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB3_2
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_1: # %.lr.ph
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movl $67305985, (%rdx) # imm = 0x4030201
; X64-NEXT: movb %sil, 4(%rdx)
; X64-NEXT: movw $1798, 5(%rdx) # imm = 0x706
; X64-NEXT: movb $8, 7(%rdx)
; X64-NEXT: addq $8, %rdx
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB3_1
; X64-NEXT: .LBB3_2: # %._crit_edge
; X64-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
  store i8 %zz, i8* %6, align 1 ; <----------- Not a constant.
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
  store i8 6, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
  store i8 7, i8* %8, align 1
  %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
  store i8 8, i8* %9, align 1
  %10 = add nsw i32 %i.02, 1
  %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %10, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
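; Merge two adjacent i8 load/store pairs into a single 16-bit copy (the
; movzwl/movw pair in the checks below).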
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
; X86-BWON-LABEL: merge_loads_i16:
; X86-BWON: # %bb.0:
; X86-BWON-NEXT: pushl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: .cfi_offset %esi, -8
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BWON-NEXT: testl %eax, %eax
; X86-BWON-NEXT: jle .LBB4_3
; X86-BWON-NEXT: # %bb.1: # %.lr.ph
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWON-NEXT: .p2align 4, 0x90
; X86-BWON-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1
; X86-BWON-NEXT: movzwl (%edx), %esi
; X86-BWON-NEXT: movw %si, (%ecx)
; X86-BWON-NEXT: addl $8, %ecx
; X86-BWON-NEXT: decl %eax
; X86-BWON-NEXT: jne .LBB4_2
; X86-BWON-NEXT: .LBB4_3: # %._crit_edge
; X86-BWON-NEXT: popl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 4
; X86-BWON-NEXT: retl
;
; X86-BWOFF-LABEL: merge_loads_i16:
; X86-BWOFF: # %bb.0:
; X86-BWOFF-NEXT: pushl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: .cfi_offset %esi, -8
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BWOFF-NEXT: testl %eax, %eax
; X86-BWOFF-NEXT: jle .LBB4_3
; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWOFF-NEXT: .p2align 4, 0x90
; X86-BWOFF-NEXT: .LBB4_2: # =>This Inner Loop Header: Depth=1
; X86-BWOFF-NEXT: movw (%edx), %si
; X86-BWOFF-NEXT: movw %si, (%ecx)
; X86-BWOFF-NEXT: addl $8, %ecx
; X86-BWOFF-NEXT: decl %eax
; X86-BWOFF-NEXT: jne .LBB4_2
; X86-BWOFF-NEXT: .LBB4_3: # %._crit_edge
; X86-BWOFF-NEXT: popl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
; X86-BWOFF-NEXT: retl
;
; X64-BWON-LABEL: merge_loads_i16:
; X64-BWON: # %bb.0:
; X64-BWON-NEXT: testl %edi, %edi
; X64-BWON-NEXT: jle .LBB4_2
; X64-BWON-NEXT: .p2align 4, 0x90
; X64-BWON-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
; X64-BWON-NEXT: movzwl (%rsi), %eax
; X64-BWON-NEXT: movw %ax, (%rdx)
; X64-BWON-NEXT: addq $8, %rdx
; X64-BWON-NEXT: decl %edi
; X64-BWON-NEXT: jne .LBB4_1
; X64-BWON-NEXT: .LBB4_2: # %._crit_edge
; X64-BWON-NEXT: retq
;
; X64-BWOFF-LABEL: merge_loads_i16:
; X64-BWOFF: # %bb.0:
; X64-BWOFF-NEXT: testl %edi, %edi
; X64-BWOFF-NEXT: jle .LBB4_2
; X64-BWOFF-NEXT: .p2align 4, 0x90
; X64-BWOFF-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
; X64-BWOFF-NEXT: movw (%rsi), %ax
; X64-BWOFF-NEXT: movw %ax, (%rdx)
; X64-BWOFF-NEXT: addq $8, %rdx
; X64-BWOFF-NEXT: decl %edi
; X64-BWOFF-NEXT: jne .LBB4_1
; X64-BWOFF-NEXT: .LBB4_2: # %._crit_edge
; X64-BWOFF-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph: ; preds = %0
  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %4

; <label>:4 ; preds = %4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
  %5 = load i8, i8* %2, align 1
  %6 = load i8, i8* %3, align 1
  %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 %5, i8* %7, align 1
  %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  store i8 %6, i8* %8, align 1
  %9 = add nsw i32 %i.02, 1
  %10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %9, %count
  br i1 %exitcond, label %._crit_edge, label %4

._crit_edge: ; preds = %4, %0
  ret void
}
; The loads and the stores are interleaved. Can't merge them.
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
; X86-BWON-LABEL: no_merge_loads:
; X86-BWON: # %bb.0:
; X86-BWON-NEXT: pushl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: .cfi_offset %ebx, -8
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BWON-NEXT: testl %eax, %eax
; X86-BWON-NEXT: jle .LBB5_3
; X86-BWON-NEXT: # %bb.1: # %.lr.ph
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWON-NEXT: .p2align 4, 0x90
; X86-BWON-NEXT: .LBB5_2: # %a4
; X86-BWON-NEXT: # =>This Inner Loop Header: Depth=1
; X86-BWON-NEXT: movzbl (%edx), %ebx
; X86-BWON-NEXT: movb %bl, (%ecx)
; X86-BWON-NEXT: movzbl 1(%edx), %ebx
; X86-BWON-NEXT: movb %bl, 1(%ecx)
; X86-BWON-NEXT: addl $8, %ecx
; X86-BWON-NEXT: decl %eax
; X86-BWON-NEXT: jne .LBB5_2
; X86-BWON-NEXT: .LBB5_3: # %._crit_edge
; X86-BWON-NEXT: popl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 4
; X86-BWON-NEXT: retl
;
; X86-BWOFF-LABEL: no_merge_loads:
; X86-BWOFF: # %bb.0:
; X86-BWOFF-NEXT: pushl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: .cfi_offset %ebx, -8
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-BWOFF-NEXT: testl %eax, %eax
; X86-BWOFF-NEXT: jle .LBB5_3
; X86-BWOFF-NEXT: # %bb.1: # %.lr.ph
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWOFF-NEXT: .p2align 4, 0x90
; X86-BWOFF-NEXT: .LBB5_2: # %a4
; X86-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
; X86-BWOFF-NEXT: movb (%edx), %bl
; X86-BWOFF-NEXT: movb %bl, (%ecx)
; X86-BWOFF-NEXT: movb 1(%edx), %bl
; X86-BWOFF-NEXT: movb %bl, 1(%ecx)
; X86-BWOFF-NEXT: addl $8, %ecx
; X86-BWOFF-NEXT: decl %eax
; X86-BWOFF-NEXT: jne .LBB5_2
; X86-BWOFF-NEXT: .LBB5_3: # %._crit_edge
; X86-BWOFF-NEXT: popl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
; X86-BWOFF-NEXT: retl
;
; X64-BWON-LABEL: no_merge_loads:
; X64-BWON: # %bb.0:
; X64-BWON-NEXT: testl %edi, %edi
; X64-BWON-NEXT: jle .LBB5_2
; X64-BWON-NEXT: .p2align 4, 0x90
; X64-BWON-NEXT: .LBB5_1: # %a4
; X64-BWON-NEXT: # =>This Inner Loop Header: Depth=1
; X64-BWON-NEXT: movzbl (%rsi), %eax
; X64-BWON-NEXT: movb %al, (%rdx)
; X64-BWON-NEXT: movzbl 1(%rsi), %eax
; X64-BWON-NEXT: movb %al, 1(%rdx)
; X64-BWON-NEXT: addq $8, %rdx
; X64-BWON-NEXT: decl %edi
; X64-BWON-NEXT: jne .LBB5_1
; X64-BWON-NEXT: .LBB5_2: # %._crit_edge
; X64-BWON-NEXT: retq
;
; X64-BWOFF-LABEL: no_merge_loads:
; X64-BWOFF: # %bb.0:
; X64-BWOFF-NEXT: testl %edi, %edi
; X64-BWOFF-NEXT: jle .LBB5_2
; X64-BWOFF-NEXT: .p2align 4, 0x90
; X64-BWOFF-NEXT: .LBB5_1: # %a4
; X64-BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
; X64-BWOFF-NEXT: movb (%rsi), %al
; X64-BWOFF-NEXT: movb %al, (%rdx)
; X64-BWOFF-NEXT: movb 1(%rsi), %al
; X64-BWOFF-NEXT: movb %al, 1(%rdx)
; X64-BWOFF-NEXT: addq $8, %rdx
; X64-BWOFF-NEXT: decl %edi
; X64-BWOFF-NEXT: jne .LBB5_1
; X64-BWOFF-NEXT: .LBB5_2: # %._crit_edge
; X64-BWOFF-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph: ; preds = %0
  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %a4

a4: ; preds = %4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
  %a5 = load i8, i8* %2, align 1
  %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 %a5, i8* %a7, align 1
  %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  %a6 = load i8, i8* %3, align 1
  store i8 %a6, i8* %a8, align 1
  %a9 = add nsw i32 %i.02, 1
  %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %a9, %count
  br i1 %exitcond, label %._crit_edge, label %a4

._crit_edge: ; preds = %4, %0
  ret void
}
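; Merge two adjacent i32 load/store pairs into a single 64-bit copy (movq on
; x86-64).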
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
; X86-LABEL: merge_loads_integer:
; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: pushl %esi
; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: .cfi_offset %esi, -12
; X86-NEXT: .cfi_offset %edi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: jle .LBB6_3
; X86-NEXT: # %bb.1: # %.lr.ph
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB6_2: # =>This Inner Loop Header: Depth=1
; X86-NEXT: movl (%edx), %esi
; X86-NEXT: movl 4(%edx), %edi
; X86-NEXT: movl %esi, (%ecx)
; X86-NEXT: movl %edi, 4(%ecx)
; X86-NEXT: addl $32, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: jne .LBB6_2
; X86-NEXT: .LBB6_3: # %._crit_edge
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: popl %edi
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl
;
; X64-LABEL: merge_loads_integer:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB6_2
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movq (%rsi), %rax
; X64-NEXT: movq %rax, (%rdx)
; X64-NEXT: addq $32, %rdx
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB6_1
; X64-NEXT: .LBB6_2: # %._crit_edge
; X64-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph: ; preds = %0
  %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  br label %4

; <label>:4 ; preds = %4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
  %5 = load i32, i32* %2
  %6 = load i32, i32* %3
  %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i32 %5, i32* %7
  %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  store i32 %6, i32* %8
  %9 = add nsw i32 %i.02, 1
  %10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %9, %count
  br i1 %exitcond, label %._crit_edge, label %4

._crit_edge: ; preds = %4, %0
  ret void
}
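; Merge four adjacent i32 load/store pairs into a single 128-bit vector copy.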
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
; X86-LABEL: merge_loads_vector:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: jle .LBB7_3
; X86-NEXT: # %bb.1: # %.lr.ph
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB7_2: # %block4
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: vmovups (%edx), %xmm0
; X86-NEXT: vmovups %xmm0, (%ecx)
; X86-NEXT: addl $32, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: jne .LBB7_2
; X86-NEXT: .LBB7_3: # %._crit_edge
; X86-NEXT: retl
;
; X64-LABEL: merge_loads_vector:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB7_2
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB7_1: # %block4
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: vmovups (%rsi), %xmm0
; X64-NEXT: vmovups %xmm0, (%rdx)
; X64-NEXT: addq $32, %rdx
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB7_1
; X64-NEXT: .LBB7_2: # %._crit_edge
; X64-NEXT: retq
  %a1 = icmp sgt i32 %count, 0
  br i1 %a1, label %.lr.ph, label %._crit_edge

.lr.ph: ; preds = %0
  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %block4

block4: ; preds = %4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  %b1 = load i32, i32* %a2
  %b2 = load i32, i32* %a3
  %b3 = load i32, i32* %a4
  %b4 = load i32, i32* %a5
  store i32 %b1, i32* %a7
  store i32 %b2, i32* %a8
  store i32 %b3, i32* %a9
  store i32 %b4, i32* %a10
  %c9 = add nsw i32 %i.02, 1
  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %c9, %count
  br i1 %exitcond, label %._crit_edge, label %block4

._crit_edge: ; preds = %4, %0
  ret void
}

; On x86, even unaligned copies can be merged to vector ops.
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
; X86-LABEL: merge_loads_no_align:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: jle .LBB8_3
; X86-NEXT: # %bb.1: # %.lr.ph
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB8_2: # %block4
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: vmovups (%edx), %xmm0
; X86-NEXT: vmovups %xmm0, (%ecx)
; X86-NEXT: addl $32, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: jne .LBB8_2
; X86-NEXT: .LBB8_3: # %._crit_edge
; X86-NEXT: retl
;
; X64-LABEL: merge_loads_no_align:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB8_2
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB8_1: # %block4
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: vmovups (%rsi), %xmm0
; X64-NEXT: vmovups %xmm0, (%rdx)
; X64-NEXT: addq $32, %rdx
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB8_1
; X64-NEXT: .LBB8_2: # %._crit_edge
; X64-NEXT: retq
  %a1 = icmp sgt i32 %count, 0
  br i1 %a1, label %.lr.ph, label %._crit_edge

.lr.ph: ; preds = %0
  %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
  %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
  br label %block4

block4: ; preds = %4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
  %b1 = load i32, i32* %a2, align 1
  %b2 = load i32, i32* %a3, align 1
  %b3 = load i32, i32* %a4, align 1
  %b4 = load i32, i32* %a5, align 1
  store i32 %b1, i32* %a7, align 1
  store i32 %b2, i32* %a8, align 1
  store i32 %b3, i32* %a9, align 1
  store i32 %b4, i32* %a10, align 1
  %c9 = add nsw i32 %i.02, 1
  %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %c9, %count
  br i1 %exitcond, label %._crit_edge, label %block4

._crit_edge: ; preds = %4, %0
  ret void
}
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
; X86-BWON-LABEL: MergeLoadStoreBaseIndexOffset:
; X86-BWON: # %bb.0:
; X86-BWON-NEXT: pushl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: pushl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: pushl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 16
; X86-BWON-NEXT: .cfi_offset %esi, -16
; X86-BWON-NEXT: .cfi_offset %edi, -12
; X86-BWON-NEXT: .cfi_offset %ebx, -8
; X86-BWON-NEXT: xorl %eax, %eax
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWON-NEXT: .p2align 4, 0x90
; X86-BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
; X86-BWON-NEXT: movl (%edi,%eax,8), %ebx
; X86-BWON-NEXT: movzwl (%edx,%ebx), %ebx
; X86-BWON-NEXT: movw %bx, (%esi,%eax,2)
; X86-BWON-NEXT: incl %eax
; X86-BWON-NEXT: cmpl %eax, %ecx
; X86-BWON-NEXT: jne .LBB9_1
; X86-BWON-NEXT: # %bb.2:
; X86-BWON-NEXT: popl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: popl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: popl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 4
; X86-BWON-NEXT: retl
;
; X86-BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
; X86-BWOFF: # %bb.0:
; X86-BWOFF-NEXT: pushl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: pushl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: pushl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
; X86-BWOFF-NEXT: .cfi_offset %esi, -16
; X86-BWOFF-NEXT: .cfi_offset %edi, -12
; X86-BWOFF-NEXT: .cfi_offset %ebx, -8
; X86-BWOFF-NEXT: xorl %eax, %eax
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWOFF-NEXT: .p2align 4, 0x90
; X86-BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
; X86-BWOFF-NEXT: movl (%edi,%eax,8), %ebx
; X86-BWOFF-NEXT: movw (%edx,%ebx), %bx
; X86-BWOFF-NEXT: movw %bx, (%esi,%eax,2)
; X86-BWOFF-NEXT: incl %eax
; X86-BWOFF-NEXT: cmpl %eax, %ecx
; X86-BWOFF-NEXT: jne .LBB9_1
; X86-BWOFF-NEXT: # %bb.2:
; X86-BWOFF-NEXT: popl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: popl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: popl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
; X86-BWOFF-NEXT: retl
;
; X64-BWON-LABEL: MergeLoadStoreBaseIndexOffset:
; X64-BWON: # %bb.0:
; X64-BWON-NEXT: movl %ecx, %eax
; X64-BWON-NEXT: xorl %ecx, %ecx
; X64-BWON-NEXT: .p2align 4, 0x90
; X64-BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
; X64-BWON-NEXT: movq (%rdi,%rcx,8), %r8
; X64-BWON-NEXT: movzwl (%rdx,%r8), %r8d
; X64-BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
; X64-BWON-NEXT: incq %rcx
; X64-BWON-NEXT: cmpl %ecx, %eax
; X64-BWON-NEXT: jne .LBB9_1
; X64-BWON-NEXT: # %bb.2:
; X64-BWON-NEXT: retq
;
; X64-BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
; X64-BWOFF: # %bb.0:
; X64-BWOFF-NEXT: movl %ecx, %eax
; X64-BWOFF-NEXT: xorl %ecx, %ecx
; X64-BWOFF-NEXT: .p2align 4, 0x90
; X64-BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
; X64-BWOFF-NEXT: movq (%rdi,%rcx,8), %r8
; X64-BWOFF-NEXT: movw (%rdx,%r8), %r8w
; X64-BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
; X64-BWOFF-NEXT: incq %rcx
; X64-BWOFF-NEXT: cmpl %ecx, %eax
; X64-BWOFF-NEXT: jne .LBB9_1
; X64-BWOFF-NEXT: # %bb.2:
; X64-BWOFF-NEXT: retq
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %11, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %10, %1 ]
  %.0 = phi i64* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i64, i64* %.0, i64 1
  %3 = load i64, i64* %.0, align 1
  %4 = getelementptr inbounds i8, i8* %c, i64 %3
  %5 = load i8, i8* %4, align 1
  %6 = add i64 %3, 1
  %7 = getelementptr inbounds i8, i8* %c, i64 %6
  %8 = load i8, i8* %7, align 1
  store i8 %5, i8* %.08, align 1
  %9 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %8, i8* %9, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 2
  %11 = add nsw i32 %.09, -1
  %12 = icmp eq i32 %11, 0
  br i1 %12, label %13, label %1

; <label>:13
  ret void
}
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy for complicated address calculation.
define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
; X86-BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
; X86-BWON: # %bb.0:
; X86-BWON-NEXT: pushl %ebp
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: pushl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: pushl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 16
; X86-BWON-NEXT: pushl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 20
; X86-BWON-NEXT: .cfi_offset %esi, -20
; X86-BWON-NEXT: .cfi_offset %edi, -16
; X86-BWON-NEXT: .cfi_offset %ebx, -12
; X86-BWON-NEXT: .cfi_offset %ebp, -8
; X86-BWON-NEXT: xorl %eax, %eax
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-BWON-NEXT: xorl %ebp, %ebp
; X86-BWON-NEXT: .p2align 4, 0x90
; X86-BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
; X86-BWON-NEXT: movsbl (%edi), %ecx
; X86-BWON-NEXT: movzbl (%esi,%ecx), %edx
; X86-BWON-NEXT: movzbl 1(%esi,%ecx), %ecx
; X86-BWON-NEXT: movb %dl, (%ebx,%eax)
; X86-BWON-NEXT: movl %eax, %edx
; X86-BWON-NEXT: orl $1, %edx
; X86-BWON-NEXT: movb %cl, (%ebx,%edx)
; X86-BWON-NEXT: incl %edi
; X86-BWON-NEXT: addl $2, %eax
; X86-BWON-NEXT: adcl $0, %ebp
; X86-BWON-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-BWON-NEXT: movl %ebp, %ecx
; X86-BWON-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
; X86-BWON-NEXT: jl .LBB10_1
; X86-BWON-NEXT: # %bb.2:
; X86-BWON-NEXT: popl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 16
; X86-BWON-NEXT: popl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: popl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: popl %ebp
; X86-BWON-NEXT: .cfi_def_cfa_offset 4
; X86-BWON-NEXT: retl
;
; X86-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
; X86-BWOFF: # %bb.0:
; X86-BWOFF-NEXT: pushl %ebp
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: pushl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: pushl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
; X86-BWOFF-NEXT: pushl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 20
; X86-BWOFF-NEXT: .cfi_offset %esi, -20
; X86-BWOFF-NEXT: .cfi_offset %edi, -16
; X86-BWOFF-NEXT: .cfi_offset %ebx, -12
; X86-BWOFF-NEXT: .cfi_offset %ebp, -8
; X86-BWOFF-NEXT: xorl %eax, %eax
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-BWOFF-NEXT: xorl %ebp, %ebp
; X86-BWOFF-NEXT: .p2align 4, 0x90
; X86-BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
; X86-BWOFF-NEXT: movsbl (%edi), %ecx
; X86-BWOFF-NEXT: movb (%esi,%ecx), %dl
; X86-BWOFF-NEXT: movb 1(%esi,%ecx), %cl
; X86-BWOFF-NEXT: movb %dl, (%ebx,%eax)
; X86-BWOFF-NEXT: movl %eax, %edx
; X86-BWOFF-NEXT: orl $1, %edx
; X86-BWOFF-NEXT: movb %cl, (%ebx,%edx)
; X86-BWOFF-NEXT: incl %edi
; X86-BWOFF-NEXT: addl $2, %eax
; X86-BWOFF-NEXT: adcl $0, %ebp
; X86-BWOFF-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-BWOFF-NEXT: movl %ebp, %ecx
; X86-BWOFF-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
; X86-BWOFF-NEXT: jl .LBB10_1
; X86-BWOFF-NEXT: # %bb.2:
; X86-BWOFF-NEXT: popl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
; X86-BWOFF-NEXT: popl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: popl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: popl %ebp
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
; X86-BWOFF-NEXT: retl
;
; X64-BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
; X64-BWON: # %bb.0:
; X64-BWON-NEXT: xorl %eax, %eax
; X64-BWON-NEXT: .p2align 4, 0x90
; X64-BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
; X64-BWON-NEXT: movsbq (%rsi), %r8
; X64-BWON-NEXT: movzwl (%rdx,%r8), %r8d
; X64-BWON-NEXT: movw %r8w, (%rdi,%rax)
; X64-BWON-NEXT: incq %rsi
; X64-BWON-NEXT: addq $2, %rax
; X64-BWON-NEXT: cmpq %rcx, %rax
; X64-BWON-NEXT: jl .LBB10_1
; X64-BWON-NEXT: # %bb.2:
; X64-BWON-NEXT: retq
;
; X64-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
; X64-BWOFF: # %bb.0:
; X64-BWOFF-NEXT: xorl %eax, %eax
; X64-BWOFF-NEXT: .p2align 4, 0x90
; X64-BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
; X64-BWOFF-NEXT: movsbq (%rsi), %r8
; X64-BWOFF-NEXT: movw (%rdx,%r8), %r8w
; X64-BWOFF-NEXT: movw %r8w, (%rdi,%rax)
; X64-BWOFF-NEXT: incq %rsi
; X64-BWOFF-NEXT: addq $2, %rax
; X64-BWOFF-NEXT: cmpq %rcx, %rax
; X64-BWOFF-NEXT: jl .LBB10_1
; X64-BWOFF-NEXT: # %bb.2:
; X64-BWOFF-NEXT: retq
  br label %1

; <label>:1
  %.09 = phi i64 [ 0, %0 ], [ %13, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %12, %1 ]
  %2 = load i8, i8* %.08, align 1
  %3 = sext i8 %2 to i64
  %4 = getelementptr inbounds i8, i8* %c, i64 %3
  %5 = load i8, i8* %4, align 1
  %6 = add nsw i64 %3, 1
  %7 = getelementptr inbounds i8, i8* %c, i64 %6
  %8 = load i8, i8* %7, align 1
  %9 = getelementptr inbounds i8, i8* %a, i64 %.09
  store i8 %5, i8* %9, align 1
  %10 = or i64 %.09, 1
  %11 = getelementptr inbounds i8, i8* %a, i64 %10
  store i8 %8, i8* %11, align 1
  %12 = getelementptr inbounds i8, i8* %.08, i64 1
  %13 = add nuw nsw i64 %.09, 2
  %14 = icmp slt i64 %13, %n
  br i1 %14, label %1, label %15

; <label>:15
  ret void
}
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
; X86-BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; X86-BWON: # %bb.0:
; X86-BWON-NEXT: pushl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: pushl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: pushl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 16
; X86-BWON-NEXT: .cfi_offset %esi, -16
; X86-BWON-NEXT: .cfi_offset %edi, -12
; X86-BWON-NEXT: .cfi_offset %ebx, -8
; X86-BWON-NEXT: xorl %eax, %eax
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWON-NEXT: .p2align 4, 0x90
; X86-BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
; X86-BWON-NEXT: movsbl (%edi,%eax), %ebx
; X86-BWON-NEXT: movzwl (%edx,%ebx), %ebx
; X86-BWON-NEXT: movw %bx, (%esi,%eax,2)
; X86-BWON-NEXT: incl %eax
; X86-BWON-NEXT: cmpl %eax, %ecx
; X86-BWON-NEXT: jne .LBB11_1
; X86-BWON-NEXT: # %bb.2:
; X86-BWON-NEXT: popl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: popl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: popl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 4
; X86-BWON-NEXT: retl
;
; X86-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; X86-BWOFF: # %bb.0:
; X86-BWOFF-NEXT: pushl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: pushl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: pushl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
; X86-BWOFF-NEXT: .cfi_offset %esi, -16
; X86-BWOFF-NEXT: .cfi_offset %edi, -12
; X86-BWOFF-NEXT: .cfi_offset %ebx, -8
; X86-BWOFF-NEXT: xorl %eax, %eax
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWOFF-NEXT: .p2align 4, 0x90
; X86-BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
; X86-BWOFF-NEXT: movsbl (%edi,%eax), %ebx
; X86-BWOFF-NEXT: movw (%edx,%ebx), %bx
; X86-BWOFF-NEXT: movw %bx, (%esi,%eax,2)
; X86-BWOFF-NEXT: incl %eax
; X86-BWOFF-NEXT: cmpl %eax, %ecx
; X86-BWOFF-NEXT: jne .LBB11_1
; X86-BWOFF-NEXT: # %bb.2:
; X86-BWOFF-NEXT: popl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: popl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: popl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
; X86-BWOFF-NEXT: retl
;
; X64-BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; X64-BWON: # %bb.0:
; X64-BWON-NEXT: movl %ecx, %eax
; X64-BWON-NEXT: xorl %ecx, %ecx
; X64-BWON-NEXT: .p2align 4, 0x90
; X64-BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
; X64-BWON-NEXT: movsbq (%rdi,%rcx), %r8
; X64-BWON-NEXT: movzwl (%rdx,%r8), %r8d
; X64-BWON-NEXT: movw %r8w, (%rsi,%rcx,2)
; X64-BWON-NEXT: incq %rcx
; X64-BWON-NEXT: cmpl %ecx, %eax
; X64-BWON-NEXT: jne .LBB11_1
; X64-BWON-NEXT: # %bb.2:
; X64-BWON-NEXT: retq
;
; X64-BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
; X64-BWOFF: # %bb.0:
; X64-BWOFF-NEXT: movl %ecx, %eax
; X64-BWOFF-NEXT: xorl %ecx, %ecx
; X64-BWOFF-NEXT: .p2align 4, 0x90
; X64-BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
; X64-BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
; X64-BWOFF-NEXT: movw (%rdx,%r8), %r8w
; X64-BWOFF-NEXT: movw %r8w, (%rsi,%rcx,2)
; X64-BWOFF-NEXT: incq %rcx
; X64-BWOFF-NEXT: cmpl %ecx, %eax
; X64-BWOFF-NEXT: jne .LBB11_1
; X64-BWOFF-NEXT: # %bb.2:
; X64-BWOFF-NEXT: retq
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i8, i8* %.0, i64 1
  %3 = load i8, i8* %.0, align 1
  %4 = sext i8 %3 to i64
  %5 = getelementptr inbounds i8, i8* %c, i64 %4
  %6 = load i8, i8* %5, align 1
  %7 = add i64 %4, 1
  %8 = getelementptr inbounds i8, i8* %c, i64 %7
  %9 = load i8, i8* %8, align 1
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  %11 = getelementptr inbounds i8, i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}
; However, we can only ignore the sign extensions when merging if they appear
; on all of the memory address computations.
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
; X86-BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; X86-BWON: # %bb.0:
; X86-BWON-NEXT: pushl %ebp
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: pushl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: pushl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 16
; X86-BWON-NEXT: pushl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 20
; X86-BWON-NEXT: .cfi_offset %esi, -20
; X86-BWON-NEXT: .cfi_offset %edi, -16
; X86-BWON-NEXT: .cfi_offset %ebx, -12
; X86-BWON-NEXT: .cfi_offset %ebp, -8
; X86-BWON-NEXT: xorl %eax, %eax
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWON-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWON-NEXT: .p2align 4, 0x90
; X86-BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
; X86-BWON-NEXT: movsbl (%edi,%eax), %ebx
; X86-BWON-NEXT: movzbl (%edx,%ebx), %ecx
; X86-BWON-NEXT: incb %bl
; X86-BWON-NEXT: movsbl %bl, %ebx
; X86-BWON-NEXT: movb (%edx,%ebx), %ch
; X86-BWON-NEXT: movb %cl, (%esi,%eax,2)
; X86-BWON-NEXT: movb %ch, 1(%esi,%eax,2)
; X86-BWON-NEXT: incl %eax
; X86-BWON-NEXT: cmpl %eax, %ebp
; X86-BWON-NEXT: jne .LBB12_1
; X86-BWON-NEXT: # %bb.2:
; X86-BWON-NEXT: popl %esi
; X86-BWON-NEXT: .cfi_def_cfa_offset 16
; X86-BWON-NEXT: popl %edi
; X86-BWON-NEXT: .cfi_def_cfa_offset 12
; X86-BWON-NEXT: popl %ebx
; X86-BWON-NEXT: .cfi_def_cfa_offset 8
; X86-BWON-NEXT: popl %ebp
; X86-BWON-NEXT: .cfi_def_cfa_offset 4
; X86-BWON-NEXT: retl
;
; X86-BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; X86-BWOFF: # %bb.0:
; X86-BWOFF-NEXT: pushl %ebp
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: pushl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: pushl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
; X86-BWOFF-NEXT: pushl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 20
; X86-BWOFF-NEXT: .cfi_offset %esi, -20
; X86-BWOFF-NEXT: .cfi_offset %edi, -16
; X86-BWOFF-NEXT: .cfi_offset %ebx, -12
; X86-BWOFF-NEXT: .cfi_offset %ebp, -8
; X86-BWOFF-NEXT: xorl %eax, %eax
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-BWOFF-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-BWOFF-NEXT: .p2align 4, 0x90
; X86-BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
; X86-BWOFF-NEXT: movsbl (%edi,%eax), %ebx
; X86-BWOFF-NEXT: movb (%edx,%ebx), %cl
; X86-BWOFF-NEXT: incb %bl
; X86-BWOFF-NEXT: movsbl %bl, %ebx
; X86-BWOFF-NEXT: movb (%edx,%ebx), %ch
; X86-BWOFF-NEXT: movb %cl, (%esi,%eax,2)
; X86-BWOFF-NEXT: movb %ch, 1(%esi,%eax,2)
; X86-BWOFF-NEXT: incl %eax
; X86-BWOFF-NEXT: cmpl %eax, %ebp
; X86-BWOFF-NEXT: jne .LBB12_1
; X86-BWOFF-NEXT: # %bb.2:
; X86-BWOFF-NEXT: popl %esi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 16
; X86-BWOFF-NEXT: popl %edi
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 12
; X86-BWOFF-NEXT: popl %ebx
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 8
; X86-BWOFF-NEXT: popl %ebp
; X86-BWOFF-NEXT: .cfi_def_cfa_offset 4
; X86-BWOFF-NEXT: retl
;
; X64-BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; X64-BWON: # %bb.0:
; X64-BWON-NEXT: movl %ecx, %eax
; X64-BWON-NEXT: xorl %ecx, %ecx
; X64-BWON-NEXT: .p2align 4, 0x90
; X64-BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
; X64-BWON-NEXT: movsbq (%rdi,%rcx), %r8
; X64-BWON-NEXT: movzbl (%rdx,%r8), %r9d
; X64-BWON-NEXT: incl %r8d
; X64-BWON-NEXT: movsbq %r8b, %r8
; X64-BWON-NEXT: movzbl (%rdx,%r8), %r8d
; X64-BWON-NEXT: movb %r9b, (%rsi,%rcx,2)
; X64-BWON-NEXT: movb %r8b, 1(%rsi,%rcx,2)
; X64-BWON-NEXT: incq %rcx
; X64-BWON-NEXT: cmpl %ecx, %eax
; X64-BWON-NEXT: jne .LBB12_1
; X64-BWON-NEXT: # %bb.2:
; X64-BWON-NEXT: retq
;
; X64-BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
; X64-BWOFF: # %bb.0:
; X64-BWOFF-NEXT: movl %ecx, %eax
; X64-BWOFF-NEXT: xorl %ecx, %ecx
; X64-BWOFF-NEXT: .p2align 4, 0x90
; X64-BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
; X64-BWOFF-NEXT: movsbq (%rdi,%rcx), %r8
; X64-BWOFF-NEXT: movb (%rdx,%r8), %r9b
; X64-BWOFF-NEXT: incl %r8d
; X64-BWOFF-NEXT: movsbq %r8b, %r8
; X64-BWOFF-NEXT: movb (%rdx,%r8), %r8b
; X64-BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2)
; X64-BWOFF-NEXT: movb %r8b, 1(%rsi,%rcx,2)
; X64-BWOFF-NEXT: incq %rcx
; X64-BWOFF-NEXT: cmpl %ecx, %eax
; X64-BWOFF-NEXT: jne .LBB12_1
; X64-BWOFF-NEXT: # %bb.2:
; X64-BWOFF-NEXT: retq
  br label %1

; <label>:1
  %.09 = phi i32 [ %n, %0 ], [ %12, %1 ]
  %.08 = phi i8* [ %b, %0 ], [ %11, %1 ]
  %.0 = phi i8* [ %a, %0 ], [ %2, %1 ]
  %2 = getelementptr inbounds i8, i8* %.0, i64 1
  %3 = load i8, i8* %.0, align 1
  %4 = sext i8 %3 to i64
  %5 = getelementptr inbounds i8, i8* %c, i64 %4
  %6 = load i8, i8* %5, align 1
  %7 = add i8 %3, 1
  %wrap.4 = sext i8 %7 to i64
  %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
  %9 = load i8, i8* %8, align 1
  store i8 %6, i8* %.08, align 1
  %10 = getelementptr inbounds i8, i8* %.08, i64 1
  store i8 %9, i8* %10, align 1
  %11 = getelementptr inbounds i8, i8* %.08, i64 2
  %12 = add nsw i32 %.09, -1
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %14, label %1

; <label>:14
  ret void
}
; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
; X86-LABEL: merge_vec_element_store:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_element_store:
; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %vecext0 = extractelement <8 x float> %v, i32 0
  %vecext1 = extractelement <8 x float> %v, i32 1
  %vecext2 = extractelement <8 x float> %v, i32 2
  %vecext3 = extractelement <8 x float> %v, i32 3
  %vecext4 = extractelement <8 x float> %v, i32 4
  %vecext5 = extractelement <8 x float> %v, i32 5
  %vecext6 = extractelement <8 x float> %v, i32 6
  %vecext7 = extractelement <8 x float> %v, i32 7
  %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
  %arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
  %arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
  %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
  %arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
  %arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
  %arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
  store float %vecext0, float* %ptr, align 4
  store float %vecext1, float* %arrayidx1, align 4
  store float %vecext2, float* %arrayidx2, align 4
  store float %vecext3, float* %arrayidx3, align 4
  store float %vecext4, float* %arrayidx4, align 4
  store float %vecext5, float* %arrayidx5, align 4
  store float %vecext6, float* %arrayidx6, align 4
  store float %vecext7, float* %arrayidx7, align 4
  ret void
}
; PR21711 - Merge vector stores into wider vector stores.
; These should be merged into 32-byte stores.
define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
; X86-LABEL: merge_vec_extract_stores:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups %ymm0, 48(%eax)
; X86-NEXT: vmovups %ymm1, 80(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_extract_stores:
; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, 48(%rdi)
; X64-NEXT: vmovups %ymm1, 80(%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
  %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
  %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
  %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
  store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
  store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
  store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
  ret void
}
; Merging vector stores when sourced from vector loads.
define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
; X86-LABEL: merge_vec_stores_from_loads:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovups (%ecx), %ymm0
; X86-NEXT: vmovups %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_stores_from_loads:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: vmovups %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
  %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
  %v0 = load <4 x float>, <4 x float>* %load_idx0
  %v1 = load <4 x float>, <4 x float>* %load_idx1
  %store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
  %store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
  store <4 x float> %v0, <4 x float>* %store_idx0, align 16
  store <4 x float> %v1, <4 x float>* %store_idx1, align 16
  ret void
}
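; Merge two adjacent <4 x i32> zero stores into a single 32-byte store.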
define void @merge_vec_stores_of_zero(<4 x i32>* %ptr) {
; X86-LABEL: merge_vec_stores_of_zero:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovups %ymm0, 48(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_stores_of_zero:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovups %ymm0, 48(%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
  store <4 x i32> zeroinitializer, <4 x i32>* %idx0, align 16
  store <4 x i32> zeroinitializer, <4 x i32>* %idx1, align 16
  ret void
}
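; The splat constant is materialized once with a broadcast and then stored as
; two 16-byte stores.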
define void @merge_vec_stores_of_constant_splat(<4 x i32>* %ptr) {
; X86-LABEL: merge_vec_stores_of_constant_splat:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
; X86-NEXT: vmovaps %xmm0, 48(%eax)
; X86-NEXT: vmovaps %xmm0, 64(%eax)
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_stores_of_constant_splat:
; X64: # %bb.0:
; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42]
; X64-NEXT: vmovaps %xmm0, 48(%rdi)
; X64-NEXT: vmovaps %xmm0, 64(%rdi)
; X64-NEXT: retq
  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %idx0, align 16
  store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, <4 x i32>* %idx1, align 16
  ret void
}
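; Two different constant vectors: each one is materialized and stored with its
; own 16-byte store.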
define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
; X86-LABEL: merge_vec_stores_of_constants:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovaps {{.*#+}} xmm0 = [25,51,45,0]
; X86-NEXT: vmovaps %xmm0, 48(%eax)
; X86-NEXT: vmovaps {{.*#+}} xmm0 = [0,265,26,0]
; X86-NEXT: vmovaps %xmm0, 64(%eax)
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_stores_of_constants:
; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [25,51,45,0]
; X64-NEXT: vmovaps %xmm0, 48(%rdi)
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,265,26,0]
; X64-NEXT: vmovaps %xmm0, 64(%rdi)
; X64-NEXT: retq
  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
  store <4 x i32> <i32 25, i32 51, i32 45, i32 0>, <4 x i32>* %idx0, align 16
  store <4 x i32> <i32 0, i32 265, i32 26, i32 0>, <4 x i32>* %idx1, align 16
  ret void
}
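; With the undef lanes taken as zero, the two constant stores collapse to a
; single 32-byte zero store.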
define void @merge_vec_stores_of_constants_with_undefs(<4 x i32>* %ptr) {
; X86-LABEL: merge_vec_stores_of_constants_with_undefs:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovups %ymm0, 48(%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_stores_of_constants_with_undefs:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovups %ymm0, 48(%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
  %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
  %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
  store <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, <4 x i32>* %idx0, align 16
  store <4 x i32> <i32 0, i32 undef, i32 0, i32 0>, <4 x i32>* %idx1, align 16
  ret void
}
; This is a minimized test based on real code that was failing.
; This should now be merged.
define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
; X86-LABEL: merge_vec_element_and_scalar_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: movl %edx, 36(%eax)
; X86-NEXT: movl %ecx, 32(%eax)
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vmovsd %xmm0, 40(%eax)
; X86-NEXT: retl
;
; X64-LABEL: merge_vec_element_and_scalar_load:
; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %xmm0
; X64-NEXT: vmovups %xmm0, 32(%rdi)
; X64-NEXT: retq
  %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
  %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
  %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
  %idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5

  %a0 = load i64, i64* %idx0, align 8
  store i64 %a0, i64* %idx4, align 8

  %b = bitcast i64* %idx1 to <2 x i64>*
  %v = load <2 x i64>, <2 x i64>* %b, align 8
  %a1 = extractelement <2 x i64> %v, i32 0
  store i64 %a1, i64* %idx5, align 8
  ret void
}
; Don't let a non-consecutive store thwart merging of the last two.
define void @almost_consecutive_stores(i8* %p) {
; X86-LABEL: almost_consecutive_stores:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb $0, (%eax)
; X86-NEXT: movb $1, 42(%eax)
; X86-NEXT: movw $770, 2(%eax) # imm = 0x302
; X86-NEXT: retl
;
; X64-LABEL: almost_consecutive_stores:
; X64: # %bb.0:
; X64-NEXT: movb $0, (%rdi)
; X64-NEXT: movb $1, 42(%rdi)
; X64-NEXT: movw $770, 2(%rdi) # imm = 0x302
; X64-NEXT: retq
  store i8 0, i8* %p
  %p1 = getelementptr i8, i8* %p, i64 42
  store i8 1, i8* %p1
  %p2 = getelementptr i8, i8* %p, i64 2
  store i8 2, i8* %p2
  %p3 = getelementptr i8, i8* %p, i64 3
  store i8 3, i8* %p3
  ret void
}
; We should be able to merge these.
define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
; X86-LABEL: merge_bitcast:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovups %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: merge_bitcast:
; X64: # %bb.0:
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: retq
  %fv = bitcast <4 x i32> %v to <4 x float>
  %vecext1 = extractelement <4 x i32> %v, i32 1
  %vecext2 = extractelement <4 x i32> %v, i32 2
  %vecext3 = extractelement <4 x i32> %v, i32 3
  %f0 = extractelement <4 x float> %fv, i32 0
  %f1 = bitcast i32 %vecext1 to float
  %f2 = bitcast i32 %vecext2 to float
  %f3 = bitcast i32 %vecext3 to float
  %idx0 = getelementptr inbounds float, float* %ptr, i64 0
  %idx1 = getelementptr inbounds float, float* %ptr, i64 1
  %idx2 = getelementptr inbounds float, float* %ptr, i64 2
  %idx3 = getelementptr inbounds float, float* %ptr, i64 3
  store float %f0, float* %idx0, align 4
  store float %f1, float* %idx1, align 4
  store float %f2, float* %idx2, align 4
  store float %f3, float* %idx3, align 4
  ret void
}
; Same as @merge_const_store, but with heterogeneous member types.
define void @merge_const_store_heterogeneous(i32 %count, %struct.C* nocapture %p) nounwind uwtable noinline ssp {
; X86-LABEL: merge_const_store_heterogeneous:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: jle .LBB23_3
; X86-NEXT: # %bb.1: # %.lr.ph.preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB23_2: # %.lr.ph
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: movl $67305985, (%ecx) # imm = 0x4030201
; X86-NEXT: movl $134678021, 4(%ecx) # imm = 0x8070605
; X86-NEXT: addl $24, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: jne .LBB23_2
; X86-NEXT: .LBB23_3: # %._crit_edge
; X86-NEXT: retl
;
; X64-LABEL: merge_const_store_heterogeneous:
; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: jle .LBB23_3
; X64-NEXT: # %bb.1: # %.lr.ph.preheader
; X64-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB23_2: # %.lr.ph
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: addq $24, %rsi
; X64-NEXT: decl %edi
; X64-NEXT: jne .LBB23_2
; X64-NEXT: .LBB23_3: # %._crit_edge
; X64-NEXT: retq
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
  %i.02 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
  %.01 = phi %struct.C* [ %8, %.lr.ph ], [ %p, %0 ]
  %2 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 0
  store i8 1, i8* %2, align 1
  %3 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 1
  store i8 2, i8* %3, align 1
  %4 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 2
  store i8 3, i8* %4, align 1
  %5 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 3
  store i8 4, i8* %5, align 1
  %6 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 4
  store i32 134678021, i32* %6, align 1
  %7 = add nsw i32 %i.02, 1
  %8 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 1
  %exitcond = icmp eq i32 %7, %count
  br i1 %exitcond, label %._crit_edge, label %.lr.ph
._crit_edge:
  ret void
}
; Merging heterogeneous integer types.
define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %q) {
; X86-LABEL: merge_heterogeneous:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
; X86-NEXT: movl 4(%ecx), %ecx
; X86-NEXT: movl %edx, (%eax)
; X86-NEXT: movl %ecx, 4(%eax)
; X86-NEXT: retl
;
; X64-LABEL: merge_heterogeneous:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: retq
  %s0 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 0
  %s1 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 1
  %s2 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 2
  %s3 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 3
  %s4 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 4
  %d0 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 0
  %d1 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 1
  %d2 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 2
  %d3 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 3
  %d4 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 4
  %v0 = load i8, i8* %s0, align 1
  %v1 = load i8, i8* %s1, align 1
  %v2 = load i8, i8* %s2, align 1
  %v3 = load i8, i8* %s3, align 1
  %v4 = load i32, i32* %s4, align 1
  store i8 %v0, i8* %d0, align 1
  store i8 %v1, i8* %d1, align 1
  store i8 %v2, i8* %d2, align 1
  store i8 %v3, i8* %d3, align 1
  store i32 %v4, i32* %d4, align 4
  ret void
}
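; The load from the second slot does not block merging the two zero stores on
; x86-64: the load is emitted first, then a single movq covers both i32 slots.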
define i32 @merge_store_load_store_seq(i32* %buff) {
; X86-LABEL: merge_store_load_store_seq:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $0, (%ecx)
; X86-NEXT: movl 4(%ecx), %eax
; X86-NEXT: movl $0, 4(%ecx)
; X86-NEXT: retl
;
; X64-LABEL: merge_store_load_store_seq:
; X64: # %bb.0: # %entry
; X64-NEXT: movl 4(%rdi), %eax
; X64-NEXT: movq $0, (%rdi)
; X64-NEXT: retq
entry:
  store i32 0, i32* %buff, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
  %0 = load i32, i32* %arrayidx1, align 4
  store i32 0, i32* %arrayidx1, align 4
  ret i32 %0
}
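; A load through a potentially aliasing pointer sits between the two stores,
; so they cannot be merged.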
define i32 @merge_store_alias(i32* %buff, i32* %other) {
; X86-LABEL: merge_store_alias:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $0, (%ecx)
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: movl $0, 4(%ecx)
; X86-NEXT: retl
;
; X64-LABEL: merge_store_alias:
; X64: # %bb.0: # %entry
; X64-NEXT: movl $0, (%rdi)
; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: movl $0, 4(%rdi)
; X64-NEXT: retq
entry:
  store i32 0, i32* %buff, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %buff, i64 1
  %0 = load i32, i32* %other, align 4
  store i32 0, i32* %arrayidx1, align 4