1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
3 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
; Aggregate types whose fields are the targets of the stores and loads in the
; tests below.
5 %struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 } ; eight i8 fields
6 %struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 } ; eight i32 fields
7 %struct.C = type { i8, i8, i8, i8, i32, i32, i32, i64 } ; mixed-width integer fields (heterogeneous merge tests)
9 ; Save 1,2,3 ... as one big integer.
; Each loop iteration stores the constants 1..8 into the eight i8 fields of one
; %struct.A. The expected code materializes 0x0807060504030201 once before the
; loop and writes it with a single 8-byte movq per iteration.
10 define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
11 ; CHECK-LABEL: merge_const_store:
13 ; CHECK-NEXT: testl %edi, %edi
14 ; CHECK-NEXT: jle .LBB0_3
15 ; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
16 ; CHECK-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
17 ; CHECK-NEXT: .p2align 4, 0x90
18 ; CHECK-NEXT: .LBB0_2: # %.lr.ph
19 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
20 ; CHECK-NEXT: movq %rax, (%rsi)
21 ; CHECK-NEXT: addq $8, %rsi
22 ; CHECK-NEXT: decl %edi
23 ; CHECK-NEXT: jne .LBB0_2
24 ; CHECK-NEXT: .LBB0_3: # %._crit_edge
26 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
27 br i1 %1, label %.lr.ph, label %._crit_edge
29 %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter
30 %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ] ; struct written in this iteration
31 %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
32 store i8 1, i8* %2, align 1
33 %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
34 store i8 2, i8* %3, align 1
35 %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
36 store i8 3, i8* %4, align 1
37 %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
38 store i8 4, i8* %5, align 1
39 %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
40 store i8 5, i8* %6, align 1
41 %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
42 store i8 6, i8* %7, align 1
43 %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
44 store i8 7, i8* %8, align 1
45 %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
46 store i8 8, i8* %9, align 1
47 %10 = add nsw i32 %i.02, 1
48 %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 ; advance to the next struct
49 %exitcond = icmp eq i32 %10, %count
50 br i1 %exitcond, label %._crit_edge, label %.lr.ph
55 ; No vectors because we use noimplicitfloat.
; Each loop iteration zeroes the eight i32 fields of one %struct.B. With the
; noimplicitfloat attribute the expected code uses four 8-byte integer stores
; of zero per iteration instead of a vector store.
56 define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
57 ; CHECK-LABEL: merge_const_store_no_vec:
59 ; CHECK-NEXT: testl %edi, %edi
60 ; CHECK-NEXT: jle .LBB1_2
61 ; CHECK-NEXT: .p2align 4, 0x90
62 ; CHECK-NEXT: .LBB1_1: # %.lr.ph
63 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
64 ; CHECK-NEXT: movq $0, (%rsi)
65 ; CHECK-NEXT: movq $0, 8(%rsi)
66 ; CHECK-NEXT: movq $0, 16(%rsi)
67 ; CHECK-NEXT: movq $0, 24(%rsi)
68 ; CHECK-NEXT: addq $32, %rsi
69 ; CHECK-NEXT: decl %edi
70 ; CHECK-NEXT: jne .LBB1_1
71 ; CHECK-NEXT: .LBB1_2: # %._crit_edge
73 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
74 br i1 %1, label %.lr.ph, label %._crit_edge
76 %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter
77 %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ] ; struct written in this iteration
78 %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
79 store i32 0, i32* %2, align 4
80 %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
81 store i32 0, i32* %3, align 4
82 %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
83 store i32 0, i32* %4, align 4
84 %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
85 store i32 0, i32* %5, align 4
86 %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
87 store i32 0, i32* %6, align 4
88 %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
89 store i32 0, i32* %7, align 4
90 %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
91 store i32 0, i32* %8, align 4
92 %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
93 store i32 0, i32* %9, align 4
94 %10 = add nsw i32 %i.02, 1
95 %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; advance to the next struct
96 %exitcond = icmp eq i32 %10, %count
97 br i1 %exitcond, label %._crit_edge, label %.lr.ph
102 ; Move the constants using a single vector store.
; Same IR body as @merge_const_store_no_vec but without noimplicitfloat: the
; expected code zeroes a vector register once before the loop and writes all
; eight i32 zeros with one 32-byte vmovups per iteration.
103 define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
104 ; CHECK-LABEL: merge_const_store_vec:
106 ; CHECK-NEXT: testl %edi, %edi
107 ; CHECK-NEXT: jle .LBB2_3
108 ; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
109 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
110 ; CHECK-NEXT: .p2align 4, 0x90
111 ; CHECK-NEXT: .LBB2_2: # %.lr.ph
112 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
113 ; CHECK-NEXT: vmovups %ymm0, (%rsi)
114 ; CHECK-NEXT: addq $32, %rsi
115 ; CHECK-NEXT: decl %edi
116 ; CHECK-NEXT: jne .LBB2_2
117 ; CHECK-NEXT: .LBB2_3: # %._crit_edge
118 ; CHECK-NEXT: vzeroupper
120 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
121 br i1 %1, label %.lr.ph, label %._crit_edge
123 %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter
124 %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ] ; struct written in this iteration
125 %2 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
126 store i32 0, i32* %2, align 4
127 %3 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
128 store i32 0, i32* %3, align 4
129 %4 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
130 store i32 0, i32* %4, align 4
131 %5 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
132 store i32 0, i32* %5, align 4
133 %6 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 4
134 store i32 0, i32* %6, align 4
135 %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 5
136 store i32 0, i32* %7, align 4
137 %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 6
138 store i32 0, i32* %8, align 4
139 %9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 7
140 store i32 0, i32* %9, align 4
141 %10 = add nsw i32 %i.02, 1
142 %11 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; advance to the next struct
143 %exitcond = icmp eq i32 %10, %count
144 br i1 %exitcond, label %._crit_edge, label %.lr.ph
149 ; Move the first 4 constants as a single 32-bit integer store. Move the rest as scalars.
; The fifth field receives the non-constant argument %zz, which splits the run
; of constant stores. The expected code is a 32-bit store of 0x04030201, a byte
; store of %zz, a 16-bit store of 0x0706 and a byte store of 8.
150 define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
151 ; CHECK-LABEL: merge_nonconst_store:
153 ; CHECK-NEXT: testl %edi, %edi
154 ; CHECK-NEXT: jle .LBB3_2
155 ; CHECK-NEXT: .p2align 4, 0x90
156 ; CHECK-NEXT: .LBB3_1: # %.lr.ph
157 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
158 ; CHECK-NEXT: movl $67305985, (%rdx) # imm = 0x4030201
159 ; CHECK-NEXT: movb %sil, 4(%rdx)
160 ; CHECK-NEXT: movw $1798, 5(%rdx) # imm = 0x706
161 ; CHECK-NEXT: movb $8, 7(%rdx)
162 ; CHECK-NEXT: addq $8, %rdx
163 ; CHECK-NEXT: decl %edi
164 ; CHECK-NEXT: jne .LBB3_1
165 ; CHECK-NEXT: .LBB3_2: # %._crit_edge
167 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
168 br i1 %1, label %.lr.ph, label %._crit_edge
170 %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ] ; iteration counter
171 %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ] ; struct written in this iteration
172 %2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
173 store i8 1, i8* %2, align 1
174 %3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
175 store i8 2, i8* %3, align 1
176 %4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
177 store i8 3, i8* %4, align 1
178 %5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
179 store i8 4, i8* %5, align 1
180 %6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
181 store i8 %zz, i8* %6, align 1 ; <----------- Not a const;
182 %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
183 store i8 6, i8* %7, align 1
184 %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
185 store i8 7, i8* %8, align 1
186 %9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
187 store i8 8, i8* %9, align 1
188 %10 = add nsw i32 %i.02, 1
189 %11 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 ; advance to the next struct
190 %exitcond = icmp eq i32 %10, %count
191 br i1 %exitcond, label %._crit_edge, label %.lr.ph
; Two adjacent i8 loads from %q feed two adjacent i8 stores into %p. The
; expected code copies both bytes with one 16-bit load and one 16-bit store.
; The two llc invocations differ only in the load form: movzwl with
; -fixup-byte-word-insts=1 and movw with -fixup-byte-word-insts=0.
196 define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
197 ; BWON-LABEL: merge_loads_i16:
199 ; BWON-NEXT: testl %edi, %edi
200 ; BWON-NEXT: jle .LBB4_2
201 ; BWON-NEXT: .p2align 4, 0x90
202 ; BWON-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
203 ; BWON-NEXT: movzwl (%rsi), %eax
204 ; BWON-NEXT: movw %ax, (%rdx)
205 ; BWON-NEXT: addq $8, %rdx
206 ; BWON-NEXT: decl %edi
207 ; BWON-NEXT: jne .LBB4_1
208 ; BWON-NEXT: .LBB4_2: # %._crit_edge
211 ; BWOFF-LABEL: merge_loads_i16:
213 ; BWOFF-NEXT: testl %edi, %edi
214 ; BWOFF-NEXT: jle .LBB4_2
215 ; BWOFF-NEXT: .p2align 4, 0x90
216 ; BWOFF-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
217 ; BWOFF-NEXT: movw (%rsi), %ax
218 ; BWOFF-NEXT: movw %ax, (%rdx)
219 ; BWOFF-NEXT: addq $8, %rdx
220 ; BWOFF-NEXT: decl %edi
221 ; BWOFF-NEXT: jne .LBB4_1
222 ; BWOFF-NEXT: .LBB4_2: # %._crit_edge
224 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
225 br i1 %1, label %.lr.ph, label %._crit_edge
228 %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0 ; source byte 0 (address does not change across iterations)
229 %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1 ; source byte 1
232 ; <label>:4 ; preds = %4, %.lr.ph
233 %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
234 %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ] ; destination struct for this iteration
235 %5 = load i8, i8* %2, align 1
236 %6 = load i8, i8* %3, align 1
237 %7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
238 store i8 %5, i8* %7, align 1
239 %8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
240 store i8 %6, i8* %8, align 1
241 %9 = add nsw i32 %i.02, 1
242 %10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1 ; only the destination pointer advances
243 %exitcond = icmp eq i32 %9, %count
244 br i1 %exitcond, label %._crit_edge, label %4
246 ._crit_edge: ; preds = %4, %0
250 ; The loads and the stores are interleaved. Can't merge them.
; The body is load, store, load, store (unlike @merge_loads_i16, where both
; loads precede both stores), so the expected code keeps two separate byte
; loads and two separate byte stores per iteration.
251 define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
252 ; BWON-LABEL: no_merge_loads:
254 ; BWON-NEXT: testl %edi, %edi
255 ; BWON-NEXT: jle .LBB5_2
256 ; BWON-NEXT: .p2align 4, 0x90
257 ; BWON-NEXT: .LBB5_1: # %a4
258 ; BWON-NEXT: # =>This Inner Loop Header: Depth=1
259 ; BWON-NEXT: movzbl (%rsi), %eax
260 ; BWON-NEXT: movb %al, (%rdx)
261 ; BWON-NEXT: movzbl 1(%rsi), %eax
262 ; BWON-NEXT: movb %al, 1(%rdx)
263 ; BWON-NEXT: addq $8, %rdx
264 ; BWON-NEXT: decl %edi
265 ; BWON-NEXT: jne .LBB5_1
266 ; BWON-NEXT: .LBB5_2: # %._crit_edge
269 ; BWOFF-LABEL: no_merge_loads:
271 ; BWOFF-NEXT: testl %edi, %edi
272 ; BWOFF-NEXT: jle .LBB5_2
273 ; BWOFF-NEXT: .p2align 4, 0x90
274 ; BWOFF-NEXT: .LBB5_1: # %a4
275 ; BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
276 ; BWOFF-NEXT: movb (%rsi), %al
277 ; BWOFF-NEXT: movb %al, (%rdx)
278 ; BWOFF-NEXT: movb 1(%rsi), %al
279 ; BWOFF-NEXT: movb %al, 1(%rdx)
280 ; BWOFF-NEXT: addq $8, %rdx
281 ; BWOFF-NEXT: decl %edi
282 ; BWOFF-NEXT: jne .LBB5_1
283 ; BWOFF-NEXT: .LBB5_2: # %._crit_edge
285 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
286 br i1 %1, label %.lr.ph, label %._crit_edge
289 %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0 ; source byte 0
290 %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1 ; source byte 1
293 a4: ; preds = %4, %.lr.ph
294 %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
295 %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ] ; destination struct for this iteration
296 %a5 = load i8, i8* %2, align 1
297 %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
298 store i8 %a5, i8* %a7, align 1 ; first store sits between the two loads
299 %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
300 %a6 = load i8, i8* %3, align 1 ; second load comes after the first store
301 store i8 %a6, i8* %a8, align 1
302 %a9 = add nsw i32 %i.02, 1
303 %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
304 %exitcond = icmp eq i32 %a9, %count
305 br i1 %exitcond, label %._crit_edge, label %a4
307 ._crit_edge: ; preds = %4, %0
; Two adjacent i32 loads from %q feed two adjacent i32 stores into %p. The
; expected code copies both values with one 8-byte movq load and one 8-byte
; movq store per iteration.
311 define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
312 ; CHECK-LABEL: merge_loads_integer:
314 ; CHECK-NEXT: testl %edi, %edi
315 ; CHECK-NEXT: jle .LBB6_2
316 ; CHECK-NEXT: .p2align 4, 0x90
317 ; CHECK-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
318 ; CHECK-NEXT: movq (%rsi), %rax
319 ; CHECK-NEXT: movq %rax, (%rdx)
320 ; CHECK-NEXT: addq $32, %rdx
321 ; CHECK-NEXT: decl %edi
322 ; CHECK-NEXT: jne .LBB6_1
323 ; CHECK-NEXT: .LBB6_2: # %._crit_edge
325 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
326 br i1 %1, label %.lr.ph, label %._crit_edge
329 %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0 ; source field 0
330 %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1 ; source field 1
333 ; <label>:4 ; preds = %4, %.lr.ph
334 %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
335 %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ] ; destination struct for this iteration
336 %5 = load i32, i32* %2
337 %6 = load i32, i32* %3
338 %7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
339 store i32 %5, i32* %7
340 %8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
341 store i32 %6, i32* %8
342 %9 = add nsw i32 %i.02, 1
343 %10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; only the destination pointer advances
344 %exitcond = icmp eq i32 %9, %count
345 br i1 %exitcond, label %._crit_edge, label %4
347 ._crit_edge: ; preds = %4, %0
; Four adjacent i32 loads from %q feed four adjacent i32 stores into %p. The
; expected code copies all four values with one 16-byte vmovups load and one
; 16-byte vmovups store per iteration.
351 define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
352 ; CHECK-LABEL: merge_loads_vector:
354 ; CHECK-NEXT: testl %edi, %edi
355 ; CHECK-NEXT: jle .LBB7_2
356 ; CHECK-NEXT: .p2align 4, 0x90
357 ; CHECK-NEXT: .LBB7_1: # %block4
358 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
359 ; CHECK-NEXT: vmovups (%rsi), %xmm0
360 ; CHECK-NEXT: vmovups %xmm0, (%rdx)
361 ; CHECK-NEXT: addq $32, %rdx
362 ; CHECK-NEXT: decl %edi
363 ; CHECK-NEXT: jne .LBB7_1
364 ; CHECK-NEXT: .LBB7_2: # %._crit_edge
366 %a1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
367 br i1 %a1, label %.lr.ph, label %._crit_edge
370 %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0 ; source fields 0..3 of %q
371 %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
372 %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
373 %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
376 block4: ; preds = %4, %.lr.ph
377 %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
378 %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ] ; destination struct for this iteration
379 %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0 ; destination fields 0..3
380 %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
381 %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
382 %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
383 %b1 = load i32, i32* %a2
384 %b2 = load i32, i32* %a3
385 %b3 = load i32, i32* %a4
386 %b4 = load i32, i32* %a5
387 store i32 %b1, i32* %a7
388 store i32 %b2, i32* %a8
389 store i32 %b3, i32* %a9
390 store i32 %b4, i32* %a10
391 %c9 = add nsw i32 %i.02, 1
392 %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; only the destination pointer advances
393 %exitcond = icmp eq i32 %c9, %count
394 br i1 %exitcond, label %._crit_edge, label %block4
396 ._crit_edge: ; preds = %4, %0
400 ; On x86, even unaligned copies can be merged to vector ops.
; Same shape as @merge_loads_vector, but every load and store is marked
; align 1. The expected code is still one 16-byte vmovups load and one 16-byte
; vmovups store per iteration.
401 define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
402 ; CHECK-LABEL: merge_loads_no_align:
404 ; CHECK-NEXT: testl %edi, %edi
405 ; CHECK-NEXT: jle .LBB8_2
406 ; CHECK-NEXT: .p2align 4, 0x90
407 ; CHECK-NEXT: .LBB8_1: # %block4
408 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
409 ; CHECK-NEXT: vmovups (%rsi), %xmm0
410 ; CHECK-NEXT: vmovups %xmm0, (%rdx)
411 ; CHECK-NEXT: addq $32, %rdx
412 ; CHECK-NEXT: decl %edi
413 ; CHECK-NEXT: jne .LBB8_1
414 ; CHECK-NEXT: .LBB8_2: # %._crit_edge
416 %a1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
417 br i1 %a1, label %.lr.ph, label %._crit_edge
420 %a2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0 ; source fields 0..3 of %q
421 %a3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
422 %a4 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 2
423 %a5 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 3
426 block4: ; preds = %4, %.lr.ph
427 %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
428 %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ] ; destination struct for this iteration
429 %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0 ; destination fields 0..3
430 %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
431 %a9 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 2
432 %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 3
433 %b1 = load i32, i32* %a2, align 1
434 %b2 = load i32, i32* %a3, align 1
435 %b3 = load i32, i32* %a4, align 1
436 %b4 = load i32, i32* %a5, align 1
437 store i32 %b1, i32* %a7, align 1
438 store i32 %b2, i32* %a8, align 1
439 store i32 %b3, i32* %a9, align 1
440 store i32 %b4, i32* %a10, align 1
441 %c9 = add nsw i32 %i.02, 1
442 %c10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1 ; only the destination pointer advances
443 %exitcond = icmp eq i32 %c9, %count
444 br i1 %exitcond, label %._crit_edge, label %block4
446 ._crit_edge: ; preds = %4, %0
450 ; Make sure that we merge the consecutive load/store sequence below and use a
451 ; word (16 bit) instead of a byte copy.
; Each iteration reads an i64 index from %a, loads two bytes from %c at that
; index, and stores them to two adjacent bytes of %b. The expected code uses
; one 16-bit load and one 16-bit store per iteration.
452 define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
453 ; BWON-LABEL: MergeLoadStoreBaseIndexOffset:
455 ; BWON-NEXT: movl %ecx, %r8d
456 ; BWON-NEXT: xorl %ecx, %ecx
457 ; BWON-NEXT: .p2align 4, 0x90
458 ; BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
459 ; BWON-NEXT: movq (%rdi,%rcx,8), %rax
460 ; BWON-NEXT: movzwl (%rdx,%rax), %eax
461 ; BWON-NEXT: movw %ax, (%rsi,%rcx,2)
462 ; BWON-NEXT: incq %rcx
463 ; BWON-NEXT: cmpl %ecx, %r8d
464 ; BWON-NEXT: jne .LBB9_1
465 ; BWON-NEXT: # %bb.2:
468 ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
470 ; BWOFF-NEXT: movl %ecx, %r8d
471 ; BWOFF-NEXT: xorl %ecx, %ecx
472 ; BWOFF-NEXT: .p2align 4, 0x90
473 ; BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
474 ; BWOFF-NEXT: movq (%rdi,%rcx,8), %rax
475 ; BWOFF-NEXT: movw (%rdx,%rax), %ax
476 ; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2)
477 ; BWOFF-NEXT: incq %rcx
478 ; BWOFF-NEXT: cmpl %ecx, %r8d
479 ; BWOFF-NEXT: jne .LBB9_1
480 ; BWOFF-NEXT: # %bb.2:
485 %.09 = phi i32 [ %n, %0 ], [ %11, %1 ] ; remaining iteration count
486 %.08 = phi i8* [ %b, %0 ], [ %10, %1 ] ; destination cursor
487 %.0 = phi i64* [ %a, %0 ], [ %2, %1 ] ; index cursor
488 %2 = getelementptr inbounds i64, i64* %.0, i64 1
489 %3 = load i64, i64* %.0, align 1 ; index into %c for this iteration
490 %4 = getelementptr inbounds i8, i8* %c, i64 %3
491 %5 = load i8, i8* %4, align 1
493 %7 = getelementptr inbounds i8, i8* %c, i64 %6 ; NOTE(review): %6 is defined on a line not visible here; presumably %3 + 1 - confirm
494 %8 = load i8, i8* %7, align 1
495 store i8 %5, i8* %.08, align 1
496 %9 = getelementptr inbounds i8, i8* %.08, i64 1
497 store i8 %8, i8* %9, align 1
498 %10 = getelementptr inbounds i8, i8* %.08, i64 2 ; destination advances by two bytes
499 %11 = add nsw i32 %.09, -1
500 %12 = icmp eq i32 %11, 0
501 br i1 %12, label %13, label %1
507 ; Make sure that we merge the consecutive load/store sequence below and use a
508 ; word (16 bit) instead of a byte copy for complicated address calculation.
; Here the index is a sign-extended byte read from %b, and the destination is
; addressed as %a plus a running offset rather than through a pointer phi. The
; expected code still uses one 16-bit load and one 16-bit store per iteration.
509 define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
510 ; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
512 ; BWON-NEXT: xorl %r8d, %r8d
513 ; BWON-NEXT: .p2align 4, 0x90
514 ; BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
515 ; BWON-NEXT: movsbq (%rsi), %rax
516 ; BWON-NEXT: movzwl (%rdx,%rax), %eax
517 ; BWON-NEXT: movw %ax, (%rdi,%r8)
518 ; BWON-NEXT: incq %rsi
519 ; BWON-NEXT: addq $2, %r8
520 ; BWON-NEXT: cmpq %rcx, %r8
521 ; BWON-NEXT: jl .LBB10_1
522 ; BWON-NEXT: # %bb.2:
525 ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
527 ; BWOFF-NEXT: xorl %r8d, %r8d
528 ; BWOFF-NEXT: .p2align 4, 0x90
529 ; BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
530 ; BWOFF-NEXT: movsbq (%rsi), %rax
531 ; BWOFF-NEXT: movw (%rdx,%rax), %ax
532 ; BWOFF-NEXT: movw %ax, (%rdi,%r8)
533 ; BWOFF-NEXT: incq %rsi
534 ; BWOFF-NEXT: addq $2, %r8
535 ; BWOFF-NEXT: cmpq %rcx, %r8
536 ; BWOFF-NEXT: jl .LBB10_1
537 ; BWOFF-NEXT: # %bb.2:
542 %.09 = phi i64 [ 0, %0 ], [ %13, %1 ] ; running byte offset into %a
543 %.08 = phi i8* [ %b, %0 ], [ %12, %1 ] ; cursor over the index bytes in %b
544 %2 = load i8, i8* %.08, align 1
545 %3 = sext i8 %2 to i64 ; index into %c, sign-extended from a byte
546 %4 = getelementptr inbounds i8, i8* %c, i64 %3
547 %5 = load i8, i8* %4, align 1
548 %6 = add nsw i64 %3, 1
549 %7 = getelementptr inbounds i8, i8* %c, i64 %6 ; byte following %4
550 %8 = load i8, i8* %7, align 1
551 %9 = getelementptr inbounds i8, i8* %a, i64 %.09
552 store i8 %5, i8* %9, align 1
554 %11 = getelementptr inbounds i8, i8* %a, i64 %10 ; NOTE(review): %10 is defined on a line not visible here; presumably %.09 + 1 - confirm
555 store i8 %8, i8* %11, align 1
556 %12 = getelementptr inbounds i8, i8* %.08, i64 1
557 %13 = add nuw nsw i64 %.09, 2 ; destination offset advances by two bytes
558 %14 = icmp slt i64 %13, %n
559 br i1 %14, label %1, label %15
565 ; Make sure that we merge the consecutive load/store sequence below and use a
566 ; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; The index into %c is an i8 read from %a and sign-extended to i64. The
; expected code still uses one 16-bit load and one 16-bit store per iteration.
568 define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
569 ; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
571 ; BWON-NEXT: movl %ecx, %r8d
572 ; BWON-NEXT: xorl %ecx, %ecx
573 ; BWON-NEXT: .p2align 4, 0x90
574 ; BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
575 ; BWON-NEXT: movsbq (%rdi,%rcx), %rax
576 ; BWON-NEXT: movzwl (%rdx,%rax), %eax
577 ; BWON-NEXT: movw %ax, (%rsi,%rcx,2)
578 ; BWON-NEXT: incq %rcx
579 ; BWON-NEXT: cmpl %ecx, %r8d
580 ; BWON-NEXT: jne .LBB11_1
581 ; BWON-NEXT: # %bb.2:
584 ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
586 ; BWOFF-NEXT: movl %ecx, %r8d
587 ; BWOFF-NEXT: xorl %ecx, %ecx
588 ; BWOFF-NEXT: .p2align 4, 0x90
589 ; BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
590 ; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax
591 ; BWOFF-NEXT: movw (%rdx,%rax), %ax
592 ; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2)
593 ; BWOFF-NEXT: incq %rcx
594 ; BWOFF-NEXT: cmpl %ecx, %r8d
595 ; BWOFF-NEXT: jne .LBB11_1
596 ; BWOFF-NEXT: # %bb.2:
601 %.09 = phi i32 [ %n, %0 ], [ %12, %1 ] ; remaining iteration count
602 %.08 = phi i8* [ %b, %0 ], [ %11, %1 ] ; destination cursor
603 %.0 = phi i8* [ %a, %0 ], [ %2, %1 ] ; index cursor
604 %2 = getelementptr inbounds i8, i8* %.0, i64 1
605 %3 = load i8, i8* %.0, align 1
606 %4 = sext i8 %3 to i64 ; index into %c, sign-extended from a byte
607 %5 = getelementptr inbounds i8, i8* %c, i64 %4
608 %6 = load i8, i8* %5, align 1
610 %8 = getelementptr inbounds i8, i8* %c, i64 %7 ; NOTE(review): %7 is defined on a line not visible here; presumably %4 + 1 - confirm
611 %9 = load i8, i8* %8, align 1
612 store i8 %6, i8* %.08, align 1
613 %10 = getelementptr inbounds i8, i8* %.08, i64 1
614 store i8 %9, i8* %10, align 1
615 %11 = getelementptr inbounds i8, i8* %.08, i64 2 ; destination advances by two bytes
616 %12 = add nsw i32 %.09, -1
617 %13 = icmp eq i32 %12, 0
618 br i1 %13, label %14, label %1
624 ; However, we can only ignore sign extensions when merging if they are applied
; to all memory computations.
; Here the second index goes through its own sext from i8 (%wrap.4), so the two
; load addresses are not provably adjacent. The expected code keeps two
; separate byte loads and two separate byte stores per iteration.
626 define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
627 ; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
629 ; BWON-NEXT: movl %ecx, %r8d
630 ; BWON-NEXT: xorl %ecx, %ecx
631 ; BWON-NEXT: .p2align 4, 0x90
632 ; BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
633 ; BWON-NEXT: movsbq (%rdi,%rcx), %rax
634 ; BWON-NEXT: movzbl (%rdx,%rax), %r9d
635 ; BWON-NEXT: incl %eax
636 ; BWON-NEXT: movsbq %al, %rax
637 ; BWON-NEXT: movzbl (%rdx,%rax), %eax
638 ; BWON-NEXT: movb %r9b, (%rsi,%rcx,2)
639 ; BWON-NEXT: movb %al, 1(%rsi,%rcx,2)
640 ; BWON-NEXT: incq %rcx
641 ; BWON-NEXT: cmpl %ecx, %r8d
642 ; BWON-NEXT: jne .LBB12_1
643 ; BWON-NEXT: # %bb.2:
646 ; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
648 ; BWOFF-NEXT: movl %ecx, %r8d
649 ; BWOFF-NEXT: xorl %ecx, %ecx
650 ; BWOFF-NEXT: .p2align 4, 0x90
651 ; BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
652 ; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax
653 ; BWOFF-NEXT: movb (%rdx,%rax), %r9b
654 ; BWOFF-NEXT: incl %eax
655 ; BWOFF-NEXT: movsbq %al, %rax
656 ; BWOFF-NEXT: movb (%rdx,%rax), %al
657 ; BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2)
658 ; BWOFF-NEXT: movb %al, 1(%rsi,%rcx,2)
659 ; BWOFF-NEXT: incq %rcx
660 ; BWOFF-NEXT: cmpl %ecx, %r8d
661 ; BWOFF-NEXT: jne .LBB12_1
662 ; BWOFF-NEXT: # %bb.2:
667 %.09 = phi i32 [ %n, %0 ], [ %12, %1 ] ; remaining iteration count
668 %.08 = phi i8* [ %b, %0 ], [ %11, %1 ] ; destination cursor
669 %.0 = phi i8* [ %a, %0 ], [ %2, %1 ] ; index cursor
670 %2 = getelementptr inbounds i8, i8* %.0, i64 1
671 %3 = load i8, i8* %.0, align 1
672 %4 = sext i8 %3 to i64 ; first index into %c
673 %5 = getelementptr inbounds i8, i8* %c, i64 %4
674 %6 = load i8, i8* %5, align 1
676 %wrap.4 = sext i8 %7 to i64 ; NOTE(review): %7 is defined on a line not visible here; presumably an i8 add of %3 and 1 - confirm
677 %8 = getelementptr inbounds i8, i8* %c, i64 %wrap.4
678 %9 = load i8, i8* %8, align 1
679 store i8 %6, i8* %.08, align 1
680 %10 = getelementptr inbounds i8, i8* %.08, i64 1
681 store i8 %9, i8* %10, align 1
682 %11 = getelementptr inbounds i8, i8* %.08, i64 2 ; destination advances by two bytes
683 %12 = add nsw i32 %.09, -1
684 %13 = icmp eq i32 %12, 0
685 br i1 %13, label %14, label %1
691 ; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
; All eight elements of %v are extracted in order and stored to eight
; consecutive float slots starting at %ptr. The expected code is a single
; 32-byte vmovups of the original vector register.
692 define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
693 ; CHECK-LABEL: merge_vec_element_store:
695 ; CHECK-NEXT: vmovups %ymm0, (%rdi)
696 ; CHECK-NEXT: vzeroupper
698 %vecext0 = extractelement <8 x float> %v, i32 0
699 %vecext1 = extractelement <8 x float> %v, i32 1
700 %vecext2 = extractelement <8 x float> %v, i32 2
701 %vecext3 = extractelement <8 x float> %v, i32 3
702 %vecext4 = extractelement <8 x float> %v, i32 4
703 %vecext5 = extractelement <8 x float> %v, i32 5
704 %vecext6 = extractelement <8 x float> %v, i32 6
705 %vecext7 = extractelement <8 x float> %v, i32 7
706 %arrayidx1 = getelementptr inbounds float, float* %ptr, i64 1
707 %arrayidx2 = getelementptr inbounds float, float* %ptr, i64 2
708 %arrayidx3 = getelementptr inbounds float, float* %ptr, i64 3
709 %arrayidx4 = getelementptr inbounds float, float* %ptr, i64 4
710 %arrayidx5 = getelementptr inbounds float, float* %ptr, i64 5
711 %arrayidx6 = getelementptr inbounds float, float* %ptr, i64 6
712 %arrayidx7 = getelementptr inbounds float, float* %ptr, i64 7
713 store float %vecext0, float* %ptr, align 4 ; element i goes to slot i
714 store float %vecext1, float* %arrayidx1, align 4
715 store float %vecext2, float* %arrayidx2, align 4
716 store float %vecext3, float* %arrayidx3, align 4
717 store float %vecext4, float* %arrayidx4, align 4
718 store float %vecext5, float* %arrayidx5, align 4
719 store float %vecext6, float* %arrayidx6, align 4
720 store float %vecext7, float* %arrayidx7, align 4
725 ; PR21711 - Merge vector stores into wider vector stores.
726 ; These should be merged into 32-byte stores.
; The low and high 4-element halves of %v1 and %v2 are stored to four
; consecutive <4 x float> slots (indices 3..6 of %ptr). The expected code is
; two 32-byte vmovups stores, at byte offsets 48 and 80.
727 define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
728 ; CHECK-LABEL: merge_vec_extract_stores:
730 ; CHECK-NEXT: vmovups %ymm0, 48(%rdi)
731 ; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
732 ; CHECK-NEXT: vzeroupper
734 %idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
735 %idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
736 %idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
737 %idx3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 6
738 %shuffle0 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low half of %v1
739 %shuffle1 = shufflevector <8 x float> %v1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; high half of %v1
740 %shuffle2 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low half of %v2
741 %shuffle3 = shufflevector <8 x float> %v2, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; high half of %v2
742 store <4 x float> %shuffle0, <4 x float>* %idx0, align 16
743 store <4 x float> %shuffle1, <4 x float>* %idx1, align 16
744 store <4 x float> %shuffle2, <4 x float>* %idx2, align 16
745 store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
750 ; Merging vector stores when sourced from vector loads.
; Two adjacent <4 x float> loads from %v are stored to two adjacent
; <4 x float> slots of %ptr. The expected code is one 32-byte vmovups load and
; one 32-byte vmovups store.
751 define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
752 ; CHECK-LABEL: merge_vec_stores_from_loads:
754 ; CHECK-NEXT: vmovups (%rdi), %ymm0
755 ; CHECK-NEXT: vmovups %ymm0, (%rsi)
756 ; CHECK-NEXT: vzeroupper
758 %load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
759 %load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
760 %v0 = load <4 x float>, <4 x float>* %load_idx0
761 %v1 = load <4 x float>, <4 x float>* %load_idx1
762 %store_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 0
763 %store_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
764 store <4 x float> %v0, <4 x float>* %store_idx0, align 16
765 store <4 x float> %v1, <4 x float>* %store_idx1, align 16
770 ; Merging vector stores when sourced from a constant vector is not currently handled.
; Two adjacent <4 x i32> slots (indices 3 and 4 of %ptr) are zeroed. The
; expected code keeps two separate 16-byte vmovaps stores, at byte offsets 48
; and 64, rather than one 32-byte store.
771 define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
772 ; CHECK-LABEL: merge_vec_stores_of_constants:
774 ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
775 ; CHECK-NEXT: vmovaps %xmm0, 48(%rdi)
776 ; CHECK-NEXT: vmovaps %xmm0, 64(%rdi)
778 %idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
779 %idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
780 store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
781 store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
786 ; This is a minimized test based on real code that was failing.
787 ; This should now be merged.
; Element 0 of the array is copied to element 4 through a scalar i64 load, and
; element 1 is copied to element 5 through element 0 of a <2 x i64> load. The
; expected code is one 16-byte vmovups load from offset 0 and one 16-byte
; vmovups store to offset 32.
788 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
789 ; CHECK-LABEL: merge_vec_element_and_scalar_load:
791 ; CHECK-NEXT: vmovups (%rdi), %xmm0
792 ; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
794 %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
795 %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
796 %idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
797 %idx5 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 5
799 %a0 = load i64, i64* %idx0, align 8 ; scalar source
800 store i64 %a0, i64* %idx4, align 8
802 %b = bitcast i64* %idx1 to <2 x i64>*
803 %v = load <2 x i64>, <2 x i64>* %b, align 8 ; vector load starting at element 1
804 %a1 = extractelement <2 x i64> %v, i32 0 ; only lane 0 (array element 1) is used
805 store i64 %a1, i64* %idx5, align 8
810 ; Don't let a non-consecutive store thwart merging of the last two.
; The expected code writes single bytes at offsets 0 and 42 and merges the
; bytes at offsets 2 and 3 into one 16-bit store of 0x0302.
; NOTE(review): the store instructions themselves are not visible in this
; excerpt; the description above is taken from the expected assembly.
811 define void @almost_consecutive_stores(i8* %p) {
812 ; CHECK-LABEL: almost_consecutive_stores:
814 ; CHECK-NEXT: movb $0, (%rdi)
815 ; CHECK-NEXT: movb $1, 42(%rdi)
816 ; CHECK-NEXT: movw $770, 2(%rdi) # imm = 0x302
819 %p1 = getelementptr i8, i8* %p, i64 42 ; far-away byte, not adjacent to the others
821 %p2 = getelementptr i8, i8* %p, i64 2
823 %p3 = getelementptr i8, i8* %p, i64 3 ; adjacent to %p2
828 ; We should be able to merge these.
; Element 0 is extracted from a <4 x float> bitcast of %v, while elements 1..3
; are extracted as i32 and bitcast to float individually. All four are stored
; to consecutive float slots at %ptr; the expected code is a single 16-byte
; vmovups of the original vector register.
829 define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
830 ; CHECK-LABEL: merge_bitcast:
832 ; CHECK-NEXT: vmovups %xmm0, (%rdi)
834 %fv = bitcast <4 x i32> %v to <4 x float>
835 %vecext1 = extractelement <4 x i32> %v, i32 1
836 %vecext2 = extractelement <4 x i32> %v, i32 2
837 %vecext3 = extractelement <4 x i32> %v, i32 3
838 %f0 = extractelement <4 x float> %fv, i32 0 ; vector-level bitcast, then extract
839 %f1 = bitcast i32 %vecext1 to float ; extract, then scalar bitcast
840 %f2 = bitcast i32 %vecext2 to float
841 %f3 = bitcast i32 %vecext3 to float
842 %idx0 = getelementptr inbounds float, float* %ptr, i64 0
843 %idx1 = getelementptr inbounds float, float* %ptr, i64 1
844 %idx2 = getelementptr inbounds float, float* %ptr, i64 2
845 %idx3 = getelementptr inbounds float, float* %ptr, i64 3
846 store float %f0, float* %idx0, align 4
847 store float %f1, float* %idx1, align 4
848 store float %f2, float* %idx2, align 4
849 store float %f3, float* %idx3, align 4
853 ; Same as @merge_const_store with heterogeneous types.
; Each loop iteration stores four i8 constants (1..4) followed by one i32
; constant (134678021 = 0x08070605) into the first five fields of a %struct.C.
; The expected code writes them with a single 8-byte movq of
; 0x0807060504030201 per iteration.
854 define void @merge_const_store_heterogeneous(i32 %count, %struct.C* nocapture %p) nounwind uwtable noinline ssp {
855 ; CHECK-LABEL: merge_const_store_heterogeneous:
857 ; CHECK-NEXT: testl %edi, %edi
858 ; CHECK-NEXT: jle .LBB20_3
859 ; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
860 ; CHECK-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
861 ; CHECK-NEXT: .p2align 4, 0x90
862 ; CHECK-NEXT: .LBB20_2: # %.lr.ph
863 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
864 ; CHECK-NEXT: movq %rax, (%rsi)
865 ; CHECK-NEXT: addq $24, %rsi
866 ; CHECK-NEXT: decl %edi
867 ; CHECK-NEXT: jne .LBB20_2
868 ; CHECK-NEXT: .LBB20_3: # %._crit_edge
870 %1 = icmp sgt i32 %count, 0 ; enter the loop only when count > 0
871 br i1 %1, label %.lr.ph, label %._crit_edge
873 %i.02 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ] ; iteration counter
874 %.01 = phi %struct.C* [ %8, %.lr.ph ], [ %p, %0 ] ; struct written in this iteration
875 %2 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 0
876 store i8 1, i8* %2, align 1
877 %3 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 1
878 store i8 2, i8* %3, align 1
879 %4 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 2
880 store i8 3, i8* %4, align 1
881 %5 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 3
882 store i8 4, i8* %5, align 1
883 %6 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 0, i32 4
884 store i32 134678021, i32* %6, align 1 ; 0x08070605, supplies bytes 4..7 of the merged value
885 %7 = add nsw i32 %i.02, 1
886 %8 = getelementptr inbounds %struct.C, %struct.C* %.01, i64 1 ; advance to the next struct
887 %exitcond = icmp eq i32 %7, %count
888 br i1 %exitcond, label %._crit_edge, label %.lr.ph
893 ; Merging heterogeneous integer types.
; Four i8 fields and the following i32 field are copied from %p to the same
; fields of %q, with all loads issued before all stores. The expected code is
; one 8-byte movq load and one 8-byte movq store.
894 define void @merge_heterogeneous(%struct.C* nocapture %p, %struct.C* nocapture %q) {
895 ; CHECK-LABEL: merge_heterogeneous:
897 ; CHECK-NEXT: movq (%rdi), %rax
898 ; CHECK-NEXT: movq %rax, (%rsi)
900 %s0 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 0 ; source fields 0..4 of %p
901 %s1 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 1
902 %s2 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 2
903 %s3 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 3
904 %s4 = getelementptr inbounds %struct.C, %struct.C* %p, i64 0, i32 4
905 %d0 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 0 ; destination fields 0..4 of %q
906 %d1 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 1
907 %d2 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 2
908 %d3 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 3
909 %d4 = getelementptr inbounds %struct.C, %struct.C* %q, i64 0, i32 4
910 %v0 = load i8, i8* %s0, align 1
911 %v1 = load i8, i8* %s1, align 1
912 %v2 = load i8, i8* %s2, align 1
913 %v3 = load i8, i8* %s3, align 1
914 %v4 = load i32, i32* %s4, align 1
915 store i8 %v0, i8* %d0, align 1
916 store i8 %v1, i8* %d1, align 1
917 store i8 %v2, i8* %d2, align 1
918 store i8 %v3, i8* %d3, align 1
919 store i32 %v4, i32* %d4, align 4