; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
; 1: base: base1 + offset, offsets: (0, offset)
; 2: base: base1 + 3*offset, offsets: (0, offset)
; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; long long o4 = base1 + 4 * offset;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
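;
; A sketch of the commoned addressing implied by the two chains above (the
; locals c1/c2 are hypothetical names for illustration, not output of the
; pass):
;
; char *c1 = p + base1 + offset;     // chain 1 base, covers o1 and o2
; char *c2 = p + base1 + 3 * offset; // chain 2 base, covers o3 and o4
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(c1 + i);          // chain 1, offset 0
; unsigned long x2 = *(unsigned long *)(c1 + i + offset); // chain 1, offset `offset`
; unsigned long x3 = *(unsigned long *)(c2 + i);          // chain 2, offset 0
; unsigned long x4 = *(unsigned long *)(c2 + i + offset); // chain 2, offset `offset`
; sum += x1 * x2 * x3 * x4;
; }
;
; This matches the checked asm: two bases (r7, r5) advance each iteration and
; a single offset register (r4) is reused by both ldx loads.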
define i64 @two_chain_same_offset_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_same_offset_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: blt cr0, .LBB0_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: li r3, 0
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 3
%mul4 = shl nsw i64 %offset, 2
%cmp46 = icmp sgt i64 %n, 0
br i1 %cmp46, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
for.body: ; preds = %entry, %for.body
%sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
%i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.047, %base1
%add.ptr9.idx = add i64 %add, %offset
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%0 = bitcast i8* %add.ptr9 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr10.idx = add i64 %add, %mul
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%2 = bitcast i8* %add.ptr10 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr11.idx = add i64 %add, %mul2
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%4 = bitcast i8* %add.ptr11 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr12.idx = add i64 %add, %mul4
%add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
%6 = bitcast i8* %add.ptr12 to i64*
%7 = load i64, i64* %6, align 8
%mul13 = mul i64 %3, %1
%mul14 = mul i64 %mul13, %5
%mul15 = mul i64 %mul14, %7
%add16 = add i64 %mul15, %sum.048
%inc = add nuw nsw i64 %i.047, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; These addresses cannot all be commoned into chains, because that would leave
; a chain containing only a single address, and commoning is not profitable
; unless every address is covered by a chain.
; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; long long o4 = base1 + 4 * offset;
; long long o5 = base1 + 5 * offset;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; unsigned long x5 = *(unsigned long *)(p5 + i);
; sum += x1 * x2 * x3 * x4 * x5;
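;
; A worked sketch of the failure (a hypothetical grouping for illustration,
; not pass output): the five addresses sit at offset, 2*offset, 3*offset,
; 4*offset, and 5*offset from base1, so any split into 2-element chains,
; e.g. (o1,o2) and (o3,o4), strands one address (here o5) outside every
; chain. The checked asm therefore keeps one common base and five X-form
; loads instead of commoned chains.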
define i64 @not_perfect_chain_all_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: blt cr0, .LBB1_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sldi r9, r4, 2
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r10, r4, r9
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: # %for.body
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r11, r5, r7
; CHECK-NEXT: ldx r12, r5, r8
; CHECK-NEXT: ldx r0, r5, r9
; CHECK-NEXT: mulld r6, r11, r6
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r12
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: maddld r3, r6, r30, r3
; CHECK-NEXT: bdnz .LBB1_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 3
%mul4 = shl nsw i64 %offset, 2
%mul6 = mul nsw i64 %offset, 5
%cmp58 = icmp sgt i64 %n, 0
br i1 %cmp58, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
for.body: ; preds = %entry, %for.body
%sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
%i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.059, %base1
%add.ptr12.idx = add i64 %add, %offset
%add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
%0 = bitcast i8* %add.ptr12 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr13.idx = add i64 %add, %mul
%add.ptr13 = getelementptr inbounds i8, i8* %p, i64 %add.ptr13.idx
%2 = bitcast i8* %add.ptr13 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr14.idx = add i64 %add, %mul2
%add.ptr14 = getelementptr inbounds i8, i8* %p, i64 %add.ptr14.idx
%4 = bitcast i8* %add.ptr14 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr15.idx = add i64 %add, %mul4
%add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
%6 = bitcast i8* %add.ptr15 to i64*
%7 = load i64, i64* %6, align 8
%add.ptr16.idx = add i64 %add, %mul6
%add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
%8 = bitcast i8* %add.ptr16 to i64*
%9 = load i64, i64* %8, align 8
%mul17 = mul i64 %3, %1
%mul18 = mul i64 %mul17, %5
%mul19 = mul i64 %mul18, %7
%mul20 = mul i64 %mul19, %9
%add21 = add i64 %mul20, %sum.060
%inc = add nuw nsw i64 %i.059, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; Commoning requires at least 4 addresses, so that 2 chains can reuse at least 1 offset.
; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; sum += x1 * x2 * x3;
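;
; A worked sketch (a hypothetical grouping for illustration, not pass output):
; with only the three addresses base1, base1 + 2*offset, and base1 + 3*offset,
; at most one 2-element chain can be formed, e.g. (o2,o3) with offsets
; (0, offset), leaving o1 on its own. No offset is reused across two chains,
; so commoning is skipped and the asm keeps plain X-form loads.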
define i64 @no_enough_elements_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_enough_elements_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: blt cr0, .LBB2_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r4, r4, r7
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB2_2: # %for.body
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r8, r5, r7
; CHECK-NEXT: ldx r9, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: maddld r3, r6, r9, r3
; CHECK-NEXT: bdnz .LBB2_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: li r3, 0
%mul = shl nsw i64 %offset, 1
%mul1 = mul nsw i64 %offset, 3
%cmp32 = icmp sgt i64 %n, 0
br i1 %cmp32, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
for.body: ; preds = %entry, %for.body
%sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
%i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add.ptr5.idx = add i64 %i.033, %base1
%add.ptr5 = getelementptr inbounds i8, i8* %p, i64 %add.ptr5.idx
%0 = bitcast i8* %add.ptr5 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr6.idx = add i64 %add.ptr5.idx, %mul
%add.ptr6 = getelementptr inbounds i8, i8* %p, i64 %add.ptr6.idx
%2 = bitcast i8* %add.ptr6 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
%add.ptr7 = getelementptr inbounds i8, i8* %p, i64 %add.ptr7.idx
%4 = bitcast i8* %add.ptr7 to i64*
%5 = load i64, i64* %4, align 8
%mul8 = mul i64 %3, %1
%mul9 = mul i64 %mul8, %5
%add10 = add i64 %mul9, %sum.034
%inc = add nuw nsw i64 %i.033, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; The diff between address 2 and address 1 is 2*offset, and this offset is not
; reused by any other chain, so we cannot common any chains.
; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 4 * offset;
; long long o4 = base1 + 7 * offset;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
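;
; Worked diffs (a sketch for illustration): o2 - o1 = 2*offset,
; o3 - o2 = 2*offset, and o4 - o3 = 3*offset. The candidate chains (o1,o2)
; and (o3,o4) would need matching member offsets, but (0, 2*offset) does not
; match (0, 3*offset), so nothing is commoned and the asm keeps independent
; X-form loads.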
define i64 @no_reuseable_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_reuseable_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: blt cr0, .LBB3_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: sub r4, r9, r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB3_2: # %for.body
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r9, r5, r7
; CHECK-NEXT: ldx r10, r5, r8
; CHECK-NEXT: ldx r11, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r9, r6
; CHECK-NEXT: mulld r6, r6, r10
; CHECK-NEXT: maddld r3, r6, r11, r3
; CHECK-NEXT: bdnz .LBB3_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: li r3, 0
%mul = shl nsw i64 %offset, 1
%mul1 = shl nsw i64 %offset, 2
%mul3 = mul nsw i64 %offset, 7
%cmp44 = icmp sgt i64 %n, 0
br i1 %cmp44, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
for.body: ; preds = %entry, %for.body
%sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
%i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add.ptr8.idx = add i64 %i.045, %base1
%add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
%0 = bitcast i8* %add.ptr8 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr9.idx = add i64 %add.ptr8.idx, %mul
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%2 = bitcast i8* %add.ptr9 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%4 = bitcast i8* %add.ptr10 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%6 = bitcast i8* %add.ptr11 to i64*
%7 = load i64, i64* %6, align 8
%mul12 = mul i64 %3, %1
%mul13 = mul i64 %mul12, %5
%mul14 = mul i64 %mul13, %7
%add15 = add i64 %mul14, %sum.046
%inc = add nuw nsw i64 %i.045, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5,
; but the diff between address 3 and address 2 (3*offset) is not the same as the diff between address 6
; and address 5 (2*offset), so we cannot common chains for these addresses.
; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 5 * offset;
; long long o4 = base1 + 7 * offset;
; long long o5 = base1 + 8 * offset;
; long long o6 = base1 + 10 * offset;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; unsigned long x5 = *(unsigned long *)(p5 + i);
; unsigned long x6 = *(unsigned long *)(p6 + i);
; sum += x1 * x2 * x3 * x4 * x5 * x6;
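;
; Worked diffs (a sketch for illustration): o2 - o1 = offset and
; o5 - o4 = offset, so (o1,o2) and (o4,o5) could seed two chains, but
; extending them would require o3 - o2 = 3*offset to match
; o6 - o5 = 2*offset, which fails, so all six addresses stay uncommoned.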
define i64 @not_same_offset_fail(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: blt cr0, .LBB4_3
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: mulli r11, r4, 10
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r8, r4, r8
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sub r10, r9, r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_2: # %for.body
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r12, r5, r7
; CHECK-NEXT: ldx r0, r5, r8
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: mulld r6, r12, r6
; CHECK-NEXT: ldx r29, r5, r9
; CHECK-NEXT: ldx r28, r5, r11
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: mulld r6, r6, r30
; CHECK-NEXT: mulld r6, r6, r29
; CHECK-NEXT: maddld r3, r6, r28, r3
; CHECK-NEXT: bdnz .LBB4_2
; CHECK-NEXT: b .LBB4_4
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .LBB4_4: # %for.cond.cleanup
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 5
%mul4 = mul nsw i64 %offset, 7
%mul6 = shl nsw i64 %offset, 3
%mul8 = mul nsw i64 %offset, 10
%cmp70 = icmp sgt i64 %n, 0
br i1 %cmp70, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
for.body: ; preds = %entry, %for.body
%sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
%i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.071, %base1
%add.ptr15.idx = add i64 %add, %offset
%add.ptr15 = getelementptr inbounds i8, i8* %p, i64 %add.ptr15.idx
%0 = bitcast i8* %add.ptr15 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr16.idx = add i64 %add, %mul
%add.ptr16 = getelementptr inbounds i8, i8* %p, i64 %add.ptr16.idx
%2 = bitcast i8* %add.ptr16 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr17.idx = add i64 %add, %mul2
%add.ptr17 = getelementptr inbounds i8, i8* %p, i64 %add.ptr17.idx
%4 = bitcast i8* %add.ptr17 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr18.idx = add i64 %add, %mul4
%add.ptr18 = getelementptr inbounds i8, i8* %p, i64 %add.ptr18.idx
%6 = bitcast i8* %add.ptr18 to i64*
%7 = load i64, i64* %6, align 8
%add.ptr19.idx = add i64 %add, %mul6
%add.ptr19 = getelementptr inbounds i8, i8* %p, i64 %add.ptr19.idx
%8 = bitcast i8* %add.ptr19 to i64*
%9 = load i64, i64* %8, align 8
%add.ptr20.idx = add i64 %add, %mul8
%add.ptr20 = getelementptr inbounds i8, i8* %p, i64 %add.ptr20.idx
%10 = bitcast i8* %add.ptr20 to i64*
%11 = load i64, i64* %10, align 8
%mul21 = mul i64 %3, %1
%mul22 = mul i64 %mul21, %5
%mul23 = mul i64 %mul22, %7
%mul24 = mul i64 %mul23, %9
%mul25 = mul i64 %mul24, %11
%add26 = add i64 %mul25, %sum.072
%inc = add nuw nsw i64 %i.071, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base1 + 4*offset, offsets: (0, 2*offset)
; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 3 * offset;
; long long o3 = base1 + 4 * offset;
; long long o4 = base1 + 6 * offset;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
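;
; A sketch of the commoned addressing implied by the two chains above (c1/c2
; are hypothetical names for illustration, not output of the pass):
;
; char *c1 = p + base1 + offset;     // chain 1 base, covers o1 and o2
; char *c2 = p + base1 + 4 * offset; // chain 2 base, covers o3 and o4
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(c1 + i);
; unsigned long x2 = *(unsigned long *)(c1 + i + 2 * offset);
; unsigned long x3 = *(unsigned long *)(c2 + i);
; unsigned long x4 = *(unsigned long *)(c2 + i + 2 * offset);
; sum += x1 * x2 * x3 * x4;
; }
;
; In the checked asm the shared element offset 2*offset lives in r4 and is
; reused by both chains' ldx loads.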
define i64 @two_chain_different_offsets_succ(i8* %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_different_offsets_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: blt cr0, .LBB5_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_2: # %for.body
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB5_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: li r3, 0
%mul = mul nsw i64 %offset, 3
%mul2 = shl nsw i64 %offset, 2
%mul4 = mul nsw i64 %offset, 6
%cmp46 = icmp sgt i64 %n, 0
br i1 %cmp46, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
for.body: ; preds = %entry, %for.body
%sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
%i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.047, %base1
%add.ptr9.idx = add i64 %add, %offset
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%0 = bitcast i8* %add.ptr9 to i64*
%1 = load i64, i64* %0, align 8
%add.ptr10.idx = add i64 %add, %mul
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%2 = bitcast i8* %add.ptr10 to i64*
%3 = load i64, i64* %2, align 8
%add.ptr11.idx = add i64 %add, %mul2
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%4 = bitcast i8* %add.ptr11 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr12.idx = add i64 %add, %mul4
%add.ptr12 = getelementptr inbounds i8, i8* %p, i64 %add.ptr12.idx
%6 = bitcast i8* %add.ptr12 to i64*
%7 = load i64, i64* %6, align 8
%mul13 = mul i64 %3, %1
%mul14 = mul i64 %mul13, %5
%mul15 = mul i64 %mul14, %7
%add16 = add i64 %mul15, %sum.048
%inc = add nuw nsw i64 %i.047, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; 3: + base2 - base1 - 2*offset
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base2 + offset, offsets: (0, 2*offset)
; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 3 * offset;
; long long o3 = base2 + offset;
; long long o4 = base2 + 3 * offset;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
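;
; A sketch of the commoned addressing implied by the two chains above (c1/c2
; are hypothetical names for illustration, not output of the pass):
;
; char *c1 = p + base1 + offset; // chain 1 base, covers o1 and o2
; char *c2 = p + base2 + offset; // chain 2 base, covers o3 and o4
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(c1 + i);
; unsigned long x2 = *(unsigned long *)(c1 + i + 2 * offset);
; unsigned long x3 = *(unsigned long *)(c2 + i);
; unsigned long x4 = *(unsigned long *)(c2 + i + 2 * offset);
; sum += x1 * x2 * x3 * x4;
; }
;
; Even with two different bases, the single offset 2*offset (r4 in the
; checked asm) is reused across both chains.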
define i64 @two_chain_two_bases_succ(i8* %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
; CHECK-LABEL: two_chain_two_bases_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r7, 1
; CHECK-NEXT: blt cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: add r5, r5, r4
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: add r6, r3, r6
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB6_2: # %for.body
; CHECK-NEXT: ld r7, 0(r5)
; CHECK-NEXT: ldx r8, r5, r4
; CHECK-NEXT: ld r9, 0(r6)
; CHECK-NEXT: ldx r10, r6, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: addi r6, r6, 1
; CHECK-NEXT: mulld r7, r8, r7
; CHECK-NEXT: mulld r7, r7, r9
; CHECK-NEXT: maddld r3, r7, r10, r3
; CHECK-NEXT: bdnz .LBB6_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: .LBB6_4:
; CHECK-NEXT: li r3, 0
%mul = mul nsw i64 %offset, 3
%cmp44 = icmp sgt i64 %n, 0
br i1 %cmp44, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
for.body: ; preds = %entry, %for.body
%sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
%i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.045, %base1
%add.ptr8.idx = add i64 %add, %offset
%add.ptr8 = getelementptr inbounds i8, i8* %p, i64 %add.ptr8.idx
%0 = bitcast i8* %add.ptr8 to i64*
%1 = load i64, i64* %0, align 8
%add1 = add i64 %i.045, %mul
%add.ptr9.idx = add i64 %add1, %base1
%add.ptr9 = getelementptr inbounds i8, i8* %p, i64 %add.ptr9.idx
%2 = bitcast i8* %add.ptr9 to i64*
%3 = load i64, i64* %2, align 8
%add2 = add i64 %i.045, %base2
%add.ptr10.idx = add i64 %add2, %offset
%add.ptr10 = getelementptr inbounds i8, i8* %p, i64 %add.ptr10.idx
%4 = bitcast i8* %add.ptr10 to i64*
%5 = load i64, i64* %4, align 8
%add.ptr11.idx = add i64 %add2, %mul
%add.ptr11 = getelementptr inbounds i8, i8* %p, i64 %add.ptr11.idx
%6 = bitcast i8* %add.ptr11 to i64*
%7 = load i64, i64* %6, align 8
%mul12 = mul i64 %3, %1
%mul13 = mul i64 %mul12, %5
%mul14 = mul i64 %mul13, %7
%add15 = add i64 %mul14, %sum.046
%inc = add nuw nsw i64 %i.045, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
; Check that chain commoning reduces register pressure and saves register spills/reloads.
; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
; for (long long i = 0; i < 4 * m; i++) {
; output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
; output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
; output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
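;
; A sketch of the commoned addressing (b, d2, d3 are hypothetical names for
; illustration, not output of the pass):
;
; double *b = output + inc + inc1; // one such chain base per array
; long long d2 = inc2 - inc1;      // first reused offset
; long long d3 = inc3 - inc1;      // second reused offset
;
; Each iteration then accesses b[0], b[d2], and b[d3] for input1, input2, and
; output, so only the chain bases advance across iterations while d2 and d3
; stay fixed in two registers, which is what keeps the spill/reload count in
; the checked asm down.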
define signext i32 @spill_reduce_succ(double* %input1, double* %input2, double* %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
; CHECK-LABEL: spill_reduce_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r9, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
; CHECK-NEXT: blt cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r6, r6, 2
; CHECK-NEXT: li r7, 1
; CHECK-NEXT: mr r12, r10
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: iselgt r7, r6, r7
; CHECK-NEXT: addi r8, r7, -1
; CHECK-NEXT: clrldi r6, r7, 63
; CHECK-NEXT: cmpldi r8, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
; CHECK-NEXT: rldicl r7, r7, 62, 2
; CHECK-NEXT: sldi r10, r12, 2
; CHECK-NEXT: ld r2, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: rldicl r7, r7, 2, 1
; CHECK-NEXT: std r7, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r8, r7, r10
; CHECK-NEXT: mr r22, r7
; CHECK-NEXT: mr r7, r4
; CHECK-NEXT: mr r4, r3
; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: sldi r8, r8, 3
; CHECK-NEXT: add r9, r5, r8
; CHECK-NEXT: add r8, r3, r10
; CHECK-NEXT: add r10, r2, r10
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: sldi r8, r8, 3
; CHECK-NEXT: add r30, r5, r10
; CHECK-NEXT: add r29, r7, r10
; CHECK-NEXT: add r28, r4, r10
; CHECK-NEXT: sldi r10, r12, 1
; CHECK-NEXT: add r8, r5, r8
; CHECK-NEXT: add r11, r12, r10
; CHECK-NEXT: add r0, r22, r11
; CHECK-NEXT: sldi r0, r0, 3
; CHECK-NEXT: add r27, r5, r0
; CHECK-NEXT: add r0, r3, r11
; CHECK-NEXT: add r11, r2, r11
; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: sldi r0, r0, 3
; CHECK-NEXT: add r25, r5, r11
; CHECK-NEXT: add r24, r7, r11
; CHECK-NEXT: add r23, r4, r11
; CHECK-NEXT: add r11, r22, r10
; CHECK-NEXT: add r26, r5, r0
; CHECK-NEXT: mr r0, r22
; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: add r22, r5, r11
; CHECK-NEXT: add r11, r3, r10
; CHECK-NEXT: add r10, r2, r10
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: sldi r11, r11, 3
; CHECK-NEXT: add r20, r5, r10
; CHECK-NEXT: add r19, r7, r10
; CHECK-NEXT: add r18, r4, r10
; CHECK-NEXT: add r10, r12, r0
; CHECK-NEXT: add r21, r5, r11
; CHECK-NEXT: sldi r11, r2, 3
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r17, r5, r10
; CHECK-NEXT: add r10, r12, r3
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r16, r5, r10
; CHECK-NEXT: add r10, r12, r2
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r15, r5, r10
; CHECK-NEXT: add r14, r7, r10
; CHECK-NEXT: add r31, r4, r10
; CHECK-NEXT: sldi r10, r3, 3
; CHECK-NEXT: mr r3, r4
; CHECK-NEXT: mr r4, r7
; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: sub r0, r10, r11
; CHECK-NEXT: sldi r10, r7, 3
; CHECK-NEXT: ld r7, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: sub r2, r10, r11
; CHECK-NEXT: li r11, 0
; CHECK-NEXT: mr r10, r12
; CHECK-NEXT: addi r7, r7, -4
; CHECK-NEXT: rldicl r7, r7, 62, 2
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r7, r12, 5
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: lfd f0, 0(r31)
; CHECK-NEXT: lfd f1, 0(r14)
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r15)
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: add r10, r10, r12
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r15)
; CHECK-NEXT: add r15, r15, r7
; CHECK-NEXT: lfdx f0, r31, r0
; CHECK-NEXT: lfdx f1, r14, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r16, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r16, r11
; CHECK-NEXT: lfdx f0, r31, r2
; CHECK-NEXT: lfdx f1, r14, r2
; CHECK-NEXT: add r31, r31, r7
; CHECK-NEXT: add r14, r14, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r17, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r17, r11
; CHECK-NEXT: lfd f0, 0(r18)
; CHECK-NEXT: lfd f1, 0(r19)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r20, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r20, r11
; CHECK-NEXT: lfdx f0, r18, r0
; CHECK-NEXT: lfdx f1, r19, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r21, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r21, r11
; CHECK-NEXT: lfdx f0, r18, r2
; CHECK-NEXT: lfdx f1, r19, r2
; CHECK-NEXT: add r18, r18, r7
; CHECK-NEXT: add r19, r19, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r22, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r22, r11
; CHECK-NEXT: lfd f0, 0(r23)
; CHECK-NEXT: lfd f1, 0(r24)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r25, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r25, r11
; CHECK-NEXT: lfdx f0, r23, r0
; CHECK-NEXT: lfdx f1, r24, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r26, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r26, r11
; CHECK-NEXT: lfdx f0, r23, r2
; CHECK-NEXT: lfdx f1, r24, r2
; CHECK-NEXT: add r23, r23, r7
; CHECK-NEXT: add r24, r24, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r27, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r27, r11
; CHECK-NEXT: lfd f0, 0(r28)
; CHECK-NEXT: lfd f1, 0(r29)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r30, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r30, r11
; CHECK-NEXT: lfdx f0, r28, r0
; CHECK-NEXT: lfdx f1, r29, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r8, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r8, r11
; CHECK-NEXT: lfdx f0, r28, r2
; CHECK-NEXT: lfdx f1, r29, r2
; CHECK-NEXT: add r28, r28, r7
; CHECK-NEXT: add r29, r29, r7
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r9, r11
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r9, r11
; CHECK-NEXT: add r11, r11, r7
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
; CHECK-NEXT: sldi r8, r12, 3
; CHECK-NEXT: ld r12, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r7, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r12, r10, r12
; CHECK-NEXT: add r7, r10, r7
; CHECK-NEXT: sldi r0, r12, 3
; CHECK-NEXT: sldi r11, r7, 3
; CHECK-NEXT: add r12, r5, r0
; CHECK-NEXT: add r30, r4, r0
; CHECK-NEXT: add r29, r3, r0
; CHECK-NEXT: ld r0, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r7, r5, r11
; CHECK-NEXT: add r9, r4, r11
; CHECK-NEXT: add r11, r3, r11
; CHECK-NEXT: add r10, r10, r0
; CHECK-NEXT: sldi r10, r10, 3
; CHECK-NEXT: add r5, r5, r10
; CHECK-NEXT: add r4, r4, r10
; CHECK-NEXT: add r3, r3, r10
; CHECK-NEXT: li r10, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: lfdx f0, r3, r10
; CHECK-NEXT: lfdx f1, r4, r10
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r5)
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: lfdx f0, r29, r10
; CHECK-NEXT: lfdx f1, r30, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r12, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r12, r10
; CHECK-NEXT: lfdx f0, r11, r10
; CHECK-NEXT: lfdx f1, r9, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r7, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r7, r10
; CHECK-NEXT: add r10, r10, r8
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload
%cmp49 = icmp sgt i64 %m, 0
br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
%1 = add nsw i64 %smax52, -1
%xtraiter = and i64 %smax52, 1
%2 = icmp ult i64 %1, 3
br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
for.body.preheader.new: ; preds = %for.body.preheader
%unroll_iter = and i64 %smax52, 9223372036854775804
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
%inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
%inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
%add.epil = add nsw i64 %inc.addr.050.epil, %inc1
%arrayidx.epil = getelementptr inbounds double, double* %input1, i64 %add.epil
%3 = load double, double* %arrayidx.epil, align 8
%arrayidx2.epil = getelementptr inbounds double, double* %input2, i64 %add.epil
%4 = load double, double* %arrayidx2.epil, align 8
%mul3.epil = fmul double %3, %4
%arrayidx5.epil = getelementptr inbounds double, double* %output, i64 %add.epil
%5 = load double, double* %arrayidx5.epil, align 8
%add6.epil = fadd double %5, %mul3.epil
store double %add6.epil, double* %arrayidx5.epil, align 8
%add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
%arrayidx8.epil = getelementptr inbounds double, double* %input1, i64 %add7.epil
%6 = load double, double* %arrayidx8.epil, align 8
%arrayidx10.epil = getelementptr inbounds double, double* %input2, i64 %add7.epil
%7 = load double, double* %arrayidx10.epil, align 8
%mul11.epil = fmul double %6, %7
%arrayidx13.epil = getelementptr inbounds double, double* %output, i64 %add7.epil
%8 = load double, double* %arrayidx13.epil, align 8
%add14.epil = fadd double %8, %mul11.epil
store double %add14.epil, double* %arrayidx13.epil, align 8
%add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
%arrayidx16.epil = getelementptr inbounds double, double* %input1, i64 %add15.epil
%9 = load double, double* %arrayidx16.epil, align 8
%arrayidx18.epil = getelementptr inbounds double, double* %input2, i64 %add15.epil
%10 = load double, double* %arrayidx18.epil, align 8
%mul19.epil = fmul double %9, %10
%arrayidx21.epil = getelementptr inbounds double, double* %output, i64 %add15.epil
%11 = load double, double* %arrayidx21.epil, align 8
%add22.epil = fadd double %11, %mul19.epil
store double %add22.epil, double* %arrayidx21.epil, align 8
%add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
%epil.iter.sub = add nsw i64 %epil.iter, -1
%epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil
for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
for.body: ; preds = %for.body, %for.body.preheader.new
%inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
%niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
%add = add nsw i64 %inc.addr.050, %inc1
%arrayidx = getelementptr inbounds double, double* %input1, i64 %add
%12 = load double, double* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds double, double* %input2, i64 %add
%13 = load double, double* %arrayidx2, align 8
%mul3 = fmul double %12, %13
%arrayidx5 = getelementptr inbounds double, double* %output, i64 %add
%14 = load double, double* %arrayidx5, align 8
%add6 = fadd double %14, %mul3
store double %add6, double* %arrayidx5, align 8
%add7 = add nsw i64 %inc.addr.050, %inc2
%arrayidx8 = getelementptr inbounds double, double* %input1, i64 %add7
%15 = load double, double* %arrayidx8, align 8
%arrayidx10 = getelementptr inbounds double, double* %input2, i64 %add7
%16 = load double, double* %arrayidx10, align 8
%mul11 = fmul double %15, %16
%arrayidx13 = getelementptr inbounds double, double* %output, i64 %add7
%17 = load double, double* %arrayidx13, align 8
%add14 = fadd double %17, %mul11
store double %add14, double* %arrayidx13, align 8
%add15 = add nsw i64 %inc.addr.050, %inc3
%arrayidx16 = getelementptr inbounds double, double* %input1, i64 %add15
%18 = load double, double* %arrayidx16, align 8
%arrayidx18 = getelementptr inbounds double, double* %input2, i64 %add15
%19 = load double, double* %arrayidx18, align 8
%mul19 = fmul double %18, %19
%arrayidx21 = getelementptr inbounds double, double* %output, i64 %add15
%20 = load double, double* %arrayidx21, align 8
%add22 = fadd double %20, %mul19
store double %add22, double* %arrayidx21, align 8
%add23 = add nsw i64 %inc.addr.050, %inc4
%add.1 = add nsw i64 %add23, %inc1
%arrayidx.1 = getelementptr inbounds double, double* %input1, i64 %add.1
%21 = load double, double* %arrayidx.1, align 8
%arrayidx2.1 = getelementptr inbounds double, double* %input2, i64 %add.1
%22 = load double, double* %arrayidx2.1, align 8
%mul3.1 = fmul double %21, %22
%arrayidx5.1 = getelementptr inbounds double, double* %output, i64 %add.1
%23 = load double, double* %arrayidx5.1, align 8
%add6.1 = fadd double %23, %mul3.1
store double %add6.1, double* %arrayidx5.1, align 8
%add7.1 = add nsw i64 %add23, %inc2
%arrayidx8.1 = getelementptr inbounds double, double* %input1, i64 %add7.1
%24 = load double, double* %arrayidx8.1, align 8
%arrayidx10.1 = getelementptr inbounds double, double* %input2, i64 %add7.1
%25 = load double, double* %arrayidx10.1, align 8
%mul11.1 = fmul double %24, %25
%arrayidx13.1 = getelementptr inbounds double, double* %output, i64 %add7.1
%26 = load double, double* %arrayidx13.1, align 8
%add14.1 = fadd double %26, %mul11.1
store double %add14.1, double* %arrayidx13.1, align 8
%add15.1 = add nsw i64 %add23, %inc3
%arrayidx16.1 = getelementptr inbounds double, double* %input1, i64 %add15.1
%27 = load double, double* %arrayidx16.1, align 8
%arrayidx18.1 = getelementptr inbounds double, double* %input2, i64 %add15.1
%28 = load double, double* %arrayidx18.1, align 8
%mul19.1 = fmul double %27, %28
%arrayidx21.1 = getelementptr inbounds double, double* %output, i64 %add15.1
%29 = load double, double* %arrayidx21.1, align 8
%add22.1 = fadd double %29, %mul19.1
store double %add22.1, double* %arrayidx21.1, align 8
%add23.1 = add nsw i64 %add23, %inc4
%add.2 = add nsw i64 %add23.1, %inc1
%arrayidx.2 = getelementptr inbounds double, double* %input1, i64 %add.2
%30 = load double, double* %arrayidx.2, align 8
%arrayidx2.2 = getelementptr inbounds double, double* %input2, i64 %add.2
%31 = load double, double* %arrayidx2.2, align 8
%mul3.2 = fmul double %30, %31
%arrayidx5.2 = getelementptr inbounds double, double* %output, i64 %add.2
%32 = load double, double* %arrayidx5.2, align 8
%add6.2 = fadd double %32, %mul3.2
store double %add6.2, double* %arrayidx5.2, align 8
%add7.2 = add nsw i64 %add23.1, %inc2
%arrayidx8.2 = getelementptr inbounds double, double* %input1, i64 %add7.2
%33 = load double, double* %arrayidx8.2, align 8
%arrayidx10.2 = getelementptr inbounds double, double* %input2, i64 %add7.2
%34 = load double, double* %arrayidx10.2, align 8
%mul11.2 = fmul double %33, %34
%arrayidx13.2 = getelementptr inbounds double, double* %output, i64 %add7.2
%35 = load double, double* %arrayidx13.2, align 8
%add14.2 = fadd double %35, %mul11.2
store double %add14.2, double* %arrayidx13.2, align 8
%add15.2 = add nsw i64 %add23.1, %inc3
%arrayidx16.2 = getelementptr inbounds double, double* %input1, i64 %add15.2
%36 = load double, double* %arrayidx16.2, align 8
%arrayidx18.2 = getelementptr inbounds double, double* %input2, i64 %add15.2
%37 = load double, double* %arrayidx18.2, align 8
%mul19.2 = fmul double %36, %37
%arrayidx21.2 = getelementptr inbounds double, double* %output, i64 %add15.2
%38 = load double, double* %arrayidx21.2, align 8
%add22.2 = fadd double %38, %mul19.2
store double %add22.2, double* %arrayidx21.2, align 8
%add23.2 = add nsw i64 %add23.1, %inc4
%add.3 = add nsw i64 %add23.2, %inc1
%arrayidx.3 = getelementptr inbounds double, double* %input1, i64 %add.3
%39 = load double, double* %arrayidx.3, align 8
%arrayidx2.3 = getelementptr inbounds double, double* %input2, i64 %add.3
%40 = load double, double* %arrayidx2.3, align 8
%mul3.3 = fmul double %39, %40
%arrayidx5.3 = getelementptr inbounds double, double* %output, i64 %add.3
%41 = load double, double* %arrayidx5.3, align 8
%add6.3 = fadd double %41, %mul3.3
store double %add6.3, double* %arrayidx5.3, align 8
%add7.3 = add nsw i64 %add23.2, %inc2
%arrayidx8.3 = getelementptr inbounds double, double* %input1, i64 %add7.3
%42 = load double, double* %arrayidx8.3, align 8
%arrayidx10.3 = getelementptr inbounds double, double* %input2, i64 %add7.3
%43 = load double, double* %arrayidx10.3, align 8
%mul11.3 = fmul double %42, %43
%arrayidx13.3 = getelementptr inbounds double, double* %output, i64 %add7.3
%44 = load double, double* %arrayidx13.3, align 8
%add14.3 = fadd double %44, %mul11.3
store double %add14.3, double* %arrayidx13.3, align 8
%add15.3 = add nsw i64 %add23.2, %inc3
%arrayidx16.3 = getelementptr inbounds double, double* %input1, i64 %add15.3
%45 = load double, double* %arrayidx16.3, align 8
%arrayidx18.3 = getelementptr inbounds double, double* %input2, i64 %add15.3
%46 = load double, double* %arrayidx18.3, align 8
%mul19.3 = fmul double %45, %46
%arrayidx21.3 = getelementptr inbounds double, double* %output, i64 %add15.3
%47 = load double, double* %arrayidx21.3, align 8
%add22.3 = fadd double %47, %mul19.3
store double %add22.3, double* %arrayidx21.3, align 8
%add23.3 = add nsw i64 %add23.2, %inc4
%niter.nsub.3 = add i64 %niter, -4
%niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
declare i64 @llvm.smax.i64(i64, i64)