1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
3 ; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
12 ; 1: base: base1 + offset, offsets: (0, offset)
13 ; 2: base: base1 + 3*offset, offsets: (0, offset)
15 ; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
16 ; long long o1 = base1 + offset;
17 ; long long o2 = base1 + 2 * offset;
18 ; long long o3 = base1 + 3 * offset;
19 ; long long o4 = base1 + 4 * offset;
25 ; for (long long i = 0; i < n; ++i) {
26 ; unsigned long x1 = *(unsigned long *)(p1 + i);
27 ; unsigned long x2 = *(unsigned long *)(p2 + i);
28 ; unsigned long x3 = *(unsigned long *)(p3 + i);
29 ; unsigned long x4 = *(unsigned long *)(p4 + i);
30 ; sum += x1 * x2 * x3 * x4;
; Test case: each loop iteration loads four i64 values from
; p + base1 + k*offset + i (k = 1, 2, 3, 4), multiplies them together and adds
; the product to a running sum; the loop runs while i < n.
; The expected loop body below keeps two pointer registers (r7 and r5); each is
; read once with `ld 0(rX)` and once with `ldx rX, r4`, so the single index
; register r4 is shared by both pointer registers.
35 define i64 @two_chain_same_offset_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
36 ; CHECK-LABEL: two_chain_same_offset_succ:
37 ; CHECK: # %bb.0: # %entry
38 ; CHECK-NEXT: cmpdi r6, 0
39 ; CHECK-NEXT: ble cr0, .LBB0_4
40 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
41 ; CHECK-NEXT: sldi r7, r4, 1
42 ; CHECK-NEXT: mtctr r6
43 ; CHECK-NEXT: add r8, r4, r7
44 ; CHECK-NEXT: add r7, r5, r4
45 ; CHECK-NEXT: add r5, r5, r8
46 ; CHECK-NEXT: add r7, r3, r7
47 ; CHECK-NEXT: add r5, r3, r5
48 ; CHECK-NEXT: li r3, 0
49 ; CHECK-NEXT: .p2align 4
50 ; CHECK-NEXT: .LBB0_2: # %for.body
52 ; CHECK-NEXT: ld r6, 0(r7)
53 ; CHECK-NEXT: ldx r8, r7, r4
54 ; CHECK-NEXT: ld r9, 0(r5)
55 ; CHECK-NEXT: ldx r10, r5, r4
56 ; CHECK-NEXT: addi r7, r7, 1
57 ; CHECK-NEXT: addi r5, r5, 1
58 ; CHECK-NEXT: mulld r6, r8, r6
59 ; CHECK-NEXT: mulld r6, r6, r9
60 ; CHECK-NEXT: maddld r3, r6, r10, r3
61 ; CHECK-NEXT: bdnz .LBB0_2
62 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
64 ; CHECK-NEXT: .LBB0_4:
65 ; CHECK-NEXT: li r3, 0
; Loop-invariant multiples of %offset: %mul = 2*offset, %mul2 = 3*offset,
; %mul4 = 4*offset.
68 %mul = shl nsw i64 %offset, 1
69 %mul2 = mul nsw i64 %offset, 3
70 %mul4 = shl nsw i64 %offset, 2
; Enter the loop only when n > 0; otherwise go straight to the cleanup block.
71 %cmp46 = icmp sgt i64 %n, 0
72 br i1 %cmp46, label %for.body, label %for.cond.cleanup
74 for.cond.cleanup: ; preds = %for.body, %entry
; 0 when arriving from %entry, otherwise the sum produced by the last iteration.
75 %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
78 for.body: ; preds = %entry, %for.body
79 %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
80 %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
; %add = i + base1 is the part of the byte index common to all four loads.
81 %add = add i64 %i.047, %base1
; Load 1: p + base1 + offset + i
82 %add.ptr9.idx = add i64 %add, %offset
83 %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
84 %0 = load i64, ptr %add.ptr9, align 8
; Load 2: p + base1 + 2*offset + i
85 %add.ptr10.idx = add i64 %add, %mul
86 %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
87 %1 = load i64, ptr %add.ptr10, align 8
; Load 3: p + base1 + 3*offset + i
88 %add.ptr11.idx = add i64 %add, %mul2
89 %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
90 %2 = load i64, ptr %add.ptr11, align 8
; Load 4: p + base1 + 4*offset + i
91 %add.ptr12.idx = add i64 %add, %mul4
92 %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
93 %3 = load i64, ptr %add.ptr12, align 8
; sum += load1 * load2 * load3 * load4
94 %mul13 = mul i64 %1, %0
95 %mul14 = mul i64 %mul13, %2
96 %mul15 = mul i64 %mul14, %3
97 %add16 = add i64 %mul15, %sum.048
; Advance i by 1 and leave the loop once i == n.
98 %inc = add nuw nsw i64 %i.047, 1
99 %exitcond.not = icmp eq i64 %inc, %n
100 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
110 ; The addresses cannot be commoned into chains because one chain would have to hold just a single address.
111 ; Commoning chains is not profitable unless all addresses belong to chains.
113 ; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
114 ; long long o1 = base1 + offset;
115 ; long long o2 = base1 + 2 * offset;
116 ; long long o3 = base1 + 3 * offset;
117 ; long long o4 = base1 + 4 * offset;
118 ; long long o5 = base1 + 5 * offset;
125 ; for (long long i = 0; i < n; ++i) {
126 ; unsigned long x1 = *(unsigned long *)(p1 + i);
127 ; unsigned long x2 = *(unsigned long *)(p2 + i);
128 ; unsigned long x3 = *(unsigned long *)(p3 + i);
129 ; unsigned long x4 = *(unsigned long *)(p4 + i);
130 ; unsigned long x5 = *(unsigned long *)(p5 + i);
131 ; sum += x1 * x2 * x3 * x4 * x5;
; Test case: each loop iteration loads five i64 values from
; p + base1 + k*offset + i (k = 1, 2, 3, 4, 5), multiplies them together and
; adds the product to a running sum; the loop runs while i < n.
; The expected loop body below uses a single pointer register (r5) and five
; `ldx` loads, each with its own index register (r4, r7, r8, r9, r10), and
; r30 is spilled before the loop and reloaded after it.
136 define i64 @not_perfect_chain_all_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
137 ; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
138 ; CHECK: # %bb.0: # %entry
139 ; CHECK-NEXT: cmpdi r6, 0
140 ; CHECK-NEXT: ble cr0, .LBB1_4
141 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
142 ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
143 ; CHECK-NEXT: sldi r7, r4, 1
144 ; CHECK-NEXT: add r5, r3, r5
145 ; CHECK-NEXT: li r3, 0
146 ; CHECK-NEXT: add r8, r4, r7
147 ; CHECK-NEXT: sldi r9, r4, 2
148 ; CHECK-NEXT: mtctr r6
149 ; CHECK-NEXT: add r10, r4, r9
150 ; CHECK-NEXT: .p2align 4
151 ; CHECK-NEXT: .LBB1_2: # %for.body
153 ; CHECK-NEXT: ldx r6, r5, r4
154 ; CHECK-NEXT: ldx r11, r5, r7
155 ; CHECK-NEXT: ldx r12, r5, r8
156 ; CHECK-NEXT: ldx r0, r5, r9
157 ; CHECK-NEXT: mulld r6, r11, r6
158 ; CHECK-NEXT: ldx r30, r5, r10
159 ; CHECK-NEXT: addi r5, r5, 1
160 ; CHECK-NEXT: mulld r6, r6, r12
161 ; CHECK-NEXT: mulld r6, r6, r0
162 ; CHECK-NEXT: maddld r3, r6, r30, r3
163 ; CHECK-NEXT: bdnz .LBB1_2
164 ; CHECK-NEXT: # %bb.3:
165 ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
167 ; CHECK-NEXT: .LBB1_4:
168 ; CHECK-NEXT: li r3, 0
; Loop-invariant multiples of %offset: %mul = 2*offset, %mul2 = 3*offset,
; %mul4 = 4*offset, %mul6 = 5*offset.
171 %mul = shl nsw i64 %offset, 1
172 %mul2 = mul nsw i64 %offset, 3
173 %mul4 = shl nsw i64 %offset, 2
174 %mul6 = mul nsw i64 %offset, 5
; Enter the loop only when n > 0; otherwise go straight to the cleanup block.
175 %cmp58 = icmp sgt i64 %n, 0
176 br i1 %cmp58, label %for.body, label %for.cond.cleanup
178 for.cond.cleanup: ; preds = %for.body, %entry
; 0 when arriving from %entry, otherwise the sum produced by the last iteration.
179 %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
182 for.body: ; preds = %entry, %for.body
183 %sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
184 %i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
; %add = i + base1 is the part of the byte index common to all five loads.
185 %add = add i64 %i.059, %base1
; Load 1: p + base1 + offset + i
186 %add.ptr12.idx = add i64 %add, %offset
187 %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
188 %0 = load i64, ptr %add.ptr12, align 8
; Load 2: p + base1 + 2*offset + i
189 %add.ptr13.idx = add i64 %add, %mul
190 %add.ptr13 = getelementptr inbounds i8, ptr %p, i64 %add.ptr13.idx
191 %1 = load i64, ptr %add.ptr13, align 8
; Load 3: p + base1 + 3*offset + i
192 %add.ptr14.idx = add i64 %add, %mul2
193 %add.ptr14 = getelementptr inbounds i8, ptr %p, i64 %add.ptr14.idx
194 %2 = load i64, ptr %add.ptr14, align 8
; Load 4: p + base1 + 4*offset + i
195 %add.ptr15.idx = add i64 %add, %mul4
196 %add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
197 %3 = load i64, ptr %add.ptr15, align 8
; Load 5: p + base1 + 5*offset + i
198 %add.ptr16.idx = add i64 %add, %mul6
199 %add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
200 %4 = load i64, ptr %add.ptr16, align 8
; sum += load1 * load2 * load3 * load4 * load5
201 %mul17 = mul i64 %1, %0
202 %mul18 = mul i64 %mul17, %2
203 %mul19 = mul i64 %mul18, %3
204 %mul20 = mul i64 %mul19, %4
205 %add21 = add i64 %mul20, %sum.060
; Advance i by 1 and leave the loop once i == n.
206 %inc = add nuw nsw i64 %i.059, 1
207 %exitcond.not = icmp eq i64 %inc, %n
208 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
216 ; At least 4 addresses are needed to common 2 chains so that at least 1 offset is reused.
218 ; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
219 ; long long o1 = base1;
220 ; long long o2 = base1 + 2 * offset;
221 ; long long o3 = base1 + 3 * offset;
226 ; for (long long i = 0; i < n; ++i) {
227 ; unsigned long x1 = *(unsigned long *)(p1 + i);
228 ; unsigned long x2 = *(unsigned long *)(p2 + i);
229 ; unsigned long x3 = *(unsigned long *)(p3 + i);
230 ; sum += x1 * x2 * x3;
; Test case: each loop iteration loads three i64 values from
; p + base1 + k*offset + i (k = 0, 2, 3), multiplies them together and adds
; the product to a running sum; the loop runs while i < n.
; The expected loop body below uses a single pointer register (r5): one
; `ld 0(r5)` plus two `ldx` loads with separate index registers (r7 and r4).
235 define i64 @no_enough_elements_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
236 ; CHECK-LABEL: no_enough_elements_fail:
237 ; CHECK: # %bb.0: # %entry
238 ; CHECK-NEXT: cmpdi r6, 0
239 ; CHECK-NEXT: ble cr0, .LBB2_4
240 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
241 ; CHECK-NEXT: sldi r7, r4, 1
242 ; CHECK-NEXT: mtctr r6
243 ; CHECK-NEXT: add r5, r3, r5
244 ; CHECK-NEXT: li r3, 0
245 ; CHECK-NEXT: add r4, r4, r7
246 ; CHECK-NEXT: .p2align 5
247 ; CHECK-NEXT: .LBB2_2: # %for.body
249 ; CHECK-NEXT: ld r6, 0(r5)
250 ; CHECK-NEXT: ldx r8, r5, r7
251 ; CHECK-NEXT: ldx r9, r5, r4
252 ; CHECK-NEXT: addi r5, r5, 1
253 ; CHECK-NEXT: mulld r6, r8, r6
254 ; CHECK-NEXT: maddld r3, r6, r9, r3
255 ; CHECK-NEXT: bdnz .LBB2_2
256 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
258 ; CHECK-NEXT: .LBB2_4:
259 ; CHECK-NEXT: li r3, 0
; Loop-invariant multiples of %offset: %mul = 2*offset, %mul1 = 3*offset.
262 %mul = shl nsw i64 %offset, 1
263 %mul1 = mul nsw i64 %offset, 3
; Enter the loop only when n > 0; otherwise go straight to the cleanup block.
264 %cmp32 = icmp sgt i64 %n, 0
265 br i1 %cmp32, label %for.body, label %for.cond.cleanup
267 for.cond.cleanup: ; preds = %for.body, %entry
; 0 when arriving from %entry, otherwise the sum produced by the last iteration.
268 %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
271 for.body: ; preds = %entry, %for.body
272 %sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
273 %i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
; Load 1: p + base1 + i; its index (%add.ptr5.idx) is reused by the other loads.
274 %add.ptr5.idx = add i64 %i.033, %base1
275 %add.ptr5 = getelementptr inbounds i8, ptr %p, i64 %add.ptr5.idx
276 %0 = load i64, ptr %add.ptr5, align 8
; Load 2: p + base1 + 2*offset + i
277 %add.ptr6.idx = add i64 %add.ptr5.idx, %mul
278 %add.ptr6 = getelementptr inbounds i8, ptr %p, i64 %add.ptr6.idx
279 %1 = load i64, ptr %add.ptr6, align 8
; Load 3: p + base1 + 3*offset + i
280 %add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
281 %add.ptr7 = getelementptr inbounds i8, ptr %p, i64 %add.ptr7.idx
282 %2 = load i64, ptr %add.ptr7, align 8
; sum += load1 * load2 * load3
283 %mul8 = mul i64 %1, %0
284 %mul9 = mul i64 %mul8, %2
285 %add10 = add i64 %mul9, %sum.034
; Advance i by 1 and leave the loop once i == n.
286 %inc = add nuw nsw i64 %i.033, 1
287 %exitcond.not = icmp eq i64 %inc, %n
288 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
297 ; The diff between address 2 and address 1 is 2*offset, and this offset is not reused among other chains,
298 ; so we cannot common any chains.
300 ; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
301 ; long long o1 = base1;
302 ; long long o2 = base1 + 2 * offset;
303 ; long long o3 = base1 + 4 * offset;
304 ; long long o4 = base1 + 7 * offset;
310 ; for (long long i = 0; i < n; ++i) {
311 ; unsigned long x1 = *(unsigned long *)(p1 + i);
312 ; unsigned long x2 = *(unsigned long *)(p2 + i);
313 ; unsigned long x3 = *(unsigned long *)(p3 + i);
314 ; unsigned long x4 = *(unsigned long *)(p4 + i);
315 ; sum += x1 * x2 * x3 * x4;
; Test case: each loop iteration loads four i64 values from
; p + base1 + k*offset + i (k = 0, 2, 4, 7), multiplies them together and adds
; the product to a running sum; the loop runs while i < n.
; The expected loop body below uses a single pointer register (r5): one
; `ld 0(r5)` plus three `ldx` loads with separate index registers (r7, r8, r4).
320 define i64 @no_reuseable_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
321 ; CHECK-LABEL: no_reuseable_offset_fail:
322 ; CHECK: # %bb.0: # %entry
323 ; CHECK-NEXT: cmpdi r6, 0
324 ; CHECK-NEXT: ble cr0, .LBB3_4
325 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
326 ; CHECK-NEXT: sldi r9, r4, 3
327 ; CHECK-NEXT: mtctr r6
328 ; CHECK-NEXT: add r5, r3, r5
329 ; CHECK-NEXT: li r3, 0
330 ; CHECK-NEXT: sldi r7, r4, 1
331 ; CHECK-NEXT: sldi r8, r4, 2
332 ; CHECK-NEXT: sub r4, r9, r4
333 ; CHECK-NEXT: .p2align 4
334 ; CHECK-NEXT: .LBB3_2: # %for.body
336 ; CHECK-NEXT: ld r6, 0(r5)
337 ; CHECK-NEXT: ldx r9, r5, r7
338 ; CHECK-NEXT: ldx r10, r5, r8
339 ; CHECK-NEXT: ldx r11, r5, r4
340 ; CHECK-NEXT: addi r5, r5, 1
341 ; CHECK-NEXT: mulld r6, r9, r6
342 ; CHECK-NEXT: mulld r6, r6, r10
343 ; CHECK-NEXT: maddld r3, r6, r11, r3
344 ; CHECK-NEXT: bdnz .LBB3_2
345 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
347 ; CHECK-NEXT: .LBB3_4:
348 ; CHECK-NEXT: li r3, 0
; Loop-invariant multiples of %offset: %mul = 2*offset, %mul1 = 4*offset,
; %mul3 = 7*offset.
351 %mul = shl nsw i64 %offset, 1
352 %mul1 = shl nsw i64 %offset, 2
353 %mul3 = mul nsw i64 %offset, 7
; Enter the loop only when n > 0; otherwise go straight to the cleanup block.
354 %cmp44 = icmp sgt i64 %n, 0
355 br i1 %cmp44, label %for.body, label %for.cond.cleanup
357 for.cond.cleanup: ; preds = %for.body, %entry
; 0 when arriving from %entry, otherwise the sum produced by the last iteration.
358 %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
361 for.body: ; preds = %entry, %for.body
362 %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
363 %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
; Load 1: p + base1 + i; its index (%add.ptr8.idx) is reused by the other loads.
364 %add.ptr8.idx = add i64 %i.045, %base1
365 %add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
366 %0 = load i64, ptr %add.ptr8, align 8
; Load 2: p + base1 + 2*offset + i
367 %add.ptr9.idx = add i64 %add.ptr8.idx, %mul
368 %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
369 %1 = load i64, ptr %add.ptr9, align 8
; Load 3: p + base1 + 4*offset + i
370 %add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
371 %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
372 %2 = load i64, ptr %add.ptr10, align 8
; Load 4: p + base1 + 7*offset + i
373 %add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
374 %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
375 %3 = load i64, ptr %add.ptr11, align 8
; sum += load1 * load2 * load3 * load4
376 %mul12 = mul i64 %1, %0
377 %mul13 = mul i64 %mul12, %2
378 %mul14 = mul i64 %mul13, %3
379 %add15 = add i64 %mul14, %sum.046
; Advance i by 1 and leave the loop once i == n.
380 %inc = add nuw nsw i64 %i.045, 1
381 %exitcond.not = icmp eq i64 %inc, %n
382 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
393 ; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5,
394 ; but the diff between address 3 and address 2 (3*offset) is not the same as the diff between address 6
395 ; and address 5 (2*offset), so we cannot common chains for these addresses.
397 ; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
398 ; long long o1 = base1 + offset;
399 ; long long o2 = base1 + 2 * offset;
400 ; long long o3 = base1 + 5 * offset;
401 ; long long o4 = base1 + 7 * offset;
402 ; long long o5 = base1 + 8 * offset;
403 ; long long o6 = base1 + 10 * offset;
411 ; for (long long i = 0; i < n; ++i) {
412 ; unsigned long x1 = *(unsigned long *)(p1 + i);
413 ; unsigned long x2 = *(unsigned long *)(p2 + i);
414 ; unsigned long x3 = *(unsigned long *)(p3 + i);
415 ; unsigned long x4 = *(unsigned long *)(p4 + i);
416 ; unsigned long x5 = *(unsigned long *)(p5 + i);
417 ; unsigned long x6 = *(unsigned long *)(p6 + i);
418 ; sum += x1 * x2 * x3 * x4 * x5 * x6;
; Test case: each loop iteration loads six i64 values from
; p + base1 + k*offset + i (k = 1, 2, 5, 7, 8, 10), multiplies them together
; and adds the product to a running sum; the loop runs while i < n.
; The expected loop body below uses a single pointer register (r5) and six
; `ldx` loads, each with its own index register (r4, r7, r8, r10, r9, r11);
; r28, r29 and r30 are spilled before the loop and reloaded after it.
423 define i64 @not_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
424 ; CHECK-LABEL: not_same_offset_fail:
425 ; CHECK: # %bb.0: # %entry
426 ; CHECK-NEXT: cmpdi r6, 0
427 ; CHECK-NEXT: ble cr0, .LBB4_4
428 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
429 ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
430 ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
431 ; CHECK-NEXT: add r5, r3, r5
432 ; CHECK-NEXT: li r3, 0
433 ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
434 ; CHECK-NEXT: mtctr r6
435 ; CHECK-NEXT: mulli r11, r4, 10
436 ; CHECK-NEXT: sldi r8, r4, 2
437 ; CHECK-NEXT: add r8, r4, r8
438 ; CHECK-NEXT: sldi r9, r4, 3
439 ; CHECK-NEXT: sub r10, r9, r4
440 ; CHECK-NEXT: sldi r7, r4, 1
441 ; CHECK-NEXT: .p2align 4
442 ; CHECK-NEXT: .LBB4_2: # %for.body
444 ; CHECK-NEXT: ldx r6, r5, r4
445 ; CHECK-NEXT: ldx r12, r5, r7
446 ; CHECK-NEXT: ldx r0, r5, r8
447 ; CHECK-NEXT: ldx r30, r5, r10
448 ; CHECK-NEXT: mulld r6, r12, r6
449 ; CHECK-NEXT: ldx r29, r5, r9
450 ; CHECK-NEXT: ldx r28, r5, r11
451 ; CHECK-NEXT: addi r5, r5, 1
452 ; CHECK-NEXT: mulld r6, r6, r0
453 ; CHECK-NEXT: mulld r6, r6, r30
454 ; CHECK-NEXT: mulld r6, r6, r29
455 ; CHECK-NEXT: maddld r3, r6, r28, r3
456 ; CHECK-NEXT: bdnz .LBB4_2
457 ; CHECK-NEXT: # %bb.3:
458 ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
459 ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
460 ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
462 ; CHECK-NEXT: .LBB4_4:
463 ; CHECK-NEXT: li r3, 0
; Loop-invariant multiples of %offset: %mul = 2*offset, %mul2 = 5*offset,
; %mul4 = 7*offset, %mul6 = 8*offset, %mul8 = 10*offset.
466 %mul = shl nsw i64 %offset, 1
467 %mul2 = mul nsw i64 %offset, 5
468 %mul4 = mul nsw i64 %offset, 7
469 %mul6 = shl nsw i64 %offset, 3
470 %mul8 = mul nsw i64 %offset, 10
; Enter the loop only when n > 0; otherwise go straight to the cleanup block.
471 %cmp70 = icmp sgt i64 %n, 0
472 br i1 %cmp70, label %for.body, label %for.cond.cleanup
474 for.cond.cleanup: ; preds = %for.body, %entry
; 0 when arriving from %entry, otherwise the sum produced by the last iteration.
475 %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
478 for.body: ; preds = %entry, %for.body
479 %sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
480 %i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
; %add = i + base1 is the part of the byte index common to all six loads.
481 %add = add i64 %i.071, %base1
; Load 1: p + base1 + offset + i
482 %add.ptr15.idx = add i64 %add, %offset
483 %add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
484 %0 = load i64, ptr %add.ptr15, align 8
; Load 2: p + base1 + 2*offset + i
485 %add.ptr16.idx = add i64 %add, %mul
486 %add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
487 %1 = load i64, ptr %add.ptr16, align 8
; Load 3: p + base1 + 5*offset + i
488 %add.ptr17.idx = add i64 %add, %mul2
489 %add.ptr17 = getelementptr inbounds i8, ptr %p, i64 %add.ptr17.idx
490 %2 = load i64, ptr %add.ptr17, align 8
; Load 4: p + base1 + 7*offset + i
491 %add.ptr18.idx = add i64 %add, %mul4
492 %add.ptr18 = getelementptr inbounds i8, ptr %p, i64 %add.ptr18.idx
493 %3 = load i64, ptr %add.ptr18, align 8
; Load 5: p + base1 + 8*offset + i
494 %add.ptr19.idx = add i64 %add, %mul6
495 %add.ptr19 = getelementptr inbounds i8, ptr %p, i64 %add.ptr19.idx
496 %4 = load i64, ptr %add.ptr19, align 8
; Load 6: p + base1 + 10*offset + i
497 %add.ptr20.idx = add i64 %add, %mul8
498 %add.ptr20 = getelementptr inbounds i8, ptr %p, i64 %add.ptr20.idx
499 %5 = load i64, ptr %add.ptr20, align 8
; sum += load1 * load2 * load3 * load4 * load5 * load6
500 %mul21 = mul i64 %1, %0
501 %mul22 = mul i64 %mul21, %2
502 %mul23 = mul i64 %mul22, %3
503 %mul24 = mul i64 %mul23, %4
504 %mul25 = mul i64 %mul24, %5
505 %add26 = add i64 %mul25, %sum.072
; Advance i by 1 and leave the loop once i == n.
506 %inc = add nuw nsw i64 %i.071, 1
507 %exitcond.not = icmp eq i64 %inc, %n
508 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
518 ; 1: base1 + offset, offsets: (0, 2*offset)
519 ; 2: base1 + 4*offset, offsets: (0, 2*offset)
521 ; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
522 ; long long o1 = base1 + offset;
523 ; long long o2 = base1 + 3 * offset;
524 ; long long o3 = base1 + 4 * offset;
525 ; long long o4 = base1 + 6 * offset;
531 ; for (long long i = 0; i < n; ++i) {
532 ; unsigned long x1 = *(unsigned long *)(p1 + i);
533 ; unsigned long x2 = *(unsigned long *)(p2 + i);
534 ; unsigned long x3 = *(unsigned long *)(p3 + i);
535 ; unsigned long x4 = *(unsigned long *)(p4 + i);
536 ; sum += x1 * x2 * x3 * x4;
; Test case: each loop iteration loads four i64 values from
; p + base1 + k*offset + i (k = 1, 3, 4, 6), multiplies them together and adds
; the product to a running sum; the loop runs while i < n.
; The expected loop body below keeps two pointer registers (r7 and r5); each is
; read once with `ld 0(rX)` and once with `ldx rX, r4`, so the single index
; register r4 is shared by both pointer registers.
541 define i64 @two_chain_different_offsets_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
542 ; CHECK-LABEL: two_chain_different_offsets_succ:
543 ; CHECK: # %bb.0: # %entry
544 ; CHECK-NEXT: cmpdi r6, 0
545 ; CHECK-NEXT: ble cr0, .LBB5_4
546 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
547 ; CHECK-NEXT: sldi r8, r4, 2
548 ; CHECK-NEXT: add r7, r5, r4
549 ; CHECK-NEXT: mtctr r6
550 ; CHECK-NEXT: add r5, r5, r8
551 ; CHECK-NEXT: add r7, r3, r7
552 ; CHECK-NEXT: sldi r4, r4, 1
553 ; CHECK-NEXT: add r5, r3, r5
554 ; CHECK-NEXT: li r3, 0
555 ; CHECK-NEXT: .p2align 4
556 ; CHECK-NEXT: .LBB5_2: # %for.body
558 ; CHECK-NEXT: ld r6, 0(r7)
559 ; CHECK-NEXT: ldx r8, r7, r4
560 ; CHECK-NEXT: ld r9, 0(r5)
561 ; CHECK-NEXT: ldx r10, r5, r4
562 ; CHECK-NEXT: addi r7, r7, 1
563 ; CHECK-NEXT: addi r5, r5, 1
564 ; CHECK-NEXT: mulld r6, r8, r6
565 ; CHECK-NEXT: mulld r6, r6, r9
566 ; CHECK-NEXT: maddld r3, r6, r10, r3
567 ; CHECK-NEXT: bdnz .LBB5_2
568 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
570 ; CHECK-NEXT: .LBB5_4:
571 ; CHECK-NEXT: li r3, 0
; Loop-invariant multiples of %offset: %mul = 3*offset, %mul2 = 4*offset,
; %mul4 = 6*offset.
574 %mul = mul nsw i64 %offset, 3
575 %mul2 = shl nsw i64 %offset, 2
576 %mul4 = mul nsw i64 %offset, 6
; Enter the loop only when n > 0; otherwise go straight to the cleanup block.
577 %cmp46 = icmp sgt i64 %n, 0
578 br i1 %cmp46, label %for.body, label %for.cond.cleanup
580 for.cond.cleanup: ; preds = %for.body, %entry
; 0 when arriving from %entry, otherwise the sum produced by the last iteration.
581 %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
584 for.body: ; preds = %entry, %for.body
585 %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
586 %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
; %add = i + base1 is the part of the byte index common to all four loads.
587 %add = add i64 %i.047, %base1
; Load 1: p + base1 + offset + i
588 %add.ptr9.idx = add i64 %add, %offset
589 %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
590 %0 = load i64, ptr %add.ptr9, align 8
; Load 2: p + base1 + 3*offset + i
591 %add.ptr10.idx = add i64 %add, %mul
592 %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
593 %1 = load i64, ptr %add.ptr10, align 8
; Load 3: p + base1 + 4*offset + i
594 %add.ptr11.idx = add i64 %add, %mul2
595 %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
596 %2 = load i64, ptr %add.ptr11, align 8
; Load 4: p + base1 + 6*offset + i
597 %add.ptr12.idx = add i64 %add, %mul4
598 %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
599 %3 = load i64, ptr %add.ptr12, align 8
; sum += load1 * load2 * load3 * load4
600 %mul13 = mul i64 %1, %0
601 %mul14 = mul i64 %mul13, %2
602 %mul15 = mul i64 %mul14, %3
603 %add16 = add i64 %mul15, %sum.048
; Advance i by 1 and leave the loop once i == n.
604 %inc = add nuw nsw i64 %i.047, 1
605 %exitcond.not = icmp eq i64 %inc, %n
606 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
612 ; 3: + base2 - base1 - 2*offset
616 ; 1: base1 + offset, offsets: (0, 2*offset)
617 ; 2: base2 + offset, offsets: (0, 2*offset)
619 ; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
620 ; long long o1 = base1 + offset;
621 ; long long o2 = base1 + 3 * offset;
622 ; long long o3 = base2 + offset;
623 ; long long o4 = base2 + 3 * offset;
629 ; for (long long i = 0; i < n; ++i) {
630 ; unsigned long x1 = *(unsigned long *)(p1 + i);
631 ; unsigned long x2 = *(unsigned long *)(p2 + i);
632 ; unsigned long x3 = *(unsigned long *)(p3 + i);
633 ; unsigned long x4 = *(unsigned long *)(p4 + i);
634 ; sum += x1 * x2 * x3 * x4;
; Test case: each loop iteration loads four i64 values from
; p + base1 + offset + i, p + base1 + 3*offset + i, p + base2 + offset + i and
; p + base2 + 3*offset + i, multiplies them together and adds the product to a
; running sum; the loop runs while i < n.
; The expected loop body below keeps two pointer registers (r5 and r6); each is
; read once with `ld 0(rX)` and once with `ldx rX, r4`, so the single index
; register r4 is shared by both pointer registers.
639 define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
640 ; CHECK-LABEL: two_chain_two_bases_succ:
641 ; CHECK: # %bb.0: # %entry
642 ; CHECK-NEXT: cmpdi r7, 0
643 ; CHECK-NEXT: ble cr0, .LBB6_4
644 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
645 ; CHECK-NEXT: add r6, r6, r4
646 ; CHECK-NEXT: add r5, r5, r4
647 ; CHECK-NEXT: mtctr r7
648 ; CHECK-NEXT: sldi r4, r4, 1
649 ; CHECK-NEXT: add r5, r3, r5
650 ; CHECK-NEXT: add r6, r3, r6
651 ; CHECK-NEXT: li r3, 0
652 ; CHECK-NEXT: .p2align 4
653 ; CHECK-NEXT: .LBB6_2: # %for.body
655 ; CHECK-NEXT: ld r7, 0(r5)
656 ; CHECK-NEXT: ldx r8, r5, r4
657 ; CHECK-NEXT: ld r9, 0(r6)
658 ; CHECK-NEXT: ldx r10, r6, r4
659 ; CHECK-NEXT: addi r5, r5, 1
660 ; CHECK-NEXT: addi r6, r6, 1
661 ; CHECK-NEXT: mulld r7, r8, r7
662 ; CHECK-NEXT: mulld r7, r7, r9
663 ; CHECK-NEXT: maddld r3, r7, r10, r3
664 ; CHECK-NEXT: bdnz .LBB6_2
665 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
667 ; CHECK-NEXT: .LBB6_4:
668 ; CHECK-NEXT: li r3, 0
; Loop-invariant multiple of %offset: %mul = 3*offset.
671 %mul = mul nsw i64 %offset, 3
; Enter the loop only when n > 0; otherwise go straight to the cleanup block.
672 %cmp44 = icmp sgt i64 %n, 0
673 br i1 %cmp44, label %for.body, label %for.cond.cleanup
675 for.cond.cleanup: ; preds = %for.body, %entry
; 0 when arriving from %entry, otherwise the sum produced by the last iteration.
676 %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
679 for.body: ; preds = %entry, %for.body
680 %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
681 %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
; Load 1: p + base1 + offset + i
682 %add = add i64 %i.045, %base1
683 %add.ptr8.idx = add i64 %add, %offset
684 %add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
685 %0 = load i64, ptr %add.ptr8, align 8
; Load 2: p + base1 + 3*offset + i; unlike the other loads, the index here is
; built as (i + 3*offset) + base1.
686 %add1 = add i64 %i.045, %mul
687 %add.ptr9.idx = add i64 %add1, %base1
688 %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
689 %1 = load i64, ptr %add.ptr9, align 8
; Load 3: p + base2 + offset + i
690 %add2 = add i64 %i.045, %base2
691 %add.ptr10.idx = add i64 %add2, %offset
692 %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
693 %2 = load i64, ptr %add.ptr10, align 8
; Load 4: p + base2 + 3*offset + i
694 %add.ptr11.idx = add i64 %add2, %mul
695 %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
696 %3 = load i64, ptr %add.ptr11, align 8
; sum += load1 * load2 * load3 * load4
697 %mul12 = mul i64 %1, %0
698 %mul13 = mul i64 %mul12, %2
699 %mul14 = mul i64 %mul13, %3
700 %add15 = add i64 %mul14, %sum.046
; Advance i by 1 and leave the loop once i == n.
701 %inc = add nuw nsw i64 %i.045, 1
702 %exitcond.not = icmp eq i64 %inc, %n
703 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
706 ; Check chain commoning can reduce register pressure to save register spill/reload.
708 ; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
711 ; for (long long i = 0; i < 4 * m; i++) {
712 ; output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
713 ; output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
714 ; output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
720 define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
721 ; CHECK-LABEL: spill_reduce_succ:
722 ; CHECK: # %bb.0: # %entry
723 ; CHECK-NEXT: cmpdi r6, 0
724 ; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill
725 ; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill
726 ; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill
727 ; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill
728 ; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill
729 ; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill
730 ; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill
731 ; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill
732 ; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill
733 ; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill
734 ; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill
735 ; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
736 ; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
737 ; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
738 ; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
739 ; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
740 ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
741 ; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill
742 ; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill
743 ; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill
744 ; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
745 ; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
746 ; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill
747 ; CHECK-NEXT: ble cr0, .LBB7_7
748 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
749 ; CHECK-NEXT: sldi r6, r6, 2
750 ; CHECK-NEXT: li r7, 1
751 ; CHECK-NEXT: mr r30, r10
752 ; CHECK-NEXT: cmpdi r6, 1
753 ; CHECK-NEXT: iselgt r7, r6, r7
754 ; CHECK-NEXT: addi r8, r7, -1
755 ; CHECK-NEXT: clrldi r6, r7, 63
756 ; CHECK-NEXT: cmpldi r8, 3
757 ; CHECK-NEXT: blt cr0, .LBB7_4
758 ; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
759 ; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload
760 ; CHECK-NEXT: mulli r24, r30, 24
761 ; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload
762 ; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload
763 ; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload
764 ; CHECK-NEXT: rldicl r0, r7, 62, 2
765 ; CHECK-NEXT: sldi r11, r30, 5
766 ; CHECK-NEXT: sldi r19, r30, 4
767 ; CHECK-NEXT: sldi r7, r14, 3
768 ; CHECK-NEXT: add r14, r30, r14
769 ; CHECK-NEXT: sldi r10, r16, 3
770 ; CHECK-NEXT: sldi r12, r15, 3
771 ; CHECK-NEXT: add r16, r30, r16
772 ; CHECK-NEXT: add r15, r30, r15
773 ; CHECK-NEXT: add r27, r11, r7
774 ; CHECK-NEXT: add r22, r24, r7
775 ; CHECK-NEXT: add r17, r19, r7
776 ; CHECK-NEXT: sldi r2, r14, 3
777 ; CHECK-NEXT: add r26, r24, r10
778 ; CHECK-NEXT: add r25, r24, r12
779 ; CHECK-NEXT: add r21, r19, r10
780 ; CHECK-NEXT: add r20, r19, r12
781 ; CHECK-NEXT: add r8, r11, r10
782 ; CHECK-NEXT: sldi r16, r16, 3
783 ; CHECK-NEXT: add r29, r5, r27
784 ; CHECK-NEXT: add r28, r4, r27
785 ; CHECK-NEXT: add r27, r3, r27
786 ; CHECK-NEXT: add r24, r5, r22
787 ; CHECK-NEXT: add r23, r4, r22
788 ; CHECK-NEXT: add r22, r3, r22
789 ; CHECK-NEXT: add r19, r5, r17
790 ; CHECK-NEXT: add r18, r4, r17
791 ; CHECK-NEXT: add r17, r3, r17
792 ; CHECK-NEXT: add r14, r5, r2
793 ; CHECK-NEXT: add r31, r4, r2
794 ; CHECK-NEXT: add r2, r3, r2
795 ; CHECK-NEXT: add r9, r5, r8
796 ; CHECK-NEXT: add r8, r11, r12
797 ; CHECK-NEXT: add r26, r5, r26
798 ; CHECK-NEXT: add r25, r5, r25
799 ; CHECK-NEXT: add r21, r5, r21
800 ; CHECK-NEXT: add r20, r5, r20
801 ; CHECK-NEXT: add r16, r5, r16
802 ; CHECK-NEXT: add r8, r5, r8
803 ; CHECK-NEXT: rldicl r3, r0, 2, 1
804 ; CHECK-NEXT: addi r3, r3, -4
805 ; CHECK-NEXT: sub r0, r12, r7
806 ; CHECK-NEXT: sub r12, r10, r7
807 ; CHECK-NEXT: li r7, 0
808 ; CHECK-NEXT: mr r10, r30
809 ; CHECK-NEXT: sldi r15, r15, 3
810 ; CHECK-NEXT: add r15, r5, r15
811 ; CHECK-NEXT: rldicl r3, r3, 62, 2
812 ; CHECK-NEXT: addi r3, r3, 1
813 ; CHECK-NEXT: mtctr r3
814 ; CHECK-NEXT: .p2align 4
815 ; CHECK-NEXT: .LBB7_3: # %for.body
817 ; CHECK-NEXT: lfd f0, 0(r2)
818 ; CHECK-NEXT: lfd f1, 0(r31)
819 ; CHECK-NEXT: add r3, r10, r30
820 ; CHECK-NEXT: add r3, r3, r30
821 ; CHECK-NEXT: xsmuldp f0, f0, f1
822 ; CHECK-NEXT: lfd f1, 0(r14)
823 ; CHECK-NEXT: add r3, r3, r30
824 ; CHECK-NEXT: add r10, r3, r30
825 ; CHECK-NEXT: xsadddp f0, f1, f0
826 ; CHECK-NEXT: stfd f0, 0(r14)
827 ; CHECK-NEXT: add r14, r14, r11
828 ; CHECK-NEXT: lfdx f0, r2, r0
829 ; CHECK-NEXT: lfdx f1, r31, r0
830 ; CHECK-NEXT: xsmuldp f0, f0, f1
831 ; CHECK-NEXT: lfdx f1, r15, r7
832 ; CHECK-NEXT: xsadddp f0, f1, f0
833 ; CHECK-NEXT: stfdx f0, r15, r7
834 ; CHECK-NEXT: lfdx f0, r2, r12
835 ; CHECK-NEXT: lfdx f1, r31, r12
836 ; CHECK-NEXT: add r2, r2, r11
837 ; CHECK-NEXT: add r31, r31, r11
838 ; CHECK-NEXT: xsmuldp f0, f0, f1
839 ; CHECK-NEXT: lfdx f1, r16, r7
840 ; CHECK-NEXT: xsadddp f0, f1, f0
841 ; CHECK-NEXT: stfdx f0, r16, r7
842 ; CHECK-NEXT: lfd f0, 0(r17)
843 ; CHECK-NEXT: lfd f1, 0(r18)
844 ; CHECK-NEXT: xsmuldp f0, f0, f1
845 ; CHECK-NEXT: lfdx f1, r19, r7
846 ; CHECK-NEXT: xsadddp f0, f1, f0
847 ; CHECK-NEXT: stfdx f0, r19, r7
848 ; CHECK-NEXT: lfdx f0, r17, r0
849 ; CHECK-NEXT: lfdx f1, r18, r0
850 ; CHECK-NEXT: xsmuldp f0, f0, f1
851 ; CHECK-NEXT: lfdx f1, r20, r7
852 ; CHECK-NEXT: xsadddp f0, f1, f0
853 ; CHECK-NEXT: stfdx f0, r20, r7
854 ; CHECK-NEXT: lfdx f0, r17, r12
855 ; CHECK-NEXT: lfdx f1, r18, r12
856 ; CHECK-NEXT: add r17, r17, r11
857 ; CHECK-NEXT: add r18, r18, r11
858 ; CHECK-NEXT: xsmuldp f0, f0, f1
859 ; CHECK-NEXT: lfdx f1, r21, r7
860 ; CHECK-NEXT: xsadddp f0, f1, f0
861 ; CHECK-NEXT: stfdx f0, r21, r7
862 ; CHECK-NEXT: lfd f0, 0(r22)
863 ; CHECK-NEXT: lfd f1, 0(r23)
864 ; CHECK-NEXT: xsmuldp f0, f0, f1
865 ; CHECK-NEXT: lfdx f1, r24, r7
866 ; CHECK-NEXT: xsadddp f0, f1, f0
867 ; CHECK-NEXT: stfdx f0, r24, r7
868 ; CHECK-NEXT: lfdx f0, r22, r0
869 ; CHECK-NEXT: lfdx f1, r23, r0
870 ; CHECK-NEXT: xsmuldp f0, f0, f1
871 ; CHECK-NEXT: lfdx f1, r25, r7
872 ; CHECK-NEXT: xsadddp f0, f1, f0
873 ; CHECK-NEXT: stfdx f0, r25, r7
874 ; CHECK-NEXT: lfdx f0, r22, r12
875 ; CHECK-NEXT: lfdx f1, r23, r12
876 ; CHECK-NEXT: add r22, r22, r11
877 ; CHECK-NEXT: add r23, r23, r11
878 ; CHECK-NEXT: xsmuldp f0, f0, f1
879 ; CHECK-NEXT: lfdx f1, r26, r7
880 ; CHECK-NEXT: xsadddp f0, f1, f0
881 ; CHECK-NEXT: stfdx f0, r26, r7
882 ; CHECK-NEXT: lfd f0, 0(r27)
883 ; CHECK-NEXT: lfd f1, 0(r28)
884 ; CHECK-NEXT: xsmuldp f0, f0, f1
885 ; CHECK-NEXT: lfdx f1, r29, r7
886 ; CHECK-NEXT: xsadddp f0, f1, f0
887 ; CHECK-NEXT: stfdx f0, r29, r7
888 ; CHECK-NEXT: lfdx f0, r27, r0
889 ; CHECK-NEXT: lfdx f1, r28, r0
890 ; CHECK-NEXT: xsmuldp f0, f0, f1
891 ; CHECK-NEXT: lfdx f1, r8, r7
892 ; CHECK-NEXT: xsadddp f0, f1, f0
893 ; CHECK-NEXT: stfdx f0, r8, r7
894 ; CHECK-NEXT: lfdx f0, r27, r12
895 ; CHECK-NEXT: lfdx f1, r28, r12
896 ; CHECK-NEXT: add r27, r27, r11
897 ; CHECK-NEXT: add r28, r28, r11
898 ; CHECK-NEXT: xsmuldp f0, f0, f1
899 ; CHECK-NEXT: lfdx f1, r9, r7
900 ; CHECK-NEXT: xsadddp f0, f1, f0
901 ; CHECK-NEXT: stfdx f0, r9, r7
902 ; CHECK-NEXT: add r7, r7, r11
903 ; CHECK-NEXT: bdnz .LBB7_3
904 ; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
905 ; CHECK-NEXT: cmpldi r6, 0
906 ; CHECK-NEXT: beq cr0, .LBB7_7
907 ; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
908 ; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload
909 ; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload
910 ; CHECK-NEXT: sldi r8, r30, 3
911 ; CHECK-NEXT: add r3, r10, r3
912 ; CHECK-NEXT: sldi r3, r3, 3
913 ; CHECK-NEXT: add r7, r5, r3
914 ; CHECK-NEXT: add r9, r4, r3
915 ; CHECK-NEXT: add r11, r0, r3
916 ; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
917 ; CHECK-NEXT: add r3, r10, r3
918 ; CHECK-NEXT: sldi r3, r3, 3
919 ; CHECK-NEXT: add r12, r5, r3
920 ; CHECK-NEXT: add r30, r4, r3
921 ; CHECK-NEXT: add r29, r0, r3
922 ; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload
923 ; CHECK-NEXT: add r3, r10, r3
924 ; CHECK-NEXT: li r10, 0
925 ; CHECK-NEXT: sldi r3, r3, 3
926 ; CHECK-NEXT: add r5, r5, r3
927 ; CHECK-NEXT: add r4, r4, r3
928 ; CHECK-NEXT: add r3, r0, r3
929 ; CHECK-NEXT: .p2align 4
930 ; CHECK-NEXT: .LBB7_6: # %for.body.epil
932 ; CHECK-NEXT: lfdx f0, r3, r10
933 ; CHECK-NEXT: lfdx f1, r4, r10
934 ; CHECK-NEXT: addi r6, r6, -1
935 ; CHECK-NEXT: cmpldi r6, 0
936 ; CHECK-NEXT: xsmuldp f0, f0, f1
937 ; CHECK-NEXT: lfd f1, 0(r5)
938 ; CHECK-NEXT: xsadddp f0, f1, f0
939 ; CHECK-NEXT: stfd f0, 0(r5)
940 ; CHECK-NEXT: add r5, r5, r8
941 ; CHECK-NEXT: lfdx f0, r29, r10
942 ; CHECK-NEXT: lfdx f1, r30, r10
943 ; CHECK-NEXT: xsmuldp f0, f0, f1
944 ; CHECK-NEXT: lfdx f1, r12, r10
945 ; CHECK-NEXT: xsadddp f0, f1, f0
946 ; CHECK-NEXT: stfdx f0, r12, r10
947 ; CHECK-NEXT: lfdx f0, r11, r10
948 ; CHECK-NEXT: lfdx f1, r9, r10
949 ; CHECK-NEXT: xsmuldp f0, f0, f1
950 ; CHECK-NEXT: lfdx f1, r7, r10
951 ; CHECK-NEXT: xsadddp f0, f1, f0
952 ; CHECK-NEXT: stfdx f0, r7, r10
953 ; CHECK-NEXT: add r10, r10, r8
954 ; CHECK-NEXT: bne cr0, .LBB7_6
955 ; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
956 ; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
957 ; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload
958 ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
959 ; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
960 ; CHECK-NEXT: li r3, 0
961 ; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
962 ; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
963 ; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
964 ; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload
965 ; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload
966 ; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload
967 ; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload
968 ; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload
969 ; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload
970 ; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload
971 ; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload
972 ; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload
973 ; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload
974 ; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload
975 ; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload
978 %cmp49 = icmp sgt i64 %m, 0
979 br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup
981 for.body.preheader: ; preds = %entry
983 %smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
984 %1 = add nsw i64 %smax52, -1
985 %xtraiter = and i64 %smax52, 1
986 %2 = icmp ult i64 %1, 3
987 br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
989 for.body.preheader.new: ; preds = %for.body.preheader
990 %unroll_iter = and i64 %smax52, 9223372036854775804
993 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
994 %inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
995 %lcmp.mod.not = icmp eq i64 %xtraiter, 0
996 br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
998 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
999 %inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1000 %epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1001 %add.epil = add nsw i64 %inc.addr.050.epil, %inc1
1002 %arrayidx.epil = getelementptr inbounds double, ptr %input1, i64 %add.epil
1003 %3 = load double, ptr %arrayidx.epil, align 8
1004 %arrayidx2.epil = getelementptr inbounds double, ptr %input2, i64 %add.epil
1005 %4 = load double, ptr %arrayidx2.epil, align 8
1006 %mul3.epil = fmul double %3, %4
1007 %arrayidx5.epil = getelementptr inbounds double, ptr %output, i64 %add.epil
1008 %5 = load double, ptr %arrayidx5.epil, align 8
1009 %add6.epil = fadd double %5, %mul3.epil
1010 store double %add6.epil, ptr %arrayidx5.epil, align 8
1011 %add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
1012 %arrayidx8.epil = getelementptr inbounds double, ptr %input1, i64 %add7.epil
1013 %6 = load double, ptr %arrayidx8.epil, align 8
1014 %arrayidx10.epil = getelementptr inbounds double, ptr %input2, i64 %add7.epil
1015 %7 = load double, ptr %arrayidx10.epil, align 8
1016 %mul11.epil = fmul double %6, %7
1017 %arrayidx13.epil = getelementptr inbounds double, ptr %output, i64 %add7.epil
1018 %8 = load double, ptr %arrayidx13.epil, align 8
1019 %add14.epil = fadd double %8, %mul11.epil
1020 store double %add14.epil, ptr %arrayidx13.epil, align 8
1021 %add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
1022 %arrayidx16.epil = getelementptr inbounds double, ptr %input1, i64 %add15.epil
1023 %9 = load double, ptr %arrayidx16.epil, align 8
1024 %arrayidx18.epil = getelementptr inbounds double, ptr %input2, i64 %add15.epil
1025 %10 = load double, ptr %arrayidx18.epil, align 8
1026 %mul19.epil = fmul double %9, %10
1027 %arrayidx21.epil = getelementptr inbounds double, ptr %output, i64 %add15.epil
1028 %11 = load double, ptr %arrayidx21.epil, align 8
1029 %add22.epil = fadd double %11, %mul19.epil
1030 store double %add22.epil, ptr %arrayidx21.epil, align 8
1031 %add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
1032 %epil.iter.sub = add nsw i64 %epil.iter, -1
1033 %epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
1034 br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil
1036 for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
1039 for.body: ; preds = %for.body, %for.body.preheader.new
1040 %inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
1041 %niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1042 %add = add nsw i64 %inc.addr.050, %inc1
1043 %arrayidx = getelementptr inbounds double, ptr %input1, i64 %add
1044 %12 = load double, ptr %arrayidx, align 8
1045 %arrayidx2 = getelementptr inbounds double, ptr %input2, i64 %add
1046 %13 = load double, ptr %arrayidx2, align 8
1047 %mul3 = fmul double %12, %13
1048 %arrayidx5 = getelementptr inbounds double, ptr %output, i64 %add
1049 %14 = load double, ptr %arrayidx5, align 8
1050 %add6 = fadd double %14, %mul3
1051 store double %add6, ptr %arrayidx5, align 8
1052 %add7 = add nsw i64 %inc.addr.050, %inc2
1053 %arrayidx8 = getelementptr inbounds double, ptr %input1, i64 %add7
1054 %15 = load double, ptr %arrayidx8, align 8
1055 %arrayidx10 = getelementptr inbounds double, ptr %input2, i64 %add7
1056 %16 = load double, ptr %arrayidx10, align 8
1057 %mul11 = fmul double %15, %16
1058 %arrayidx13 = getelementptr inbounds double, ptr %output, i64 %add7
1059 %17 = load double, ptr %arrayidx13, align 8
1060 %add14 = fadd double %17, %mul11
1061 store double %add14, ptr %arrayidx13, align 8
1062 %add15 = add nsw i64 %inc.addr.050, %inc3
1063 %arrayidx16 = getelementptr inbounds double, ptr %input1, i64 %add15
1064 %18 = load double, ptr %arrayidx16, align 8
1065 %arrayidx18 = getelementptr inbounds double, ptr %input2, i64 %add15
1066 %19 = load double, ptr %arrayidx18, align 8
1067 %mul19 = fmul double %18, %19
1068 %arrayidx21 = getelementptr inbounds double, ptr %output, i64 %add15
1069 %20 = load double, ptr %arrayidx21, align 8
1070 %add22 = fadd double %20, %mul19
1071 store double %add22, ptr %arrayidx21, align 8
1072 %add23 = add nsw i64 %inc.addr.050, %inc4
1073 %add.1 = add nsw i64 %add23, %inc1
1074 %arrayidx.1 = getelementptr inbounds double, ptr %input1, i64 %add.1
1075 %21 = load double, ptr %arrayidx.1, align 8
1076 %arrayidx2.1 = getelementptr inbounds double, ptr %input2, i64 %add.1
1077 %22 = load double, ptr %arrayidx2.1, align 8
1078 %mul3.1 = fmul double %21, %22
1079 %arrayidx5.1 = getelementptr inbounds double, ptr %output, i64 %add.1
1080 %23 = load double, ptr %arrayidx5.1, align 8
1081 %add6.1 = fadd double %23, %mul3.1
1082 store double %add6.1, ptr %arrayidx5.1, align 8
1083 %add7.1 = add nsw i64 %add23, %inc2
1084 %arrayidx8.1 = getelementptr inbounds double, ptr %input1, i64 %add7.1
1085 %24 = load double, ptr %arrayidx8.1, align 8
1086 %arrayidx10.1 = getelementptr inbounds double, ptr %input2, i64 %add7.1
1087 %25 = load double, ptr %arrayidx10.1, align 8
1088 %mul11.1 = fmul double %24, %25
1089 %arrayidx13.1 = getelementptr inbounds double, ptr %output, i64 %add7.1
1090 %26 = load double, ptr %arrayidx13.1, align 8
1091 %add14.1 = fadd double %26, %mul11.1
1092 store double %add14.1, ptr %arrayidx13.1, align 8
1093 %add15.1 = add nsw i64 %add23, %inc3
1094 %arrayidx16.1 = getelementptr inbounds double, ptr %input1, i64 %add15.1
1095 %27 = load double, ptr %arrayidx16.1, align 8
1096 %arrayidx18.1 = getelementptr inbounds double, ptr %input2, i64 %add15.1
1097 %28 = load double, ptr %arrayidx18.1, align 8
1098 %mul19.1 = fmul double %27, %28
1099 %arrayidx21.1 = getelementptr inbounds double, ptr %output, i64 %add15.1
1100 %29 = load double, ptr %arrayidx21.1, align 8
1101 %add22.1 = fadd double %29, %mul19.1
1102 store double %add22.1, ptr %arrayidx21.1, align 8
1103 %add23.1 = add nsw i64 %add23, %inc4
1104 %add.2 = add nsw i64 %add23.1, %inc1
1105 %arrayidx.2 = getelementptr inbounds double, ptr %input1, i64 %add.2
1106 %30 = load double, ptr %arrayidx.2, align 8
1107 %arrayidx2.2 = getelementptr inbounds double, ptr %input2, i64 %add.2
1108 %31 = load double, ptr %arrayidx2.2, align 8
1109 %mul3.2 = fmul double %30, %31
1110 %arrayidx5.2 = getelementptr inbounds double, ptr %output, i64 %add.2
1111 %32 = load double, ptr %arrayidx5.2, align 8
1112 %add6.2 = fadd double %32, %mul3.2
1113 store double %add6.2, ptr %arrayidx5.2, align 8
1114 %add7.2 = add nsw i64 %add23.1, %inc2
1115 %arrayidx8.2 = getelementptr inbounds double, ptr %input1, i64 %add7.2
1116 %33 = load double, ptr %arrayidx8.2, align 8
1117 %arrayidx10.2 = getelementptr inbounds double, ptr %input2, i64 %add7.2
1118 %34 = load double, ptr %arrayidx10.2, align 8
1119 %mul11.2 = fmul double %33, %34
1120 %arrayidx13.2 = getelementptr inbounds double, ptr %output, i64 %add7.2
1121 %35 = load double, ptr %arrayidx13.2, align 8
1122 %add14.2 = fadd double %35, %mul11.2
1123 store double %add14.2, ptr %arrayidx13.2, align 8
1124 %add15.2 = add nsw i64 %add23.1, %inc3
1125 %arrayidx16.2 = getelementptr inbounds double, ptr %input1, i64 %add15.2
1126 %36 = load double, ptr %arrayidx16.2, align 8
1127 %arrayidx18.2 = getelementptr inbounds double, ptr %input2, i64 %add15.2
1128 %37 = load double, ptr %arrayidx18.2, align 8
1129 %mul19.2 = fmul double %36, %37
1130 %arrayidx21.2 = getelementptr inbounds double, ptr %output, i64 %add15.2
1131 %38 = load double, ptr %arrayidx21.2, align 8
1132 %add22.2 = fadd double %38, %mul19.2
1133 store double %add22.2, ptr %arrayidx21.2, align 8
1134 %add23.2 = add nsw i64 %add23.1, %inc4
1135 %add.3 = add nsw i64 %add23.2, %inc1
1136 %arrayidx.3 = getelementptr inbounds double, ptr %input1, i64 %add.3
1137 %39 = load double, ptr %arrayidx.3, align 8
1138 %arrayidx2.3 = getelementptr inbounds double, ptr %input2, i64 %add.3
1139 %40 = load double, ptr %arrayidx2.3, align 8
1140 %mul3.3 = fmul double %39, %40
1141 %arrayidx5.3 = getelementptr inbounds double, ptr %output, i64 %add.3
1142 %41 = load double, ptr %arrayidx5.3, align 8
1143 %add6.3 = fadd double %41, %mul3.3
1144 store double %add6.3, ptr %arrayidx5.3, align 8
1145 %add7.3 = add nsw i64 %add23.2, %inc2
1146 %arrayidx8.3 = getelementptr inbounds double, ptr %input1, i64 %add7.3
1147 %42 = load double, ptr %arrayidx8.3, align 8
1148 %arrayidx10.3 = getelementptr inbounds double, ptr %input2, i64 %add7.3
1149 %43 = load double, ptr %arrayidx10.3, align 8
1150 %mul11.3 = fmul double %42, %43
1151 %arrayidx13.3 = getelementptr inbounds double, ptr %output, i64 %add7.3
1152 %44 = load double, ptr %arrayidx13.3, align 8
1153 %add14.3 = fadd double %44, %mul11.3
1154 store double %add14.3, ptr %arrayidx13.3, align 8
1155 %add15.3 = add nsw i64 %add23.2, %inc3
1156 %arrayidx16.3 = getelementptr inbounds double, ptr %input1, i64 %add15.3
1157 %45 = load double, ptr %arrayidx16.3, align 8
1158 %arrayidx18.3 = getelementptr inbounds double, ptr %input2, i64 %add15.3
1159 %46 = load double, ptr %arrayidx18.3, align 8
1160 %mul19.3 = fmul double %45, %46
1161 %arrayidx21.3 = getelementptr inbounds double, ptr %output, i64 %add15.3
1162 %47 = load double, ptr %arrayidx21.3, align 8
1163 %add22.3 = fadd double %47, %mul19.3
1164 store double %add22.3, ptr %arrayidx21.3, align 8
1165 %add23.3 = add nsw i64 %add23.2, %inc4
1166 %niter.nsub.3 = add i64 %niter, -4
1167 %niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
1168 br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1171 declare i64 @llvm.smax.i64(i64, i64) ; signed-maximum intrinsic; called in for.body.preheader as smax(%0, 1) to produce %smax52, from which %xtraiter and %unroll_iter are derived