; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \
; RUN:          -misched-topdown -verify-machineinstrs \
; RUN:     | FileCheck %s -check-prefix=TOPDOWN
; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \
; RUN:          -misched=ilpmin -verify-machineinstrs \
; RUN:     | FileCheck %s -check-prefix=ILPMIN
; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic -pre-RA-sched=source -enable-misched \
; RUN:          -misched=ilpmax -verify-machineinstrs \
; RUN:     | FileCheck %s -check-prefix=ILPMAX
;
; Verify that the MI scheduler minimizes register pressure for a
; uniform set of bottom-up subtrees (unrolled matrix multiply).
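;
; As a reader's aid, a hypothetical C source for this kernel: one row of a
; 4x4 integer matrix multiply per outer-loop iteration, with the j and k
; loops fully unrolled. The names below are illustrative, not part of the
; test:
;
;   void mmult(int m1[4][4], int m2[4][4], int m3[4][4]) {
;     for (int i = 0; i < 4; i++)
;       for (int j = 0; j < 4; j++) {     /* unrolled below */
;         int sum = 0;
;         for (int k = 0; k < 4; k++)     /* unrolled below */
;           sum += m1[i][k] * m2[k][j];
;         m3[i][j] = sum;
;       }
;   }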
;
; For the current top-down heuristics, ensure that some folded imulls have
; been reordered with the stores. This tests the scheduler's cheap alias
; analysis, which does not require an AliasAnalysis pass: because %m1, %m2,
; and %m3 are noalias arguments, the loads feeding the imulls cannot alias
; the stores to %m3, so the reordering is provably safe.
;
; TOPDOWN-LABEL: %for.body
; TOPDOWN: movl %{{.*}}, (
; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 4(
; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 8(
; TOPDOWN: movl %{{.*}}, 12(
; TOPDOWN-LABEL: %for.end
;
; For -misched=ilpmin, verify that each expression subtree is
; scheduled independently, and that the imulls and adds are interleaved
; (each addl issues soon after the imulls feeding it, which keeps fewer
; products live at once).
; ILPMIN-LABEL: %for.body
; ILPMIN: movl %{{.*}}, (
; ILPMIN: imull
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: movl %{{.*}}, 4(
; ILPMIN: imull
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: movl %{{.*}}, 8(
; ILPMIN: imull
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: movl %{{.*}}, 12(
; ILPMIN-LABEL: %for.end
;
; For -misched=ilpmax, verify that each expression subtree is
; scheduled independently, and that the imulls and adds are clustered
; (all of a subtree's imulls issue before its addls, exposing maximum ILP).
; ILPMAX-LABEL: %for.body
; ILPMAX: movl %{{.*}}, (
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: movl %{{.*}}, 4(
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: movl %{{.*}}, 8(
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: imull
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: movl %{{.*}}, 12(
; ILPMAX-LABEL: %for.end
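;
; Each stored element is a four-term dot product; e.g. for column 0 of row i
; (in the hypothetical C notation above):
;
;   m3[i][0] = m1[i][0]*m2[0][0] + m1[i][1]*m2[1][0]
;            + m1[i][2]*m2[2][0] + m1[i][3]*m2[3][0];
;
; This (4 x imull, 3 x addl) expression is the per-element subtree that the
; check prefixes above expect to see scheduled differently.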

define void @mmult(ptr noalias nocapture %m1, ptr noalias nocapture %m2,
                   ptr noalias nocapture %m3) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx8 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 0
  %tmp = load i32, ptr %arrayidx8, align 4
  %tmp1 = load i32, ptr %m2, align 4
  %arrayidx8.1 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 1
  %tmp2 = load i32, ptr %arrayidx8.1, align 4
  %arrayidx12.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 0
  %tmp3 = load i32, ptr %arrayidx12.1, align 4
  %arrayidx8.2 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 2
  %tmp4 = load i32, ptr %arrayidx8.2, align 4
  %arrayidx12.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 0
  %tmp5 = load i32, ptr %arrayidx12.2, align 4
  %arrayidx8.3 = getelementptr inbounds [4 x i32], ptr %m1, i64 %indvars.iv, i64 3
  %tmp6 = load i32, ptr %arrayidx8.3, align 4
  %arrayidx12.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 0
  %tmp8 = load i32, ptr %arrayidx8, align 4
  %arrayidx12.137 = getelementptr inbounds [4 x i32], ptr %m2, i64 0, i64 1
  %tmp9 = load i32, ptr %arrayidx12.137, align 4
  %tmp10 = load i32, ptr %arrayidx8.1, align 4
  %arrayidx12.1.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 1
  %tmp11 = load i32, ptr %arrayidx12.1.1, align 4
  %tmp12 = load i32, ptr %arrayidx8.2, align 4
  %arrayidx12.2.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 1
  %tmp13 = load i32, ptr %arrayidx12.2.1, align 4
  %tmp14 = load i32, ptr %arrayidx8.3, align 4
  %arrayidx12.3.1 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 1
  %tmp15 = load i32, ptr %arrayidx12.3.1, align 4
  %tmp16 = load i32, ptr %arrayidx8, align 4
  %arrayidx12.239 = getelementptr inbounds [4 x i32], ptr %m2, i64 0, i64 2
  %tmp17 = load i32, ptr %arrayidx12.239, align 4
  %tmp18 = load i32, ptr %arrayidx8.1, align 4
  %arrayidx12.1.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 2
  %tmp19 = load i32, ptr %arrayidx12.1.2, align 4
  %tmp20 = load i32, ptr %arrayidx8.2, align 4
  %arrayidx12.2.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 2
  %tmp21 = load i32, ptr %arrayidx12.2.2, align 4
  %tmp22 = load i32, ptr %arrayidx8.3, align 4
  %arrayidx12.3.2 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 2
  %tmp23 = load i32, ptr %arrayidx12.3.2, align 4
  %tmp24 = load i32, ptr %arrayidx8, align 4
  %arrayidx12.341 = getelementptr inbounds [4 x i32], ptr %m2, i64 0, i64 3
  %tmp25 = load i32, ptr %arrayidx12.341, align 4
  %tmp26 = load i32, ptr %arrayidx8.1, align 4
  %arrayidx12.1.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 1, i64 3
  %tmp27 = load i32, ptr %arrayidx12.1.3, align 4
  %tmp28 = load i32, ptr %arrayidx8.2, align 4
  %arrayidx12.2.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 2, i64 3
  %tmp29 = load i32, ptr %arrayidx12.2.3, align 4
  %tmp30 = load i32, ptr %arrayidx8.3, align 4
  %arrayidx12.3.3 = getelementptr inbounds [4 x i32], ptr %m2, i64 3, i64 3
  %tmp31 = load i32, ptr %arrayidx12.3.3, align 4
  %tmp7 = load i32, ptr %arrayidx12.3, align 4
  %mul = mul nsw i32 %tmp1, %tmp
  %mul.1 = mul nsw i32 %tmp3, %tmp2
  %mul.2 = mul nsw i32 %tmp5, %tmp4
  %mul.3 = mul nsw i32 %tmp7, %tmp6
  %mul.138 = mul nsw i32 %tmp9, %tmp8
  %mul.1.1 = mul nsw i32 %tmp11, %tmp10
  %mul.2.1 = mul nsw i32 %tmp13, %tmp12
  %mul.3.1 = mul nsw i32 %tmp15, %tmp14
  %mul.240 = mul nsw i32 %tmp17, %tmp16
  %mul.1.2 = mul nsw i32 %tmp19, %tmp18
  %mul.2.2 = mul nsw i32 %tmp21, %tmp20
  %mul.3.2 = mul nsw i32 %tmp23, %tmp22
  %mul.342 = mul nsw i32 %tmp25, %tmp24
  %mul.1.3 = mul nsw i32 %tmp27, %tmp26
  %mul.2.3 = mul nsw i32 %tmp29, %tmp28
  %mul.3.3 = mul nsw i32 %tmp31, %tmp30
  %add.1 = add nsw i32 %mul.1, %mul
  %add.2 = add nsw i32 %mul.2, %add.1
  %add.3 = add nsw i32 %mul.3, %add.2
  %add.1.1 = add nsw i32 %mul.1.1, %mul.138
  %add.2.1 = add nsw i32 %mul.2.1, %add.1.1
  %add.3.1 = add nsw i32 %mul.3.1, %add.2.1
  %add.1.2 = add nsw i32 %mul.1.2, %mul.240
  %add.2.2 = add nsw i32 %mul.2.2, %add.1.2
  %add.3.2 = add nsw i32 %mul.3.2, %add.2.2
  %add.1.3 = add nsw i32 %mul.1.3, %mul.342
  %add.2.3 = add nsw i32 %mul.2.3, %add.1.3
  %add.3.3 = add nsw i32 %mul.3.3, %add.2.3
  %arrayidx16 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 0
  store i32 %add.3, ptr %arrayidx16, align 4
  %arrayidx16.1 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 1
  store i32 %add.3.1, ptr %arrayidx16.1, align 4
  %arrayidx16.2 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 2
  store i32 %add.3.2, ptr %arrayidx16.2, align 4
  %arrayidx16.3 = getelementptr inbounds [4 x i32], ptr %m3, i64 %indvars.iv, i64 3
  store i32 %add.3.3, ptr %arrayidx16.3, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body