; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -basicaa -early-cse-memssa | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes=early-cse-memssa | FileCheck %s
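; EarlyCSE models the NEON structured load/store intrinsics (ld2/ld3, st2/st3)
; as target memory intrinsics through TargetTransformInfo, which is what lets
; it treat them like ordinary loads and stores in the tests below. The RUN
; lines cover the legacy and new pass managers, each with and without
; MemorySSA.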
define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
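; The st2 writes both vectors of the pair to %a, and the ld2 reads the same
; two vectors back from %a with no intervening instruction that may write
; memory, so EarlyCSE can forward the stored values and delete the load.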
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}
define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, i8* %0)
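; The two st2 calls below write to the same address back to back, and the
; second fully overwrites the first, so EarlyCSE deletes the earlier, dead
; store and keeps only the later one.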
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}
define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
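; The two ld2 calls below load from the same pointer with no intervening
; store, so the second load is redundant and its results are replaced by the
; results of the first.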
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
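; The plain store to %b between the st2 and the ld2 may alias %a, so the
; values written by the st2 cannot safely be forwarded to the ld2 and the
; load must remain.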
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
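; The st2 writes two interleaved vectors but the ld3 reads three, so the load
; is not fully covered by the store and cannot be replaced with the stored
; values.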
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
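; The st3 and the st2 write different amounts of interleaved data to %a, so
; the later st2 does not fully overwrite the st3 and neither store can be
; removed as dead.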
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}
; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}