; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse -earlycse-debug-hash | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse<memssa>' | FileCheck %s
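
; These tests cover EarlyCSE's handling of the AArch64 structured memory
; intrinsics (st2/ld2/ld3), which it models via the target's TTI
; memory-intrinsic hooks: an ld2 that rereads memory just written by a
; matching st2 can be replaced with the stored values, and a store that is
; fully overwritten by a later matching store becomes dead.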
define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
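; The ld2 below rereads the two <4 x i32> values the st2 above just wrote to
; %a, so EarlyCSE can forward %2 and %3 and drop the load.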
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %3, ptr %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, ptr %a)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
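; The first st2 below is overwritten by the second st2 to the same address
; before anything can read %a, so EarlyCSE can delete it as a dead store.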
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %2, ptr %a)
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the second @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
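; Two identical ld2 loads from %a with no intervening store: EarlyCSE keeps
; the first and reuses its results for the second.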
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse(ptr %a, ptr %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
  store i32 0, ptr %b, align 4
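; Nothing marks %a and %b as noalias, so the plain store above may clobber
; what the st2 wrote; the ld2 below therefore cannot be forwarded from it.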
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to the mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %3, ptr %a)
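; The st2 writes two vectors while the ld3 reads three, so the load is not a
; reread of the stored values and EarlyCSE must leave it alone.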
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; the mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %1 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %2 = bitcast <16 x i8> %0 to <4 x i32>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
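; The st3 writes three vectors and the later st2 only two, so the st2 does
; not fully overwrite it and the st3 cannot be removed as a dead store.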
  call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %3, <4 x i32> %2, <4 x i32> %2, ptr %a)
  call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %2, <4 x i32> %2, ptr %a)
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, ptr nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}