1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s
; vld2: stride-2 de-interleaving load feeding a sum of squares.
; Each pass through vector.body loads <8 x float> from %pSrc at element offset
; 2 * %index, squares it lane-wise, adds the even-lane products to the odd-lane
; products, and stores the <4 x float> sum to %pDst at element offset %index.
; %index advances by 4 per pass and the loop exits when it reaches 1024.
; %numSamples is not used by any instruction visible in this function.
4 define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; Expected code: one post-incremented ld2 (32 bytes per pass), fmul + fmla on
; the two de-interleaved registers, and a 16-byte store addressed by x1 + x8,
; where x8 steps by 16 until it equals 4096.
6 ; CHECK: // %bb.0: // %entry
7 ; CHECK-NEXT: mov x8, xzr
8 ; CHECK-NEXT: .LBB0_1: // %vector.body
9 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
10 ; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32
11 ; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s
12 ; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s
13 ; CHECK-NEXT: str q2, [x1, x8]
14 ; CHECK-NEXT: add x8, x8, #16
15 ; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
16 ; CHECK-NEXT: b.ne .LBB0_1
17 ; CHECK-NEXT: // %bb.2: // %while.end
22 vector.body: ; preds = %vector.body, %entry
23 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
; Source offset is 2 * %index floats: two input floats per output float.
24 %0 = shl i64 %index, 1
25 %next.gep = getelementptr float, float* %pSrc, i64 %0
26 %next.gep19 = getelementptr float, float* %pDst, i64 %index
27 %1 = bitcast float* %next.gep to <8 x float>*
28 %wide.vec = load <8 x float>, <8 x float>* %1, align 4
; The wide square is written twice in the IR; each copy feeds one of the two
; stride-2 shuffles below (even lanes 0,2,4,6 and odd lanes 1,3,5,7).
29 %2 = fmul fast <8 x float> %wide.vec, %wide.vec
30 %3 = shufflevector <8 x float> %2, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
31 %4 = fmul fast <8 x float> %wide.vec, %wide.vec
32 %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
33 %6 = fadd fast <4 x float> %5, %3
34 %7 = bitcast float* %next.gep19 to <4 x float>*
35 store <4 x float> %6, <4 x float>* %7, align 4
; Four output floats per pass; exit once %index reaches 1024.
36 %index.next = add i64 %index, 4
37 %8 = icmp eq i64 %index.next, 1024
38 br i1 %8, label %while.end, label %vector.body
40 while.end: ; preds = %vector.body
; vld3: stride-3 de-interleaving load feeding a sum of squares.
; Each pass through vector.body loads <12 x float> from %pSrc at element offset
; 3 * %index, squares it lane-wise, sums the three stride-3 lane groups of the
; products, and stores the <4 x float> result to %pDst at element offset %index.
; %index advances by 4 per pass and the loop exits when it reaches 1024.
; %numSamples is not used by any instruction visible in this function.
44 define void @vld3(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; Expected code: one post-incremented ld3 (48 bytes per pass), fmul followed by
; two fmla on the three de-interleaved registers, and a 16-byte store addressed
; by x1 + x8, where x8 steps by 16 until it equals 4096.
46 ; CHECK: // %bb.0: // %entry
47 ; CHECK-NEXT: mov x8, xzr
48 ; CHECK-NEXT: .LBB1_1: // %vector.body
49 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
50 ; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
51 ; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s
52 ; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s
53 ; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s
54 ; CHECK-NEXT: str q3, [x1, x8]
55 ; CHECK-NEXT: add x8, x8, #16
56 ; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
57 ; CHECK-NEXT: b.ne .LBB1_1
58 ; CHECK-NEXT: // %bb.2: // %while.end
63 vector.body: ; preds = %vector.body, %entry
64 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
; Source offset is 3 * %index floats: three input floats per output float.
65 %0 = mul i64 %index, 3
66 %next.gep = getelementptr float, float* %pSrc, i64 %0
67 %next.gep23 = getelementptr float, float* %pDst, i64 %index
68 %1 = bitcast float* %next.gep to <12 x float>*
69 %wide.vec = load <12 x float>, <12 x float>* %1, align 4
; The wide square is written three times in the IR; each copy feeds one
; stride-3 shuffle (lanes 0,3,6,9 / 1,4,7,10 / 2,5,8,11).
70 %2 = fmul fast <12 x float> %wide.vec, %wide.vec
71 %3 = shufflevector <12 x float> %2, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
72 %4 = fmul fast <12 x float> %wide.vec, %wide.vec
73 %5 = shufflevector <12 x float> %4, <12 x float> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
74 %6 = fadd fast <4 x float> %5, %3
75 %7 = fmul fast <12 x float> %wide.vec, %wide.vec
76 %8 = shufflevector <12 x float> %7, <12 x float> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
77 %9 = fadd fast <4 x float> %6, %8
78 %10 = bitcast float* %next.gep23 to <4 x float>*
79 store <4 x float> %9, <4 x float>* %10, align 4
; Four output floats per pass; exit once %index reaches 1024.
80 %index.next = add i64 %index, 4
81 %11 = icmp eq i64 %index.next, 1024
82 br i1 %11, label %while.end, label %vector.body
84 while.end: ; preds = %vector.body
; vld4: stride-4 de-interleaving load with a two-way interleaved store.
; Each pass through vector.body loads <16 x float> from %pSrc at element offset
; 4 * %index and squares it lane-wise. The products from lane groups 0 and 1
; are added into one <4 x float>, those from groups 2 and 3 into another, and
; the two sums are interleaved into <8 x float> and stored to %pDst at element
; offset 2 * %index.
; %index advances by 4 per pass and the loop exits when it reaches 1024.
; %numSamples is not used by any instruction visible in this function.
88 define void @vld4(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
; Expected code: one post-incremented ld4 (64 bytes per pass), two fmul + fmla
; pairs, and an st2 to x1 + x8, where x8 steps by 32 until it equals 8192.
90 ; CHECK: // %bb.0: // %entry
91 ; CHECK-NEXT: mov x8, xzr
92 ; CHECK-NEXT: .LBB2_1: // %vector.body
93 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
94 ; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
95 ; CHECK-NEXT: add x9, x1, x8
96 ; CHECK-NEXT: add x8, x8, #32
97 ; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
98 ; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s
99 ; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s
100 ; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s
101 ; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s
102 ; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9]
103 ; CHECK-NEXT: b.ne .LBB2_1
104 ; CHECK-NEXT: // %bb.2: // %while.end
107 br label %vector.body
109 vector.body: ; preds = %vector.body, %entry
110 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
; Source offset is 4 * %index floats.
111 %0 = shl i64 %index, 2
112 %next.gep = getelementptr float, float* %pSrc, i64 %0
; Destination offset is 2 * %index floats (used by the getelementptr on %pDst below).
113 %1 = shl i64 %index, 1
114 %2 = bitcast float* %next.gep to <16 x float>*
115 %wide.vec = load <16 x float>, <16 x float>* %2, align 4
; The wide square is written four times in the IR; each copy feeds one
; stride-4 shuffle (lanes starting at 0, 1, 2 and 3 respectively).
116 %3 = fmul fast <16 x float> %wide.vec, %wide.vec
117 %4 = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
118 %5 = fmul fast <16 x float> %wide.vec, %wide.vec
119 %6 = shufflevector <16 x float> %5, <16 x float> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
120 %7 = fadd fast <4 x float> %6, %4
121 %8 = fmul fast <16 x float> %wide.vec, %wide.vec
122 %9 = shufflevector <16 x float> %8, <16 x float> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
123 %10 = fmul fast <16 x float> %wide.vec, %wide.vec
124 %11 = shufflevector <16 x float> %10, <16 x float> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
125 %12 = fadd fast <4 x float> %11, %9
126 %13 = getelementptr inbounds float, float* %pDst, i64 %1
127 %14 = bitcast float* %13 to <8 x float>*
; Interleave the two partial sums element by element: %7[0], %12[0], %7[1], ...
128 %interleaved.vec = shufflevector <4 x float> %7, <4 x float> %12, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
129 store <8 x float> %interleaved.vec, <8 x float>* %14, align 4
; %index advances by 4 per pass; exit once it reaches 1024.
130 %index.next = add i64 %index, 4
131 %15 = icmp eq i64 %index.next, 1024
132 br i1 %15, label %while.end, label %vector.body
134 while.end: ; preds = %vector.body
; twosrc: two stride-2 de-interleaving loads combined lane-wise.
; Each pass through vector.body loads <8 x float> from both %pSrc and %pSrc2 at
; element offset 2 * %index, multiplies the two wide vectors lane-wise, adds
; the even-lane products to the odd-lane products, and stores the <4 x float>
; sum to %pDst at element offset %index.
; %index advances by 4 per pass and the loop exits when it reaches 1024.
; %numSamples is not used by any instruction visible in this function.
138 define void @twosrc(float* nocapture readonly %pSrc, float* nocapture readonly %pSrc2, float* noalias nocapture %pDst, i32 %numSamples) {
; Expected code: two ld2 from addresses formed as base + x8 (no post-increment
; on the loads), fmul + fmla across the de-interleaved registers, and a
; post-incremented 16-byte store through x2. x8 steps by 32 until it equals 8192.
139 ; CHECK-LABEL: twosrc:
140 ; CHECK: // %bb.0: // %entry
141 ; CHECK-NEXT: mov x8, xzr
142 ; CHECK-NEXT: .LBB3_1: // %vector.body
143 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
144 ; CHECK-NEXT: add x9, x0, x8
145 ; CHECK-NEXT: add x10, x1, x8
146 ; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9]
147 ; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x10]
148 ; CHECK-NEXT: add x8, x8, #32
149 ; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
150 ; CHECK-NEXT: fmul v4.4s, v2.4s, v0.4s
151 ; CHECK-NEXT: fmla v4.4s, v1.4s, v3.4s
152 ; CHECK-NEXT: str q4, [x2], #16
153 ; CHECK-NEXT: b.ne .LBB3_1
154 ; CHECK-NEXT: // %bb.2: // %while.end
157 br label %vector.body
159 vector.body: ; preds = %vector.body, %entry
160 %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
; Both sources are read at element offset 2 * %index; the shift is written once
; per source pointer.
161 %0 = shl i64 %index, 1
162 %next.gep = getelementptr float, float* %pSrc, i64 %0
163 %1 = shl i64 %index, 1
164 %next.gep23 = getelementptr float, float* %pSrc2, i64 %1
165 %next.gep24 = getelementptr float, float* %pDst, i64 %index
166 %2 = bitcast float* %next.gep to <8 x float>*
167 %wide.vec = load <8 x float>, <8 x float>* %2, align 4
168 %3 = bitcast float* %next.gep23 to <8 x float>*
169 %wide.vec26 = load <8 x float>, <8 x float>* %3, align 4
; The wide product is written twice in the IR; each copy feeds one of the two
; stride-2 shuffles below (even lanes 0,2,4,6 and odd lanes 1,3,5,7).
170 %4 = fmul fast <8 x float> %wide.vec26, %wide.vec
171 %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
172 %6 = fmul fast <8 x float> %wide.vec26, %wide.vec
173 %7 = shufflevector <8 x float> %6, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
174 %8 = fadd fast <4 x float> %7, %5
175 %9 = bitcast float* %next.gep24 to <4 x float>*
176 store <4 x float> %8, <4 x float>* %9, align 4
; Four output floats per pass; exit once %index reaches 1024.
177 %index.next = add i64 %index, 4
178 %10 = icmp eq i64 %index.next, 1024
179 br i1 %10, label %while.end, label %vector.body
181 while.end: ; preds = %vector.body