1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; Test vector add reduction intrinsic
4 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s
; Declarations of the llvm.vector.reduce.add overloads called by the test
; functions in this file, grouped by total operand width.
; 128-bit operands (element types i8 through i128).
7 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
8 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
9 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
10 declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
11 declare i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
; 256-bit operands.
13 declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a)
14 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a)
15 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
16 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a)
17 declare i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
; 512-bit operands.
20 declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a)
21 declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a)
22 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
23 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a)
24 declare i128 @llvm.vector.reduce.add.v4i128(<4 x i128> %a)
; 64-bit operands (element types i8 through i64).
26 declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
27 declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
28 declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
29 declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
; f1_1: add-reduce a <16 x i8> argument to an i8 result.
; The checked sequence is vgbm, vsumb, vsumqf, then vlgvf with index 3 into %r2.
31 define i8 @f1_1(<16 x i8> %a) {
34 ; CHECK-NEXT: vgbm %v0, 0
35 ; CHECK-NEXT: vsumb %v1, %v24, %v0
36 ; CHECK-NEXT: vsumqf %v0, %v1, %v0
37 ; CHECK-NEXT: vlgvf %r2, %v0, 3
38 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
40 %redadd = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
; f1_2: add-reduce a <8 x i16> argument to an i16 result.
; The checked sequence is vgbm, vsumh, vsumqf, then vlgvf with index 3 into %r2.
44 define i16 @f1_2(<8 x i16> %a) {
47 ; CHECK-NEXT: vgbm %v0, 0
48 ; CHECK-NEXT: vsumh %v1, %v24, %v0
49 ; CHECK-NEXT: vsumqf %v0, %v1, %v0
50 ; CHECK-NEXT: vlgvf %r2, %v0, 3
51 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
53 %redadd = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
; f1_3: add-reduce a <4 x i32> argument to an i32 result.
; The checked sequence is vgbm, a single vsumqf on %v24, then vlgvf with index 3.
57 define i32 @f1_3(<4 x i32> %a) {
60 ; CHECK-NEXT: vgbm %v0, 0
61 ; CHECK-NEXT: vsumqf %v0, %v24, %v0
62 ; CHECK-NEXT: vlgvf %r2, %v0, 3
63 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
66 %redadd = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
; f1_4: add-reduce a <2 x i64> argument to an i64 result.
; The checked sequence is vrepg with index 1, vag, then vlgvg with index 0 into %r2.
70 define i64 @f1_4(<2 x i64> %a) {
73 ; CHECK-NEXT: vrepg %v0, %v24, 1
74 ; CHECK-NEXT: vag %v0, %v24, %v0
75 ; CHECK-NEXT: vlgvg %r2, %v0, 0
78 %redadd = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
; f1_5: add-reduce a <1 x i128> argument to an i128 result.
; The only checked instruction here is a vst of %v24 to 0(%r2); no add is expected.
82 define i128 @f1_5(<1 x i128> %a) {
85 ; CHECK-NEXT: vst %v24, 0(%r2), 3
87 %redadd = call i128 @llvm.vector.reduce.add.v1i128(<1 x i128> %a)
; f2_1: add-reduce a <32 x i8> argument to an i8 result.
; The checked sequence first combines %v24 and %v26 with vab, then applies
; vgbm, vsumb, vsumqf and vlgvf with index 3 before returning via br %r14.
91 define i8 @f2_1(<32 x i8> %a) {
94 ; CHECK-NEXT: vab %v0, %v24, %v26
95 ; CHECK-NEXT: vgbm %v1, 0
96 ; CHECK-NEXT: vsumb %v0, %v0, %v1
97 ; CHECK-NEXT: vsumqf %v0, %v0, %v1
98 ; CHECK-NEXT: vlgvf %r2, %v0, 3
99 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
100 ; CHECK-NEXT: br %r14
101 %redadd = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %a)
; f2_2: add-reduce a <16 x i16> argument to an i16 result.
; The checked sequence first combines %v24 and %v26 with vah, then applies
; vgbm, vsumh, vsumqf and vlgvf with index 3 before returning via br %r14.
105 define i16 @f2_2(<16 x i16> %a) {
108 ; CHECK-NEXT: vah %v0, %v24, %v26
109 ; CHECK-NEXT: vgbm %v1, 0
110 ; CHECK-NEXT: vsumh %v0, %v0, %v1
111 ; CHECK-NEXT: vsumqf %v0, %v0, %v1
112 ; CHECK-NEXT: vlgvf %r2, %v0, 3
113 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
114 ; CHECK-NEXT: br %r14
115 %redadd = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a)
; f2_3: add-reduce a <8 x i32> argument to an i32 result.
; The checked sequence first combines %v24 and %v26 with vaf, then applies
; vgbm, vsumqf and vlgvf with index 3 before returning via br %r14.
119 define i32 @f2_3(<8 x i32> %a) {
122 ; CHECK-NEXT: vaf %v0, %v24, %v26
123 ; CHECK-NEXT: vgbm %v1, 0
124 ; CHECK-NEXT: vsumqf %v0, %v0, %v1
125 ; CHECK-NEXT: vlgvf %r2, %v0, 3
126 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
127 ; CHECK-NEXT: br %r14
129 %redadd = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %a)
; f2_4: add-reduce a <4 x i64> argument to an i64 result.
; The checked sequence combines %v24 and %v26 with vag, then uses vrepg with
; index 1, a second vag, and vlgvg with index 0 before returning via br %r14.
133 define i64 @f2_4(<4 x i64> %a) {
136 ; CHECK-NEXT: vag %v0, %v24, %v26
137 ; CHECK-NEXT: vrepg %v1, %v0, 1
138 ; CHECK-NEXT: vag %v0, %v0, %v1
139 ; CHECK-NEXT: vlgvg %r2, %v0, 0
140 ; CHECK-NEXT: br %r14
142 %redadd = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a)
; f2_5: add-reduce a <2 x i128> argument to an i128 result.
; The checked sequence loads two values with vl from offsets 16 and 0 off %r3,
; adds them with vaq, and stores the result with vst to 0(%r2).
146 define i128 @f2_5(<2 x i128> %a) {
149 ; CHECK-NEXT: vl %v0, 16(%r3), 3
150 ; CHECK-NEXT: vl %v1, 0(%r3), 3
151 ; CHECK-NEXT: vaq %v0, %v1, %v0
152 ; CHECK-NEXT: vst %v0, 0(%r2), 3
153 ; CHECK-NEXT: br %r14
154 %redadd = call i128 @llvm.vector.reduce.add.v2i128(<2 x i128> %a)
; f3_1: add-reduce a <64 x i8> argument to an i8 result.
; The checked sequence uses three vab instructions to combine %v24, %v26, %v28
; and %v30, then applies vgbm, vsumb, vsumqf and vlgvf with index 3.
158 define i8 @f3_1(<64 x i8> %a) {
161 ; CHECK-NEXT: vab %v0, %v26, %v30
162 ; CHECK-NEXT: vab %v1, %v24, %v28
163 ; CHECK-NEXT: vab %v0, %v1, %v0
164 ; CHECK-NEXT: vgbm %v1, 0
165 ; CHECK-NEXT: vsumb %v0, %v0, %v1
166 ; CHECK-NEXT: vsumqf %v0, %v0, %v1
167 ; CHECK-NEXT: vlgvf %r2, %v0, 3
168 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
169 ; CHECK-NEXT: br %r14
170 %redadd = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %a)
; f3_2: add-reduce a <32 x i16> argument to an i16 result.
; The checked sequence uses three vah instructions to combine %v24, %v26, %v28
; and %v30, then applies vgbm, vsumh, vsumqf and vlgvf with index 3.
174 define i16 @f3_2(<32 x i16> %a) {
177 ; CHECK-NEXT: vah %v0, %v26, %v30
178 ; CHECK-NEXT: vah %v1, %v24, %v28
179 ; CHECK-NEXT: vah %v0, %v1, %v0
180 ; CHECK-NEXT: vgbm %v1, 0
181 ; CHECK-NEXT: vsumh %v0, %v0, %v1
182 ; CHECK-NEXT: vsumqf %v0, %v0, %v1
183 ; CHECK-NEXT: vlgvf %r2, %v0, 3
184 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
185 ; CHECK-NEXT: br %r14
186 %redadd = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %a)
; f3_3: add-reduce a <16 x i32> argument to an i32 result.
; The checked sequence uses three vaf instructions to combine %v24, %v26, %v28
; and %v30, then applies vgbm, vsumqf and vlgvf with index 3.
190 define i32 @f3_3(<16 x i32> %a) {
193 ; CHECK-NEXT: vaf %v0, %v26, %v30
194 ; CHECK-NEXT: vaf %v1, %v24, %v28
195 ; CHECK-NEXT: vaf %v0, %v1, %v0
196 ; CHECK-NEXT: vgbm %v1, 0
197 ; CHECK-NEXT: vsumqf %v0, %v0, %v1
198 ; CHECK-NEXT: vlgvf %r2, %v0, 3
199 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
200 ; CHECK-NEXT: br %r14
202 %redadd = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a)
; f3_4: add-reduce a <8 x i64> argument to an i64 result.
; The checked sequence uses three vag instructions to combine %v24, %v26, %v28
; and %v30, then vrepg with index 1, one more vag, and vlgvg with index 0.
206 define i64 @f3_4(<8 x i64> %a) {
209 ; CHECK-NEXT: vag %v0, %v26, %v30
210 ; CHECK-NEXT: vag %v1, %v24, %v28
211 ; CHECK-NEXT: vag %v0, %v1, %v0
212 ; CHECK-NEXT: vrepg %v1, %v0, 1
213 ; CHECK-NEXT: vag %v0, %v0, %v1
214 ; CHECK-NEXT: vlgvg %r2, %v0, 0
215 ; CHECK-NEXT: br %r14
217 %redadd = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a)
; f3_5: add-reduce a <4 x i128> argument to an i128 result.
; The checked sequence loads four values with vl from offsets 32, 0, 48 and 16
; off %r3, combines them with three vaq instructions, and stores the result
; with vst to 0(%r2).
221 define i128 @f3_5(<4 x i128> %a) {
224 ; CHECK-NEXT: vl %v0, 32(%r3), 3
225 ; CHECK-NEXT: vl %v1, 0(%r3), 3
226 ; CHECK-NEXT: vl %v2, 48(%r3), 3
227 ; CHECK-NEXT: vl %v3, 16(%r3), 3
228 ; CHECK-NEXT: vaq %v2, %v3, %v2
229 ; CHECK-NEXT: vaq %v0, %v1, %v0
230 ; CHECK-NEXT: vaq %v0, %v0, %v2
231 ; CHECK-NEXT: vst %v0, 0(%r2), 3
232 ; CHECK-NEXT: br %r14
233 %redadd = call i128 @llvm.vector.reduce.add.v4i128(<4 x i128> %a)
; f4_1: add-reduce a <8 x i8> argument to an i8 result.
; The checked sequence alternates vpkg, vpkf and vrepb (index 1) with vab adds,
; then reads the result with vlgvb at index 0; no vsum* instruction is expected.
238 define i8 @f4_1(<8 x i8> %a) {
241 ; CHECK-NEXT: vpkg %v0, %v24, %v24
242 ; CHECK-NEXT: vab %v0, %v24, %v0
243 ; CHECK-NEXT: vpkf %v1, %v0, %v0
244 ; CHECK-NEXT: vab %v0, %v0, %v1
245 ; CHECK-NEXT: vrepb %v1, %v0, 1
246 ; CHECK-NEXT: vab %v0, %v0, %v1
247 ; CHECK-NEXT: vlgvb %r2, %v0, 0
248 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
249 ; CHECK-NEXT: br %r14
250 %redadd = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
; f4_2: add-reduce a <4 x i16> argument to an i16 result.
; The checked sequence is vpkg, vah, vreph with index 1, vah, then vlgvh at
; index 0; no vsum* instruction is expected.
254 define i16 @f4_2(<4 x i16> %a) {
257 ; CHECK-NEXT: vpkg %v0, %v24, %v24
258 ; CHECK-NEXT: vah %v0, %v24, %v0
259 ; CHECK-NEXT: vreph %v1, %v0, 1
260 ; CHECK-NEXT: vah %v0, %v0, %v1
261 ; CHECK-NEXT: vlgvh %r2, %v0, 0
262 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
263 ; CHECK-NEXT: br %r14
264 %redadd = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
; f4_3: add-reduce a <2 x i32> argument to an i32 result.
; The checked sequence is vrepf with index 1, vaf, then vlgvf at index 0.
268 define i32 @f4_3(<2 x i32> %a) {
271 ; CHECK-NEXT: vrepf %v0, %v24, 1
272 ; CHECK-NEXT: vaf %v0, %v24, %v0
273 ; CHECK-NEXT: vlgvf %r2, %v0, 0
274 ; CHECK-NEXT: # kill: def $r2l killed $r2l killed $r2d
275 ; CHECK-NEXT: br %r14
277 %redadd = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
; f4_4: add-reduce a <1 x i64> argument to an i64 result.
; The checked code is only a vlgvg of %v24 at index 0 followed by br %r14;
; no add instruction is expected.
281 define i64 @f4_4(<1 x i64> %a) {
284 ; CHECK-NEXT: vlgvg %r2, %v24, 0
285 ; CHECK-NEXT: br %r14
287 %redadd = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)