1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
3 ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
4 ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s
5 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
6 ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \
7 ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
10 declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
; Builds an accumulator from four copies of %vc with assemble.acc and stores it
; to %ptr. Both sets of assertions expect vs0-vs3 to be filled from v2 (and its
; copy in v3) and written with four stxv; neither expects xxmtacc or xxmfacc.
; The little-endian and big-endian assertions differ in the store offsets.
11 define void @ass_acc(<512 x i1>* %ptr, <16 x i8> %vc) {
12 ; CHECK-LABEL: ass_acc:
13 ; CHECK: # %bb.0: # %entry
14 ; CHECK-NEXT: vmr v3, v2
15 ; CHECK-NEXT: xxlor vs0, v2, v2
16 ; CHECK-NEXT: xxlor vs1, v3, v3
17 ; CHECK-NEXT: xxlor vs2, v2, v2
18 ; CHECK-NEXT: xxlor vs3, v3, v3
19 ; CHECK-NEXT: stxv vs0, 48(r3)
20 ; CHECK-NEXT: stxv vs1, 32(r3)
21 ; CHECK-NEXT: stxv vs2, 16(r3)
22 ; CHECK-NEXT: stxv vs3, 0(r3)
25 ; CHECK-BE-LABEL: ass_acc:
26 ; CHECK-BE: # %bb.0: # %entry
27 ; CHECK-BE-NEXT: vmr v3, v2
28 ; CHECK-BE-NEXT: xxlor vs0, v2, v2
29 ; CHECK-BE-NEXT: xxlor vs1, v3, v3
30 ; CHECK-BE-NEXT: xxlor vs2, v2, v2
31 ; CHECK-BE-NEXT: xxlor vs3, v3, v3
32 ; CHECK-BE-NEXT: stxv vs1, 16(r3)
33 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
34 ; CHECK-BE-NEXT: stxv vs3, 48(r3)
35 ; CHECK-BE-NEXT: stxv vs2, 32(r3)
38 %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
39 store <512 x i1> %0, <512 x i1>* %ptr, align 64
44 declare <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1>)
; Assembles an accumulator from four copies of %vc, passes it through the
; xxmtacc intrinsic and stores the result to %ptr.
45 define void @int_xxmtacc(<512 x i1>* %ptr, <16 x i8> %vc) {
46 ; CHECK-LABEL: int_xxmtacc:
47 ; CHECK: # %bb.0: # %entry
48 ; CHECK-NEXT: vmr v3, v2
49 ; CHECK-NEXT: xxlor vs0, v2, v2
50 ; CHECK-NEXT: xxlor vs1, v3, v3
51 ; CHECK-NEXT: xxlor vs2, v2, v2
52 ; CHECK-NEXT: xxlor vs3, v3, v3
53 ; CHECK-NEXT: xxmtacc acc0
54 ; CHECK-NEXT: stxv vs0, 48(r3)
55 ; CHECK-NEXT: stxv vs1, 32(r3)
56 ; CHECK-NEXT: stxv vs2, 16(r3)
57 ; CHECK-NEXT: stxv vs3, 0(r3)
60 ; CHECK-BE-LABEL: int_xxmtacc:
61 ; CHECK-BE: # %bb.0: # %entry
62 ; CHECK-BE-NEXT: vmr v3, v2
63 ; CHECK-BE-NEXT: xxlor vs0, v2, v2
64 ; CHECK-BE-NEXT: xxlor vs1, v3, v3
65 ; CHECK-BE-NEXT: xxlor vs2, v2, v2
66 ; CHECK-BE-NEXT: xxlor vs3, v3, v3
67 ; CHECK-BE-NEXT: xxmtacc acc0
68 ; CHECK-BE-NEXT: stxv vs1, 16(r3)
69 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
70 ; CHECK-BE-NEXT: stxv vs3, 48(r3)
71 ; CHECK-BE-NEXT: stxv vs2, 32(r3)
74 ; Naively, one xxmtacc comes from the call to assemble.acc, one xxmtacc from
75 ; the call to xxmtacc, and one xxmfacc from the store.
; The assertions above expect only a single xxmtacc and no xxmfacc in the final
; output (presumably the redundant prime/unprime instructions are removed
; during codegen -- TODO confirm).
76 %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
77 %1 = tail call <512 x i1> @llvm.ppc.mma.xxmtacc(<512 x i1> %0)
78 store <512 x i1> %1, <512 x i1>* %ptr, align 64
83 declare <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1>)
; Assembles an accumulator from four copies of %vc, passes it through the
; xxmfacc intrinsic and stores the result to %ptr.
84 define void @int_xxmfacc(<512 x i1>* %ptr, <16 x i8> %vc) {
85 ; CHECK-LABEL: int_xxmfacc:
86 ; CHECK: # %bb.0: # %entry
87 ; CHECK-NEXT: vmr v3, v2
88 ; CHECK-NEXT: xxlor vs0, v2, v2
89 ; CHECK-NEXT: xxlor vs1, v3, v3
90 ; CHECK-NEXT: xxlor vs2, v2, v2
91 ; CHECK-NEXT: xxlor vs3, v3, v3
92 ; CHECK-NEXT: stxv vs0, 48(r3)
93 ; CHECK-NEXT: stxv vs1, 32(r3)
94 ; CHECK-NEXT: stxv vs2, 16(r3)
95 ; CHECK-NEXT: stxv vs3, 0(r3)
98 ; CHECK-BE-LABEL: int_xxmfacc:
99 ; CHECK-BE: # %bb.0: # %entry
100 ; CHECK-BE-NEXT: vmr v3, v2
101 ; CHECK-BE-NEXT: xxlor vs0, v2, v2
102 ; CHECK-BE-NEXT: xxlor vs1, v3, v3
103 ; CHECK-BE-NEXT: xxlor vs2, v2, v2
104 ; CHECK-BE-NEXT: xxlor vs3, v3, v3
105 ; CHECK-BE-NEXT: stxv vs1, 16(r3)
106 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
107 ; CHECK-BE-NEXT: stxv vs3, 48(r3)
108 ; CHECK-BE-NEXT: stxv vs2, 32(r3)
111 ; Naively, one xxmtacc comes from the call to assemble.acc, one xxmfacc from
112 ; the call to xxmfacc, and one xxmfacc from the store.
; The assertions above expect neither xxmtacc nor xxmfacc in the final output
; (presumably the redundant prime/unprime instructions are removed during
; codegen -- TODO confirm).
113 %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc)
114 %1 = tail call <512 x i1> @llvm.ppc.mma.xxmfacc(<512 x i1> %0)
115 store <512 x i1> %1, <512 x i1>* %ptr, align 64
120 declare <512 x i1> @llvm.ppc.mma.xxsetaccz()
; Creates an accumulator with the xxsetaccz intrinsic and stores it to %ptr.
; Expected code: xxsetaccz acc0, then xxmfacc acc0, then four stxv of vs0-vs3
; (store offsets differ between the little-endian and big-endian assertions).
121 define void @int_xxsetaccz(<512 x i1>* %ptr) {
122 ; CHECK-LABEL: int_xxsetaccz:
123 ; CHECK: # %bb.0: # %entry
124 ; CHECK-NEXT: xxsetaccz acc0
125 ; CHECK-NEXT: xxmfacc acc0
126 ; CHECK-NEXT: stxv vs0, 48(r3)
127 ; CHECK-NEXT: stxv vs1, 32(r3)
128 ; CHECK-NEXT: stxv vs2, 16(r3)
129 ; CHECK-NEXT: stxv vs3, 0(r3)
132 ; CHECK-BE-LABEL: int_xxsetaccz:
133 ; CHECK-BE: # %bb.0: # %entry
134 ; CHECK-BE-NEXT: xxsetaccz acc0
135 ; CHECK-BE-NEXT: xxmfacc acc0
136 ; CHECK-BE-NEXT: stxv vs1, 16(r3)
137 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
138 ; CHECK-BE-NEXT: stxv vs3, 48(r3)
139 ; CHECK-BE-NEXT: stxv vs2, 32(r3)
142 %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
143 store <512 x i1> %0, <512 x i1>* %ptr, align 64
148 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>)
; Creates an accumulator with xxsetaccz, splits it into four <16 x i8> values
; with disassemble.acc, and stores element 0..3 to %ptr1..%ptr4 respectively.
; The little-endian assertions store vs3, vs2, vs1, vs0 to r3..r6; the
; big-endian assertions store vs0, vs1, vs2, vs3 to r3..r6.
149 define void @disass_acc(<16 x i8>* %ptr1, <16 x i8>* %ptr2, <16 x i8>* %ptr3, <16 x i8>* %ptr4) {
150 ; CHECK-LABEL: disass_acc:
151 ; CHECK: # %bb.0: # %entry
152 ; CHECK-NEXT: xxsetaccz acc0
153 ; CHECK-NEXT: xxmfacc acc0
154 ; CHECK-NEXT: stxv vs3, 0(r3)
155 ; CHECK-NEXT: stxv vs2, 0(r4)
156 ; CHECK-NEXT: stxv vs1, 0(r5)
157 ; CHECK-NEXT: stxv vs0, 0(r6)
160 ; CHECK-BE-LABEL: disass_acc:
161 ; CHECK-BE: # %bb.0: # %entry
162 ; CHECK-BE-NEXT: xxsetaccz acc0
163 ; CHECK-BE-NEXT: xxmfacc acc0
164 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
165 ; CHECK-BE-NEXT: stxv vs1, 0(r4)
166 ; CHECK-BE-NEXT: stxv vs2, 0(r5)
167 ; CHECK-BE-NEXT: stxv vs3, 0(r6)
170 %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
171 %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0)
; Pull the four vector elements out of the returned struct.
172 %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
173 %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
174 %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
175 %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
176 store <16 x i8> %2, <16 x i8>* %ptr1, align 16
177 store <16 x i8> %3, <16 x i8>* %ptr2, align 16
178 store <16 x i8> %4, <16 x i8>* %ptr3, align 16
179 store <16 x i8> %5, <16 x i8>* %ptr4, align 16
183 declare <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1>, <16 x i8>, <16 x i8>)
; Accumulator value merged across a branch. When %val is non-zero the
; accumulator comes from xxsetaccz (if.then); when %val is zero it is loaded
; from %ptr and updated by xvi4ger8pp (if.else). The phi result is stored back
; to %ptr. The assertions expect a single xxmfacc, placed in if.end after the
; two paths join, with xxmtacc only on the path that loads from memory.
184 define void @testBranch(<512 x i1>* %ptr, <16 x i8> %vc, i32 %val) {
185 ; CHECK-LABEL: testBranch:
186 ; CHECK: # %bb.0: # %entry
187 ; CHECK-NEXT: cmplwi r7, 0
188 ; CHECK-NEXT: beq cr0, .LBB5_2
189 ; CHECK-NEXT: # %bb.1: # %if.then
190 ; CHECK-NEXT: xxsetaccz acc0
191 ; CHECK-NEXT: b .LBB5_3
192 ; CHECK-NEXT: .LBB5_2: # %if.else
193 ; CHECK-NEXT: lxv vs1, 32(r3)
194 ; CHECK-NEXT: lxv vs0, 48(r3)
195 ; CHECK-NEXT: lxv vs3, 0(r3)
196 ; CHECK-NEXT: lxv vs2, 16(r3)
197 ; CHECK-NEXT: xxmtacc acc0
198 ; CHECK-NEXT: xvi4ger8pp acc0, v2, v2
199 ; CHECK-NEXT: .LBB5_3: # %if.end
200 ; CHECK-NEXT: xxmfacc acc0
201 ; CHECK-NEXT: stxv vs0, 48(r3)
202 ; CHECK-NEXT: stxv vs1, 32(r3)
203 ; CHECK-NEXT: stxv vs2, 16(r3)
204 ; CHECK-NEXT: stxv vs3, 0(r3)
207 ; CHECK-BE-LABEL: testBranch:
208 ; CHECK-BE: # %bb.0: # %entry
209 ; CHECK-BE-NEXT: cmplwi r7, 0
210 ; CHECK-BE-NEXT: beq cr0, .LBB5_2
211 ; CHECK-BE-NEXT: # %bb.1: # %if.then
212 ; CHECK-BE-NEXT: xxsetaccz acc0
213 ; CHECK-BE-NEXT: b .LBB5_3
214 ; CHECK-BE-NEXT: .LBB5_2: # %if.else
215 ; CHECK-BE-NEXT: lxv vs1, 16(r3)
216 ; CHECK-BE-NEXT: lxv vs0, 0(r3)
217 ; CHECK-BE-NEXT: lxv vs3, 48(r3)
218 ; CHECK-BE-NEXT: lxv vs2, 32(r3)
219 ; CHECK-BE-NEXT: xxmtacc acc0
220 ; CHECK-BE-NEXT: xvi4ger8pp acc0, v2, v2
221 ; CHECK-BE-NEXT: .LBB5_3: # %if.end
222 ; CHECK-BE-NEXT: xxmfacc acc0
223 ; CHECK-BE-NEXT: stxv vs1, 16(r3)
224 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
225 ; CHECK-BE-NEXT: stxv vs3, 48(r3)
226 ; CHECK-BE-NEXT: stxv vs2, 32(r3)
; %val == 0 takes the if.else path; any other value takes if.then.
229 %tobool = icmp eq i32 %val, 0
230 br i1 %tobool, label %if.else, label %if.then
233 %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
237 %1 = load <512 x i1>, <512 x i1>* %ptr, align 64
238 %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
242 %vq1.0 = phi <512 x i1> [ %0, %if.then ], [ %2, %if.else ]
243 store <512 x i1> %vq1.0, <512 x i1>* %ptr, align 64
247 ; The following test cases check that the xxsetaccz instruction is correctly rematerialized
248 declare <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1>, <16 x i8>, <16 x i8>)
249 declare <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1>, <16 x i8>, <16 x i8>)
250 declare <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1>, <16 x i8>, <16 x i8>)
; Two xxsetaccz calls each feed an xvf32gerpp call with identical vector
; operands; the results are stored to %res[0] and %res[1]. The assertions
; expect only one xxsetaccz and one xvf32gerpp in the output, with the same
; acc0 contents (vs0-vs3) stored to both destinations.
252 define void @testcse(<512 x i1>* %res, <16 x i8> %vc) {
253 ; CHECK-LABEL: testcse:
254 ; CHECK: # %bb.0: # %entry
255 ; CHECK-NEXT: xxsetaccz acc0
256 ; CHECK-NEXT: xvf32gerpp acc0, v2, v2
257 ; CHECK-NEXT: xxmfacc acc0
258 ; CHECK-NEXT: stxv vs0, 48(r3)
259 ; CHECK-NEXT: stxv vs1, 32(r3)
260 ; CHECK-NEXT: stxv vs2, 16(r3)
261 ; CHECK-NEXT: stxv vs3, 0(r3)
262 ; CHECK-NEXT: stxv vs0, 112(r3)
263 ; CHECK-NEXT: stxv vs1, 96(r3)
264 ; CHECK-NEXT: stxv vs2, 80(r3)
265 ; CHECK-NEXT: stxv vs3, 64(r3)
268 ; CHECK-BE-LABEL: testcse:
269 ; CHECK-BE: # %bb.0: # %entry
270 ; CHECK-BE-NEXT: xxsetaccz acc0
271 ; CHECK-BE-NEXT: xvf32gerpp acc0, v2, v2
272 ; CHECK-BE-NEXT: xxmfacc acc0
273 ; CHECK-BE-NEXT: stxv vs1, 16(r3)
274 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
275 ; CHECK-BE-NEXT: stxv vs3, 48(r3)
276 ; CHECK-BE-NEXT: stxv vs2, 32(r3)
277 ; CHECK-BE-NEXT: stxv vs1, 80(r3)
278 ; CHECK-BE-NEXT: stxv vs0, 64(r3)
279 ; CHECK-BE-NEXT: stxv vs3, 112(r3)
280 ; CHECK-BE-NEXT: stxv vs2, 96(r3)
283 %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
284 %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
285 %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
286 %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
; %4 and %5 address the first and second <512 x i1> slots of %res.
287 %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0
288 %5 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1
289 store <512 x i1> %2, <512 x i1>* %4, align 64
290 store <512 x i1> %3, <512 x i1>* %5, align 64
; Two xxsetaccz calls feed two different operations: %0 goes to xvf32gerpp and
; %1 goes to xvf32gerpn. Results are stored to %res[0] and %res[1]. The
; assertions expect two xxsetaccz instructions (acc0 and acc1), one per
; operation, since each operation updates its own accumulator.
294 define void @testcse2(<512 x i1>* %res, <16 x i8> %vc) {
295 ; CHECK-LABEL: testcse2:
296 ; CHECK: # %bb.0: # %entry
297 ; CHECK-NEXT: xxsetaccz acc0
298 ; CHECK-NEXT: xxsetaccz acc1
299 ; CHECK-NEXT: xvf32gerpp acc1, v2, v2
300 ; CHECK-NEXT: xvf32gerpn acc0, v2, v2
301 ; CHECK-NEXT: xxmfacc acc1
302 ; CHECK-NEXT: xxmfacc acc0
303 ; CHECK-NEXT: stxv vs4, 48(r3)
304 ; CHECK-NEXT: stxv vs5, 32(r3)
305 ; CHECK-NEXT: stxv vs6, 16(r3)
306 ; CHECK-NEXT: stxv vs7, 0(r3)
307 ; CHECK-NEXT: stxv vs0, 112(r3)
308 ; CHECK-NEXT: stxv vs1, 96(r3)
309 ; CHECK-NEXT: stxv vs2, 80(r3)
310 ; CHECK-NEXT: stxv vs3, 64(r3)
313 ; CHECK-BE-LABEL: testcse2:
314 ; CHECK-BE: # %bb.0: # %entry
315 ; CHECK-BE-NEXT: xxsetaccz acc0
316 ; CHECK-BE-NEXT: xxsetaccz acc1
317 ; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
318 ; CHECK-BE-NEXT: xvf32gerpn acc0, v2, v2
319 ; CHECK-BE-NEXT: xxmfacc acc1
320 ; CHECK-BE-NEXT: xxmfacc acc0
321 ; CHECK-BE-NEXT: stxv vs5, 16(r3)
322 ; CHECK-BE-NEXT: stxv vs4, 0(r3)
323 ; CHECK-BE-NEXT: stxv vs7, 48(r3)
324 ; CHECK-BE-NEXT: stxv vs6, 32(r3)
325 ; CHECK-BE-NEXT: stxv vs1, 80(r3)
326 ; CHECK-BE-NEXT: stxv vs0, 64(r3)
327 ; CHECK-BE-NEXT: stxv vs3, 112(r3)
328 ; CHECK-BE-NEXT: stxv vs2, 96(r3)
331 %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
332 %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
333 %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
334 %3 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %vc, <16 x i8> %vc)
; %4 and %5 address the first and second <512 x i1> slots of %res.
335 %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0
336 %5 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1
337 store <512 x i1> %2, <512 x i1>* %4, align 64
338 store <512 x i1> %3, <512 x i1>* %5, align 64
; A single xxsetaccz call (%0) is used as the accumulator input of both
; xvf32gerpp and xvf32gerpn. The assertions still expect two xxsetaccz
; instructions (acc0 and acc1): the zeroing is emitted once per user rather
; than the zeroed accumulator being copied.
342 define void @testcse3(<512 x i1>* %res, <16 x i8> %vc) {
343 ; CHECK-LABEL: testcse3:
344 ; CHECK: # %bb.0: # %entry
345 ; CHECK-NEXT: xxsetaccz acc0
346 ; CHECK-NEXT: xxsetaccz acc1
347 ; CHECK-NEXT: xvf32gerpp acc1, v2, v2
348 ; CHECK-NEXT: xvf32gerpn acc0, v2, v2
349 ; CHECK-NEXT: xxmfacc acc1
350 ; CHECK-NEXT: xxmfacc acc0
351 ; CHECK-NEXT: stxv vs4, 48(r3)
352 ; CHECK-NEXT: stxv vs5, 32(r3)
353 ; CHECK-NEXT: stxv vs6, 16(r3)
354 ; CHECK-NEXT: stxv vs7, 0(r3)
355 ; CHECK-NEXT: stxv vs0, 112(r3)
356 ; CHECK-NEXT: stxv vs1, 96(r3)
357 ; CHECK-NEXT: stxv vs2, 80(r3)
358 ; CHECK-NEXT: stxv vs3, 64(r3)
361 ; CHECK-BE-LABEL: testcse3:
362 ; CHECK-BE: # %bb.0: # %entry
363 ; CHECK-BE-NEXT: xxsetaccz acc0
364 ; CHECK-BE-NEXT: xxsetaccz acc1
365 ; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
366 ; CHECK-BE-NEXT: xvf32gerpn acc0, v2, v2
367 ; CHECK-BE-NEXT: xxmfacc acc1
368 ; CHECK-BE-NEXT: xxmfacc acc0
369 ; CHECK-BE-NEXT: stxv vs5, 16(r3)
370 ; CHECK-BE-NEXT: stxv vs4, 0(r3)
371 ; CHECK-BE-NEXT: stxv vs7, 48(r3)
372 ; CHECK-BE-NEXT: stxv vs6, 32(r3)
373 ; CHECK-BE-NEXT: stxv vs1, 80(r3)
374 ; CHECK-BE-NEXT: stxv vs0, 64(r3)
375 ; CHECK-BE-NEXT: stxv vs3, 112(r3)
376 ; CHECK-BE-NEXT: stxv vs2, 96(r3)
; Only one xxsetaccz call here; both ger calls below consume %0.
379 %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz()
380 %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
381 %2 = call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
382 %3 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 0
383 %4 = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 1
384 store <512 x i1> %1, <512 x i1>* %3, align 64
385 store <512 x i1> %2, <512 x i1>* %4, align 64
; Loop version of the xxsetaccz tests. The loop body is skipped when %lim <= 0
; and otherwise runs %lim times. Each iteration zeroes three accumulators,
; loads six consecutive <16 x i8> values from %vc starting at index 6*i, feeds
; them pairwise to xvf32gerpp / xvf32gerpn / xvf32gernp, and stores the three
; results to consecutive <512 x i1> slots of %res starting at index 3*i
; (masked to 32 bits). The assertions expect three xxsetaccz instructions
; (acc2, acc1, acc0) inside the loop body after the .LBB9_2 label.
389 define void @testcse4(<512 x i1>* %res, i32 %lim, <16 x i8>* %vc) {
390 ; CHECK-LABEL: testcse4:
391 ; CHECK: # %bb.0: # %entry
392 ; CHECK-NEXT: cmpwi r4, 1
393 ; CHECK-NEXT: bltlr cr0
394 ; CHECK-NEXT: # %bb.1: # %for.body.preheader
395 ; CHECK-NEXT: clrldi r4, r4, 32
396 ; CHECK-NEXT: li r6, 0
397 ; CHECK-NEXT: mtctr r4
398 ; CHECK-NEXT: li r4, 0
399 ; CHECK-NEXT: .p2align 4
400 ; CHECK-NEXT: .LBB9_2: # %for.body
402 ; CHECK-NEXT: rldic r7, r6, 4, 28
403 ; CHECK-NEXT: xxsetaccz acc2
404 ; CHECK-NEXT: xxsetaccz acc1
405 ; CHECK-NEXT: addi r6, r6, 6
406 ; CHECK-NEXT: lxvx vs0, r5, r7
407 ; CHECK-NEXT: add r7, r5, r7
408 ; CHECK-NEXT: lxv vs1, 16(r7)
409 ; CHECK-NEXT: xvf32gerpp acc2, vs0, vs1
410 ; CHECK-NEXT: lxv vs0, 32(r7)
411 ; CHECK-NEXT: lxv vs1, 48(r7)
412 ; CHECK-NEXT: xxmfacc acc2
413 ; CHECK-NEXT: xvf32gerpn acc1, vs0, vs1
414 ; CHECK-NEXT: lxv vs12, 64(r7)
415 ; CHECK-NEXT: lxv vs13, 80(r7)
416 ; CHECK-NEXT: rldic r7, r4, 6, 26
417 ; CHECK-NEXT: xxsetaccz acc0
418 ; CHECK-NEXT: addi r4, r4, 3
419 ; CHECK-NEXT: xxmfacc acc1
420 ; CHECK-NEXT: xvf32gernp acc0, vs12, vs13
421 ; CHECK-NEXT: stxvx vs11, r3, r7
422 ; CHECK-NEXT: add r7, r3, r7
423 ; CHECK-NEXT: xxmfacc acc0
424 ; CHECK-NEXT: stxv vs8, 48(r7)
425 ; CHECK-NEXT: stxv vs9, 32(r7)
426 ; CHECK-NEXT: stxv vs10, 16(r7)
427 ; CHECK-NEXT: stxv vs4, 112(r7)
428 ; CHECK-NEXT: stxv vs5, 96(r7)
429 ; CHECK-NEXT: stxv vs6, 80(r7)
430 ; CHECK-NEXT: stxv vs7, 64(r7)
431 ; CHECK-NEXT: stxv vs0, 176(r7)
432 ; CHECK-NEXT: stxv vs1, 160(r7)
433 ; CHECK-NEXT: stxv vs2, 144(r7)
434 ; CHECK-NEXT: stxv vs3, 128(r7)
435 ; CHECK-NEXT: bdnz .LBB9_2
436 ; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
439 ; CHECK-BE-LABEL: testcse4:
440 ; CHECK-BE: # %bb.0: # %entry
441 ; CHECK-BE-NEXT: cmpwi r4, 1
442 ; CHECK-BE-NEXT: bltlr cr0
443 ; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader
444 ; CHECK-BE-NEXT: clrldi r4, r4, 32
445 ; CHECK-BE-NEXT: li r6, 0
446 ; CHECK-BE-NEXT: mtctr r4
447 ; CHECK-BE-NEXT: li r4, 0
448 ; CHECK-BE-NEXT: .p2align 4
449 ; CHECK-BE-NEXT: .LBB9_2: # %for.body
451 ; CHECK-BE-NEXT: rldic r7, r6, 4, 28
452 ; CHECK-BE-NEXT: xxsetaccz acc2
453 ; CHECK-BE-NEXT: xxsetaccz acc1
454 ; CHECK-BE-NEXT: addi r6, r6, 6
455 ; CHECK-BE-NEXT: lxvx vs0, r5, r7
456 ; CHECK-BE-NEXT: add r7, r5, r7
457 ; CHECK-BE-NEXT: lxv vs1, 16(r7)
458 ; CHECK-BE-NEXT: xvf32gerpp acc2, vs0, vs1
459 ; CHECK-BE-NEXT: lxv vs0, 32(r7)
460 ; CHECK-BE-NEXT: lxv vs1, 48(r7)
461 ; CHECK-BE-NEXT: xxmfacc acc2
462 ; CHECK-BE-NEXT: xvf32gerpn acc1, vs0, vs1
463 ; CHECK-BE-NEXT: lxv vs12, 64(r7)
464 ; CHECK-BE-NEXT: lxv vs13, 80(r7)
465 ; CHECK-BE-NEXT: rldic r7, r4, 6, 26
466 ; CHECK-BE-NEXT: xxsetaccz acc0
467 ; CHECK-BE-NEXT: addi r4, r4, 3
468 ; CHECK-BE-NEXT: xxmfacc acc1
469 ; CHECK-BE-NEXT: xvf32gernp acc0, vs12, vs13
470 ; CHECK-BE-NEXT: stxvx vs8, r3, r7
471 ; CHECK-BE-NEXT: add r7, r3, r7
472 ; CHECK-BE-NEXT: xxmfacc acc0
473 ; CHECK-BE-NEXT: stxv vs9, 16(r7)
474 ; CHECK-BE-NEXT: stxv vs11, 48(r7)
475 ; CHECK-BE-NEXT: stxv vs10, 32(r7)
476 ; CHECK-BE-NEXT: stxv vs5, 80(r7)
477 ; CHECK-BE-NEXT: stxv vs4, 64(r7)
478 ; CHECK-BE-NEXT: stxv vs7, 112(r7)
479 ; CHECK-BE-NEXT: stxv vs6, 96(r7)
480 ; CHECK-BE-NEXT: stxv vs1, 144(r7)
481 ; CHECK-BE-NEXT: stxv vs0, 128(r7)
482 ; CHECK-BE-NEXT: stxv vs3, 176(r7)
483 ; CHECK-BE-NEXT: stxv vs2, 160(r7)
484 ; CHECK-BE-NEXT: bdnz .LBB9_2
485 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup
; Enter the loop only when %lim is strictly positive.
488 %cmp55 = icmp sgt i32 %lim, 0
489 br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup
491 for.body.preheader: ; preds = %entry
492 %wide.trip.count = zext i32 %lim to i64
495 for.cond.cleanup: ; preds = %for.body, %entry
498 for.body: ; preds = %for.body, %for.body.preheader
499 %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; Three separate zeroed accumulators: %0 feeds xvf32gerpp, %1 feeds
; xvf32gerpn and %2 feeds xvf32gernp below.
500 %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
501 %1 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
502 %2 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
; %mul = 6 * i is the index of the first of the six input vectors.
503 %3 = trunc i64 %indvars.iv to i32
504 %mul = mul nsw i32 %3, 6
505 %idxprom = zext i32 %mul to i64
506 %arrayidx = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom
507 %4 = load <16 x i8>, <16 x i8>* %arrayidx, align 16
508 %add2 = or i32 %mul, 1
509 %idxprom3 = zext i32 %add2 to i64
510 %arrayidx4 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom3
511 %5 = load <16 x i8>, <16 x i8>* %arrayidx4, align 16
512 %6 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %4, <16 x i8> %5)
513 %add6 = add nuw nsw i32 %mul, 2
514 %idxprom7 = zext i32 %add6 to i64
515 %arrayidx8 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom7
516 %7 = load <16 x i8>, <16 x i8>* %arrayidx8, align 16
517 %add10 = add nuw nsw i32 %mul, 3
518 %idxprom11 = zext i32 %add10 to i64
519 %arrayidx12 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom11
520 %8 = load <16 x i8>, <16 x i8>* %arrayidx12, align 16
521 %9 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %1, <16 x i8> %7, <16 x i8> %8)
522 %add14 = add nuw nsw i32 %mul, 4
523 %idxprom15 = zext i32 %add14 to i64
524 %arrayidx16 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom15
525 %10 = load <16 x i8>, <16 x i8>* %arrayidx16, align 16
526 %add18 = add nuw nsw i32 %mul, 5
527 %idxprom19 = zext i32 %add18 to i64
528 %arrayidx20 = getelementptr inbounds <16 x i8>, <16 x i8>* %vc, i64 %idxprom19
529 %11 = load <16 x i8>, <16 x i8>* %arrayidx20, align 16
530 %12 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %2, <16 x i8> %10, <16 x i8> %11)
; Destination index is 3 * i truncated to its low 32 bits; the three results
; go to that slot and the two slots after it.
531 %mul21 = mul i64 %indvars.iv, 3
532 %idx.ext = and i64 %mul21, 4294967295
533 %add.ptr = getelementptr inbounds <512 x i1>, <512 x i1>* %res, i64 %idx.ext
534 store <512 x i1> %6, <512 x i1>* %add.ptr, align 64
535 %add.ptr26 = getelementptr inbounds <512 x i1>, <512 x i1>* %add.ptr, i64 1
536 store <512 x i1> %9, <512 x i1>* %add.ptr26, align 64
537 %add.ptr30 = getelementptr inbounds <512 x i1>, <512 x i1>* %add.ptr, i64 2
538 store <512 x i1> %12, <512 x i1>* %add.ptr30, align 64
539 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
540 %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
541 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
544 declare i32 @testRedundantPrimeUnprimeF()
; Accumulator kept live across a function call. A zeroed accumulator is stored
; to %dst[0] and also used as the input of xvf32gerpp; the external function
; testRedundantPrimeUnprimeF is then called, and the xvf32gerpp result is
; stored to %dst[1] afterwards. The assertions expect the result to be moved
; out with xxmfacc acc1 and spilled with two stxvp before the call, reloaded
; with two lxvp after it, and stored directly from vs0-vs3 with no further
; xxmtacc/xxmfacc after the call.
545 define void @testRedundantPrimeUnprime(<512 x i1>* %dst, <16 x i8> %vc) nounwind {
546 ; CHECK-LABEL: testRedundantPrimeUnprime:
547 ; CHECK: # %bb.0: # %entry
548 ; CHECK-NEXT: mflr r0
549 ; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
550 ; CHECK-NEXT: std r0, 16(r1)
551 ; CHECK-NEXT: stdu r1, -112(r1)
552 ; CHECK-NEXT: xxsetaccz acc0
553 ; CHECK-NEXT: xxsetaccz acc1
554 ; CHECK-NEXT: mr r30, r3
555 ; CHECK-NEXT: xxmfacc acc0
556 ; CHECK-NEXT: stxv vs0, 48(r3)
557 ; CHECK-NEXT: stxv vs1, 32(r3)
558 ; CHECK-NEXT: stxv vs2, 16(r3)
559 ; CHECK-NEXT: stxv vs3, 0(r3)
560 ; CHECK-NEXT: xvf32gerpp acc1, v2, v2
561 ; CHECK-NEXT: xxmfacc acc1
562 ; CHECK-NEXT: stxvp vsp4, 64(r1)
563 ; CHECK-NEXT: stxvp vsp6, 32(r1)
564 ; CHECK-NEXT: bl testRedundantPrimeUnprimeF@notoc
565 ; CHECK-NEXT: lxvp vsp0, 64(r1)
566 ; CHECK-NEXT: lxvp vsp2, 32(r1)
567 ; CHECK-NEXT: stxv vs0, 112(r30)
568 ; CHECK-NEXT: stxv vs1, 96(r30)
569 ; CHECK-NEXT: stxv vs2, 80(r30)
570 ; CHECK-NEXT: stxv vs3, 64(r30)
571 ; CHECK-NEXT: addi r1, r1, 112
572 ; CHECK-NEXT: ld r0, 16(r1)
573 ; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
574 ; CHECK-NEXT: mtlr r0
577 ; CHECK-BE-LABEL: testRedundantPrimeUnprime:
578 ; CHECK-BE: # %bb.0: # %entry
579 ; CHECK-BE-NEXT: mflr r0
580 ; CHECK-BE-NEXT: std r0, 16(r1)
581 ; CHECK-BE-NEXT: stdu r1, -192(r1)
582 ; CHECK-BE-NEXT: xxsetaccz acc0
583 ; CHECK-BE-NEXT: xxsetaccz acc1
584 ; CHECK-BE-NEXT: std r30, 176(r1) # 8-byte Folded Spill
585 ; CHECK-BE-NEXT: mr r30, r3
586 ; CHECK-BE-NEXT: xxmfacc acc0
587 ; CHECK-BE-NEXT: stxv vs1, 16(r3)
588 ; CHECK-BE-NEXT: stxv vs0, 0(r3)
589 ; CHECK-BE-NEXT: stxv vs3, 48(r3)
590 ; CHECK-BE-NEXT: stxv vs2, 32(r3)
591 ; CHECK-BE-NEXT: xvf32gerpp acc1, v2, v2
592 ; CHECK-BE-NEXT: xxmfacc acc1
593 ; CHECK-BE-NEXT: stxvp vsp4, 112(r1)
594 ; CHECK-BE-NEXT: stxvp vsp6, 144(r1)
595 ; CHECK-BE-NEXT: bl testRedundantPrimeUnprimeF
597 ; CHECK-BE-NEXT: lxvp vsp0, 112(r1)
598 ; CHECK-BE-NEXT: lxvp vsp2, 144(r1)
599 ; CHECK-BE-NEXT: stxv vs3, 112(r30)
600 ; CHECK-BE-NEXT: stxv vs2, 96(r30)
601 ; CHECK-BE-NEXT: stxv vs1, 80(r30)
602 ; CHECK-BE-NEXT: stxv vs0, 64(r30)
603 ; CHECK-BE-NEXT: ld r30, 176(r1) # 8-byte Folded Reload
604 ; CHECK-BE-NEXT: addi r1, r1, 192
605 ; CHECK-BE-NEXT: ld r0, 16(r1)
606 ; CHECK-BE-NEXT: mtlr r0
; %0 is both stored to %dst and consumed by xvf32gerpp.
609 %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz()
610 store <512 x i1> %0, <512 x i1>* %dst, align 64
611 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc)
; The bitcast below converts the callee to its own type (i32 ()*), so it does
; not change the call target. %1 must survive this call.
612 %call = tail call signext i32 bitcast (i32 ()* @testRedundantPrimeUnprimeF to i32 ()*)()
613 %add.ptr1 = getelementptr inbounds <512 x i1>, <512 x i1>* %dst, i64 1
614 store <512 x i1> %1, <512 x i1>* %add.ptr1, align 64
618 declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*)
619 declare void @llvm.ppc.vsx.stxvp(<256 x i1>, i8*)
; Loads an accumulator from %vqp, loads a vector pair with the lxvp intrinsic
; from %vpp plus 8 bytes, combines them with pmxvf64gernn (both immediate
; operands 0) and stores the result to %resp. The assertions expect the 8-byte
; offset to be materialized in a register (li r3, 8) and the pair to be loaded
; with the indexed form lxvpx.
621 ; Function Attrs: nofree nounwind
622 define void @test_ldst_1(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) {
623 ; CHECK-LABEL: test_ldst_1:
624 ; CHECK: # %bb.0: # %entry
625 ; CHECK-NEXT: lxv vs1, 32(r3)
626 ; CHECK-NEXT: lxv vs0, 48(r3)
627 ; CHECK-NEXT: lxv vs3, 0(r3)
628 ; CHECK-NEXT: lxv vs2, 16(r3)
629 ; CHECK-NEXT: li r3, 8
630 ; CHECK-NEXT: lxvpx vsp36, r4, r3
631 ; CHECK-NEXT: xxmtacc acc0
632 ; CHECK-NEXT: pmxvf64gernn acc0, vsp36, v2, 0, 0
633 ; CHECK-NEXT: xxmfacc acc0
634 ; CHECK-NEXT: stxv vs0, 48(r7)
635 ; CHECK-NEXT: stxv vs1, 32(r7)
636 ; CHECK-NEXT: stxv vs2, 16(r7)
637 ; CHECK-NEXT: stxv vs3, 0(r7)
640 ; CHECK-BE-LABEL: test_ldst_1:
641 ; CHECK-BE: # %bb.0: # %entry
642 ; CHECK-BE-NEXT: lxv vs1, 16(r3)
643 ; CHECK-BE-NEXT: lxv vs0, 0(r3)
644 ; CHECK-BE-NEXT: lxv vs3, 48(r3)
645 ; CHECK-BE-NEXT: lxv vs2, 32(r3)
646 ; CHECK-BE-NEXT: li r3, 8
647 ; CHECK-BE-NEXT: lxvpx vsp36, r4, r3
648 ; CHECK-BE-NEXT: xxmtacc acc0
649 ; CHECK-BE-NEXT: pmxvf64gernn acc0, vsp36, v2, 0, 0
650 ; CHECK-BE-NEXT: xxmfacc acc0
651 ; CHECK-BE-NEXT: stxv vs1, 16(r7)
652 ; CHECK-BE-NEXT: stxv vs0, 0(r7)
653 ; CHECK-BE-NEXT: stxv vs3, 48(r7)
654 ; CHECK-BE-NEXT: stxv vs2, 32(r7)
657 %0 = bitcast i8* %vqp to <512 x i1>*
658 %1 = load <512 x i1>, <512 x i1>* %0, align 64
; Address of the vector pair: %vpp advanced by 8 bytes.
659 %2 = bitcast <256 x i1>* %vpp to i8*
660 %3 = getelementptr i8, i8* %2, i64 8
661 %4 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %3)
662 %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0)
663 %6 = bitcast i8* %resp to <512 x i1>*
664 store <512 x i1> %5, <512 x i1>* %6, align 64
; Loads an accumulator from %vqp, loads a vector pair with the lxvp intrinsic
; directly from %vpp (no offset), combines them with xvf64gernp and stores the
; result to %resp. The assertions expect the pair to be loaded with the
; displacement form lxvp vsp36, 0(r4).
668 ; Function Attrs: nofree nounwind
669 define void @test_ldst_2(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) {
670 ; CHECK-LABEL: test_ldst_2:
671 ; CHECK: # %bb.0: # %entry
672 ; CHECK-NEXT: lxv vs1, 32(r3)
673 ; CHECK-NEXT: lxv vs0, 48(r3)
674 ; CHECK-NEXT: lxv vs3, 0(r3)
675 ; CHECK-NEXT: lxv vs2, 16(r3)
676 ; CHECK-NEXT: lxvp vsp36, 0(r4)
677 ; CHECK-NEXT: xxmtacc acc0
678 ; CHECK-NEXT: xvf64gernp acc0, vsp36, v2
679 ; CHECK-NEXT: xxmfacc acc0
680 ; CHECK-NEXT: stxv vs0, 48(r7)
681 ; CHECK-NEXT: stxv vs1, 32(r7)
682 ; CHECK-NEXT: stxv vs2, 16(r7)
683 ; CHECK-NEXT: stxv vs3, 0(r7)
686 ; CHECK-BE-LABEL: test_ldst_2:
687 ; CHECK-BE: # %bb.0: # %entry
688 ; CHECK-BE-NEXT: lxv vs1, 16(r3)
689 ; CHECK-BE-NEXT: lxv vs0, 0(r3)
690 ; CHECK-BE-NEXT: lxv vs3, 48(r3)
691 ; CHECK-BE-NEXT: lxv vs2, 32(r3)
692 ; CHECK-BE-NEXT: lxvp vsp36, 0(r4)
693 ; CHECK-BE-NEXT: xxmtacc acc0
694 ; CHECK-BE-NEXT: xvf64gernp acc0, vsp36, v2
695 ; CHECK-BE-NEXT: xxmfacc acc0
696 ; CHECK-BE-NEXT: stxv vs1, 16(r7)
697 ; CHECK-BE-NEXT: stxv vs0, 0(r7)
698 ; CHECK-BE-NEXT: stxv vs3, 48(r7)
699 ; CHECK-BE-NEXT: stxv vs2, 32(r7)
702 %0 = bitcast i8* %vqp to <512 x i1>*
703 %1 = load <512 x i1>, <512 x i1>* %0, align 64
704 %2 = bitcast <256 x i1>* %vpp to i8*
705 %3 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %2)
706 %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc)
707 %5 = bitcast i8* %resp to <512 x i1>*
708 store <512 x i1> %4, <512 x i1>* %5, align 64
; Same operation sequence as test_ldst_2 (accumulator load from %vqp, lxvp
; from %vpp, xvf64gernp, store to %resp) but with an extra i64 %offs parameter
; in second position. %offs is not referenced by the instructions visible
; below; in the assertions the pointer registers become r5 (%vpp) and r9
; (%resp) instead of r4 and r7.
712 ; Function Attrs: nofree nounwind
713 define void @test_ldst_3(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) {
714 ; CHECK-LABEL: test_ldst_3:
715 ; CHECK: # %bb.0: # %entry
716 ; CHECK-NEXT: lxv vs1, 32(r3)
717 ; CHECK-NEXT: lxv vs0, 48(r3)
718 ; CHECK-NEXT: lxv vs3, 0(r3)
719 ; CHECK-NEXT: lxv vs2, 16(r3)
720 ; CHECK-NEXT: lxvp vsp36, 0(r5)
721 ; CHECK-NEXT: xxmtacc acc0
722 ; CHECK-NEXT: xvf64gernp acc0, vsp36, v2
723 ; CHECK-NEXT: xxmfacc acc0
724 ; CHECK-NEXT: stxv vs0, 48(r9)
725 ; CHECK-NEXT: stxv vs1, 32(r9)
726 ; CHECK-NEXT: stxv vs2, 16(r9)
727 ; CHECK-NEXT: stxv vs3, 0(r9)
730 ; CHECK-BE-LABEL: test_ldst_3:
731 ; CHECK-BE: # %bb.0: # %entry
732 ; CHECK-BE-NEXT: lxv vs1, 16(r3)
733 ; CHECK-BE-NEXT: lxv vs0, 0(r3)
734 ; CHECK-BE-NEXT: lxv vs3, 48(r3)
735 ; CHECK-BE-NEXT: lxv vs2, 32(r3)
736 ; CHECK-BE-NEXT: lxvp vsp36, 0(r5)
737 ; CHECK-BE-NEXT: xxmtacc acc0
738 ; CHECK-BE-NEXT: xvf64gernp acc0, vsp36, v2
739 ; CHECK-BE-NEXT: xxmfacc acc0
740 ; CHECK-BE-NEXT: stxv vs1, 16(r9)
741 ; CHECK-BE-NEXT: stxv vs0, 0(r9)
742 ; CHECK-BE-NEXT: stxv vs3, 48(r9)
743 ; CHECK-BE-NEXT: stxv vs2, 32(r9)
746 %0 = bitcast i8* %vqp to <512 x i1>*
747 %1 = load <512 x i1>, <512 x i1>* %0, align 64
748 %2 = bitcast <256 x i1>* %vpp to i8*
749 %3 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %2)
750 %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc)
751 %5 = bitcast i8* %resp to <512 x i1>*
752 store <512 x i1> %4, <512 x i1>* %5, align 64
756 declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32)
757 declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>)