1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2 ; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
3 ; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
4 ; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
6 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
7 target triple = "aarch64-none-unknown-elf"
9 define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
10 ; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
11 ; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
12 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
13 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
14 ; CHECK-SVE2: [[VECTOR_BODY]]:
15 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
16 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
17 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
18 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
19 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
20 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
21 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
22 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 0)
23 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 0)
24 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
25 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
26 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
27 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
28 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
29 ; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
31 ; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
32 ; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
33 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
34 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
35 ; CHECK-SVE: [[VECTOR_BODY]]:
36 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
37 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
38 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
39 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
40 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
41 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
42 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
43 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
44 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
45 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
46 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
47 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
48 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
49 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
50 ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
51 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
52 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
53 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
54 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
55 ; CHECK-SVE-NEXT: ret i32 [[TMP11]]
57 ; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0(
58 ; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
59 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
60 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
61 ; CHECK-NOSVE: [[VECTOR_BODY]]:
62 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
63 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
64 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
65 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
66 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
67 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
68 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
69 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
70 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
71 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
72 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
73 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
74 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
75 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
76 ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
77 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
78 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
79 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
80 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
81 ; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
86 vector.body: ; preds = %vector.body, %entry
87 %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
88 %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
89 %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
90 %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
91 %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
92 %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
93 %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
94 %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
95 %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
96 %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
97 %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
98 %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
99 %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
100 %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
101 %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
102 %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
103 br i1 true, label %middle.block, label %vector.body
105 middle.block: ; preds = %vector.body
106 %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
110 define i32 @cdotp_i8_rot90(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
111 ; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90(
112 ; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
113 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
114 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
115 ; CHECK-SVE2: [[VECTOR_BODY]]:
116 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
117 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
118 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
119 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
120 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
121 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
122 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
123 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 90)
124 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 90)
125 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
126 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
127 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
128 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
129 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
130 ; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
132 ; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90(
133 ; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
134 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
135 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
136 ; CHECK-SVE: [[VECTOR_BODY]]:
137 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
138 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
139 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
140 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
141 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
142 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
143 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
144 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
145 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
146 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
147 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
148 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
149 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
150 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
151 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
152 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
153 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
154 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
155 ; CHECK-SVE-NEXT: ret i32 [[TMP11]]
157 ; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90(
158 ; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
159 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
160 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
161 ; CHECK-NOSVE: [[VECTOR_BODY]]:
162 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
163 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
164 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
165 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
166 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
167 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
168 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
169 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
170 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
171 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
172 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
173 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
174 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
175 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
176 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
177 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
178 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
179 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
180 ; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
183 br label %vector.body
185 vector.body: ; preds = %vector.body, %entry
186 %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
187 %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
188 %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
189 %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
190 %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
191 %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
192 %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
193 %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
194 %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
195 %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
196 %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
197 %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
198 %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
199 %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
200 %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
201 br i1 true, label %middle.block, label %vector.body
203 middle.block: ; preds = %vector.body
204 %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
208 define i32 @cdotp_i8_rot180(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
209 ; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180(
210 ; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
211 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
212 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
213 ; CHECK-SVE2: [[VECTOR_BODY]]:
214 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
215 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
216 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
217 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
218 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
219 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
220 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
221 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 180)
222 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 180)
223 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
224 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
225 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
226 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
227 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
228 ; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
230 ; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180(
231 ; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
232 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
233 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
234 ; CHECK-SVE: [[VECTOR_BODY]]:
235 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
236 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
237 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
238 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
239 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
240 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
241 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
242 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
243 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
244 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
245 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
246 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
247 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
248 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
249 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
250 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
251 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
252 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
253 ; CHECK-SVE-NEXT: ret i32 [[TMP11]]
255 ; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180(
256 ; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
257 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
258 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
259 ; CHECK-NOSVE: [[VECTOR_BODY]]:
260 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
261 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
262 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
263 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
264 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
265 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
266 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
267 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
268 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
269 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
270 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
271 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
272 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
273 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
274 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
275 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
276 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
277 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
278 ; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
281 br label %vector.body
283 vector.body: ; preds = %vector.body, %entry
284 %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
285 %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
286 %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
287 %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
288 %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
289 %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
290 %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
291 %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
292 %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
293 %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
294 %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
295 %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
296 %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
297 %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
298 %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
299 br i1 true, label %middle.block, label %vector.body
301 middle.block: ; preds = %vector.body
302 %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
306 define i32 @cdotp_i8_rot270(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
307 ; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270(
308 ; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
309 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
310 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
311 ; CHECK-SVE2: [[VECTOR_BODY]]:
312 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
313 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
314 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
315 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
316 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
317 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
318 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
319 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 270)
320 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 270)
321 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
322 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
323 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
324 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
325 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
326 ; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
328 ; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270(
329 ; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
330 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
331 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
332 ; CHECK-SVE: [[VECTOR_BODY]]:
333 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
334 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
335 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
336 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
337 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
338 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
339 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
340 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
341 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
342 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
343 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
344 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
345 ; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
346 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
347 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
348 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
349 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
350 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
351 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
352 ; CHECK-SVE-NEXT: ret i32 [[TMP11]]
354 ; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270(
355 ; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
356 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
357 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
358 ; CHECK-NOSVE: [[VECTOR_BODY]]:
359 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
360 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
361 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
362 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
363 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
364 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
365 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
366 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
367 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
368 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
369 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
370 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
371 ; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
372 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
373 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
374 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
375 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
376 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
377 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
378 ; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
381 br label %vector.body
383 vector.body: ; preds = %vector.body, %entry
384 %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
385 %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
386 %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
387 %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
388 %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
389 %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
390 %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
391 %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
392 %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
393 %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
394 %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
395 %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
396 %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
397 %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
398 %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
399 %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
400 br i1 true, label %middle.block, label %vector.body
402 middle.block: ; preds = %vector.body
403 %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
407 define i64 @cdotp_i16_rot0(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
408 ; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0(
409 ; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
410 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
411 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
412 ; CHECK-SVE2: [[VECTOR_BODY]]:
413 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
414 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
415 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
416 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
417 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
418 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
419 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
420 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 0)
421 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 0)
422 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
423 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
424 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
425 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
426 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
427 ; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
429 ; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0(
430 ; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
431 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
432 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
433 ; CHECK-SVE: [[VECTOR_BODY]]:
434 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
435 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
436 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
437 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
438 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
439 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
440 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
441 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
442 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
443 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
444 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
445 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
446 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
447 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
448 ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
449 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
450 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
451 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
452 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
453 ; CHECK-SVE-NEXT: ret i64 [[TMP11]]
455 ; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0(
456 ; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
457 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
458 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
459 ; CHECK-NOSVE: [[VECTOR_BODY]]:
460 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
461 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
462 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
463 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
464 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
465 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
466 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
467 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
468 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
469 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
470 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
471 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
472 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
473 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
474 ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
475 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
476 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
477 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
478 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
479 ; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
482 br label %vector.body
484 vector.body: ; preds = %vector.body, %entry
485 %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
486 %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
487 %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
488 %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
489 %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
490 %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
491 %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
492 %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
493 %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
494 %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
495 %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
496 %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
497 %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
498 %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
499 %imag.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %imag.mul
500 %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul.neg)
501 br i1 true, label %middle.block, label %vector.body
503 middle.block: ; preds = %vector.body
504 %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
508 define i64 @cdotp_i16_rot90(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
509 ; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90(
510 ; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
511 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
512 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
513 ; CHECK-SVE2: [[VECTOR_BODY]]:
514 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
515 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
516 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
517 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
518 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
519 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
520 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
521 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 90)
522 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 90)
523 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
524 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
525 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
526 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
527 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
528 ; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
530 ; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90(
531 ; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
532 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
533 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
534 ; CHECK-SVE: [[VECTOR_BODY]]:
535 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
536 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
537 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
538 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
539 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
540 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
541 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
542 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
543 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
544 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
545 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
546 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
547 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
548 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
549 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
550 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
551 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
552 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
553 ; CHECK-SVE-NEXT: ret i64 [[TMP11]]
555 ; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90(
556 ; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
557 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
558 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
559 ; CHECK-NOSVE: [[VECTOR_BODY]]:
560 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
561 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
562 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
563 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
564 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
565 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
566 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
567 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
568 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
569 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
570 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
571 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
572 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
573 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
574 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
575 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
576 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
577 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
578 ; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
581 br label %vector.body
583 vector.body: ; preds = %vector.body, %entry
584 %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
585 %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
586 %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
587 %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
588 %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
589 %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
590 %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
591 %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
592 %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
593 %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
594 %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
595 %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
596 %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
597 %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
598 %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
599 br i1 true, label %middle.block, label %vector.body
601 middle.block: ; preds = %vector.body
602 %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
606 define i64 @cdotp_i16_rot180(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
607 ; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180(
608 ; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
609 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
610 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
611 ; CHECK-SVE2: [[VECTOR_BODY]]:
612 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
613 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
614 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
615 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
616 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
617 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
618 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
619 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 180)
620 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 180)
621 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
622 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
623 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
624 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
625 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
626 ; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
628 ; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180(
629 ; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
630 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
631 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
632 ; CHECK-SVE: [[VECTOR_BODY]]:
633 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
634 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
635 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
636 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
637 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
638 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
639 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
640 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
641 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
642 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
643 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
644 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
645 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
646 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
647 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
648 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
649 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
650 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
651 ; CHECK-SVE-NEXT: ret i64 [[TMP11]]
653 ; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180(
654 ; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
655 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
656 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
657 ; CHECK-NOSVE: [[VECTOR_BODY]]:
658 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
659 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
660 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
661 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
662 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
663 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
664 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
665 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
666 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
667 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
668 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
669 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
670 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
671 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
672 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
673 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
674 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
675 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
676 ; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
679 br label %vector.body
681 vector.body: ; preds = %vector.body, %entry
682 %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
683 %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
684 %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
685 %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
686 %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
687 %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
688 %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
689 %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
690 %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
691 %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
692 %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
693 %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
694 %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
695 %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
696 %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
697 br i1 true, label %middle.block, label %vector.body
699 middle.block: ; preds = %vector.body
700 %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
704 define i64 @cdotp_i16_rot270(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
705 ; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270(
706 ; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
707 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
708 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
709 ; CHECK-SVE2: [[VECTOR_BODY]]:
710 ; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
711 ; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
712 ; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
713 ; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
714 ; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
715 ; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
716 ; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
717 ; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 270)
718 ; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 270)
719 ; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
720 ; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
721 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
722 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
723 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
724 ; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
726 ; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270(
727 ; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
728 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
729 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
730 ; CHECK-SVE: [[VECTOR_BODY]]:
731 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
732 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
733 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
734 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
735 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
736 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
737 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
738 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
739 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
740 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
741 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
742 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
743 ; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
744 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
745 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
746 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
747 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
748 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
749 ; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
750 ; CHECK-SVE-NEXT: ret i64 [[TMP11]]
752 ; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270(
753 ; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
754 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
755 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
756 ; CHECK-NOSVE: [[VECTOR_BODY]]:
757 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
758 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
759 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
760 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
761 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
762 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
763 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
764 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
765 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
766 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
767 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
768 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
769 ; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
770 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
771 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
772 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
773 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
774 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
775 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
776 ; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
779 br label %vector.body
781 vector.body: ; preds = %vector.body, %entry
782 %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
783 %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
784 %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
785 %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
786 %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
787 %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
788 %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
789 %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
790 %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
791 %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
792 %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
793 %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
794 %real.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %real.mul
795 %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul.neg)
796 %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
797 %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
798 br i1 true, label %middle.block, label %vector.body
800 middle.block: ; preds = %vector.body
801 %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
806 define i32 @not_cdotp(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
807 ; CHECK-SVE2-LABEL: define i32 @not_cdotp(
808 ; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
809 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
810 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
811 ; CHECK-SVE2: [[VECTOR_BODY]]:
812 ; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
813 ; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
814 ; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
815 ; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
816 ; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
817 ; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
818 ; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
819 ; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
820 ; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
821 ; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
822 ; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
823 ; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
824 ; CHECK-SVE2-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
825 ; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
826 ; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
827 ; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
828 ; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
829 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
830 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
831 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
832 ; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
834 ; CHECK-SVE-LABEL: define i32 @not_cdotp(
835 ; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
836 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
837 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
838 ; CHECK-SVE: [[VECTOR_BODY]]:
839 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
840 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
841 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
842 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
843 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
844 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
845 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
846 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
847 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
848 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
849 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
850 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
851 ; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
852 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
853 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
854 ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
855 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
856 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
857 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
858 ; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
859 ; CHECK-SVE-NEXT: ret i32 [[TMP0]]
861 ; CHECK-NOSVE-LABEL: define i32 @not_cdotp(
862 ; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
863 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
864 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
865 ; CHECK-NOSVE: [[VECTOR_BODY]]:
866 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
867 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
868 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
869 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
870 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
871 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
872 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
873 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
874 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
875 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
876 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
877 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
878 ; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
879 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
880 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
881 ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
882 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
883 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
884 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
885 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
886 ; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
889 br label %vector.body
891 vector.body: ; preds = %vector.body, %entry
892 %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
893 %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
894 %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
895 %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
896 %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
897 %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
898 %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
899 %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
900 %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
901 %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
902 %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
903 %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
904 %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
905 %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
906 %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
907 %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
908 %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
909 br i1 true, label %middle.block, label %vector.body
911 middle.block: ; preds = %vector.body
912 %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
916 define i16 @invalid_type(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
917 ; CHECK-SVE2-LABEL: define i16 @invalid_type(
918 ; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
919 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
920 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
921 ; CHECK-SVE2: [[VECTOR_BODY]]:
922 ; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
923 ; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
924 ; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
925 ; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
926 ; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
927 ; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
928 ; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
929 ; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
930 ; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
931 ; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
932 ; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
933 ; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
934 ; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
935 ; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
936 ; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
937 ; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
938 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
939 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
940 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
941 ; CHECK-SVE2-NEXT: ret i16 [[TMP0]]
943 ; CHECK-SVE-LABEL: define i16 @invalid_type(
944 ; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
945 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
946 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
947 ; CHECK-SVE: [[VECTOR_BODY]]:
948 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
949 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
950 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
951 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
952 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
953 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
954 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
955 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
956 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
957 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
958 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
959 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
960 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
961 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
962 ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
963 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
964 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
965 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
966 ; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
967 ; CHECK-SVE-NEXT: ret i16 [[TMP0]]
969 ; CHECK-NOSVE-LABEL: define i16 @invalid_type(
970 ; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
971 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
972 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
973 ; CHECK-NOSVE: [[VECTOR_BODY]]:
974 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
975 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
976 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
977 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
978 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
979 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
980 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
981 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
982 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
983 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
984 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
985 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
986 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
987 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
988 ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
989 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
990 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
991 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
992 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
993 ; CHECK-NOSVE-NEXT: ret i16 [[TMP0]]
996 br label %vector.body
998 vector.body: ; preds = %vector.body, %entry
999 %vec.phi = phi <vscale x 8 x i16> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
1000 %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
1001 %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
1002 %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
1003 %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
1004 %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
1005 %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
1006 %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
1007 %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
1008 %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
1009 %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
1010 %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
1011 %real.mul.reduced = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %vec.phi, <vscale x 16 x i32> %real.mul)
1012 %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
1013 %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
1014 %partial.reduce.sub = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
1015 br i1 true, label %middle.block, label %vector.body
1017 middle.block: ; preds = %vector.body
1018 %0 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %partial.reduce.sub)
1022 define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
1023 ; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
1024 ; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
1025 ; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
1026 ; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
1027 ; CHECK-SVE2: [[VECTOR_BODY]]:
1028 ; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
1029 ; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
1030 ; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
1031 ; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
1032 ; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
1033 ; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
1034 ; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
1035 ; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
1036 ; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
1037 ; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
1038 ; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
1039 ; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
1040 ; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
1041 ; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
1042 ; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
1043 ; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
1044 ; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
1045 ; CHECK-SVE2: [[MIDDLE_BLOCK]]:
1046 ; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
1047 ; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
1049 ; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
1050 ; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
1051 ; CHECK-SVE-NEXT: [[ENTRY:.*]]:
1052 ; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
1053 ; CHECK-SVE: [[VECTOR_BODY]]:
1054 ; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
1055 ; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
1056 ; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
1057 ; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
1058 ; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
1059 ; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
1060 ; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
1061 ; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
1062 ; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
1063 ; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
1064 ; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
1065 ; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
1066 ; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
1067 ; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
1068 ; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
1069 ; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
1070 ; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
1071 ; CHECK-SVE: [[MIDDLE_BLOCK]]:
1072 ; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
1073 ; CHECK-SVE-NEXT: ret i32 [[TMP0]]
1075 ; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
1076 ; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) {
1077 ; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
1078 ; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
1079 ; CHECK-NOSVE: [[VECTOR_BODY]]:
1080 ; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
1081 ; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
1082 ; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
1083 ; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
1084 ; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
1085 ; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
1086 ; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
1087 ; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
1088 ; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
1089 ; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
1090 ; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
1091 ; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
1092 ; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
1093 ; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
1094 ; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
1095 ; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
1096 ; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
1097 ; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
1098 ; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
1099 ; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
1102 br label %vector.body
1104 vector.body: ; preds = %vector.body, %entry
1105 %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
1106 %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
1107 %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
1108 %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
1109 %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
1110 %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
1111 %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
1112 %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
1113 %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
1114 %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
1115 %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
1116 %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
1117 %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
1118 %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
1119 %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
1120 %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
1121 br i1 true, label %middle.block, label %vector.body
1123 middle.block: ; preds = %vector.body
1124 %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
1128 declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
1129 declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
1130 declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 16 x i32>)
1132 declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
1133 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
1135 declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
1136 declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)