; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
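
; This file tests lowering of the SME2 multi-vector half/bfloat dot-product
; intrinsics (the llvm.aarch64.sme.fdot.* base, .single and .lane variants)
; to FDOT/BFDOT instructions accumulating into 32-bit ZA slices.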

target triple = "aarch64-linux-gnu"

; == Multi, multi (16-bit float) ==

define void @fdot_multi_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3) #0 {
; CHECK-LABEL: fdot_multi_za32_f16_vg1x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z4.d
; CHECK-NEXT:    mov z7.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z3.d
; CHECK-NEXT:    mov z6.d, z1.d
; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3)
  ret void
}
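
; In each test below, the second call adds 7 to %slice; the checks expect
; that add to be folded into the slice-offset immediate of the second
; (bf)dot instruction (za.s[w8, 7, ...]), assumed here to be the top of
; the instruction's immediate offset range.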

define void @fdot_multi_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
; CHECK-LABEL: fdot_multi_za32_f16_vg1x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z7.d
; CHECK-NEXT:    mov z31.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mov z25.d, z6.d
; CHECK-NEXT:    mov z30.d, z3.d
; CHECK-NEXT:    mov z24.d, z5.d
; CHECK-NEXT:    mov z29.d, z2.d
; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT:    mov z28.d, z1.d
; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT:    ret
                                        <vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7) #0 {
  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                      <vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                      <vscale x 8 x half> %zn4, <vscale x 8 x half> %zn5, <vscale x 8 x half> %zn6, <vscale x 8 x half> %zn7)
  ret void
}

; == Multi, multi (16-bit bfloat) ==

define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3) #0 {
; CHECK-LABEL: bfdot_multi_za32_bf16_vg1x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z4.d
; CHECK-NEXT:    mov z7.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z3.d
; CHECK-NEXT:    mov z6.d, z1.d
; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3)
  ret void
}

define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z7.d
; CHECK-NEXT:    mov z31.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mov z25.d, z6.d
; CHECK-NEXT:    mov z30.d, z3.d
; CHECK-NEXT:    mov z24.d, z5.d
; CHECK-NEXT:    mov z29.d, z2.d
; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
; CHECK-NEXT:    mov z28.d, z1.d
; CHECK-NEXT:    bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT:    bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
; CHECK-NEXT:    ret
                                        <vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7) #0 {
  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                       <vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                       <vscale x 8 x bfloat> %zn4, <vscale x 8 x bfloat> %zn5, <vscale x 8 x bfloat> %zn6, <vscale x 8 x bfloat> %zn7)
  ret void
}

; == Multi, single (16-bit float) ==
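; In the "multi, single" forms the second operand is a single vector
; (the unbracketed z3.h / z5.h below) applied against a multi-vector group.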

define void @fdot_single_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
; CHECK-LABEL: fdot_single_za32_f16_vg1x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  ret void
}

define void @fdot_single_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) #0 {
; CHECK-LABEL: fdot_single_za32_f16_vg1x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  ret void
}

; == Multi, single (16-bit bfloat) ==

define void @bfdot_single_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
; CHECK-LABEL: bfdot_single_za32_bf16_vg1x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  ret void
}

define void @bfdot_single_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) #0 {
; CHECK-LABEL: bfdot_single_za32_bf16_vg1x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
; CHECK-NEXT:    bfdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT:    bfdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  ret void
}

; == Multi, indexed (16-bit float) ==
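; The indexed forms take an immediate lane index as the trailing i32
; argument (3 here), which appears as z3.h[3] / z5.h[3] in the checks.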

define void @fdot_lane_za32_f16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) #0 {
; CHECK-LABEL: fdot_lane_za32_f16_vg1x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    fdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT:    fdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, i32 3)
  ret void
}

define void @fdot_lane_za32_f16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) #0 {
; CHECK-LABEL: fdot_lane_za32_f16_vg1x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    fdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
; CHECK-NEXT:    fdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                           <vscale x 8 x half> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 %slice2, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                           <vscale x 8 x half> %zn4, i32 3)
  ret void
}

; == Multi, indexed (16-bit bfloat) ==

define void @bfdot_lane_za32_bf16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) #0 {
; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    bfdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT:    bfdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, i32 3)
  ret void
}

define void @bfdot_lane_za32_bf16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) #0 {
; CHECK-LABEL: bfdot_lane_za32_bf16_vg1x4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    bfdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
; CHECK-NEXT:    bfdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                            <vscale x 8 x bfloat> %zn4, i32 3)
  %slice2 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 %slice2, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                            <vscale x 8 x bfloat> %zn4, i32 3)
  ret void
}

attributes #0 = { nounwind "target-features"="+sme2" }

; == Multi, multi (16-bit float)

declare void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
                                                       <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)

; == Multi, multi (16-bit bfloat)

declare void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
                                                        <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)

; == Multi, single (16-bit float)

declare void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)

; == Multi, single (16-bit bfloat)

declare void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)

; == Multi, indexed (16-bit float)

declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)

; == Multi, indexed (16-bit bfloat)

declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
declare void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)