; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -mattr=+sme2 -mattr=+faminmax -force-streaming -verify-machineinstrs < %s | FileCheck %s
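
; This file checks lowering of the @llvm.aarch64.sme.famax.* and
; @llvm.aarch64.sme.famin.* multi-vector intrinsics to the SME2 FAMAX/FAMIN
; instructions. The leading %unused argument in each test keeps the live
; operands out of z0, so lowering has to copy them into correctly aligned
; consecutive register tuples (visible as the mov instructions in the checks).

; FAMAX (Multi, x2)
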
define { <vscale x 8 x half>, <vscale x 8 x half> } @multi_vec_max_multi_x2_f16(<vscale x 8 x half> %unused, <vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) {
; CHECK-LABEL: multi_vec_max_multi_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    famax { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.famax.x2.nxv8f16(<vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_max_multi_x2_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
; CHECK-LABEL: multi_vec_max_multi_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    famax { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.famax.x2.nxv4f32(<vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @multi_vec_max_multi_x2_f64(<vscale x 2 x double> %unused, <vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
; CHECK-LABEL: multi_vec_max_multi_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    famax { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.famax.x2.nxv2f64(<vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

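; FAMAX (Multi, x4)
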
define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @multi_vec_max_multi_x4_f16(<vscale x 8 x half> %unused, <vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zdn3, <vscale x 8 x half> %zdn4, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4) {
; CHECK-LABEL: multi_vec_max_multi_x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    famax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.famax.x4.nxv8f16(<vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zdn3, <vscale x 8 x half> %zdn4, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_max_multi_x4_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zdn3, <vscale x 4 x float> %zdn4, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
; CHECK-LABEL: multi_vec_max_multi_x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    famax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.famax.x4.nxv4f32(<vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zdn3, <vscale x 4 x float> %zdn4, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @multi_vec_max_multi_x4_f64(<vscale x 2 x double> %unused, <vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zdn3, <vscale x 2 x double> %zdn4, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
; CHECK-LABEL: multi_vec_max_multi_x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    famax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }
              @llvm.aarch64.sme.famax.x4.nxv2f64(<vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zdn3, <vscale x 2 x double> %zdn4,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

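; FAMIN (Multi, x2)
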
define { <vscale x 8 x half>, <vscale x 8 x half> } @multi_vec_min_multi_x2_f16(<vscale x 8 x half> %unused, <vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2) {
; CHECK-LABEL: multi_vec_min_multi_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    famin { z4.h, z5.h }, { z4.h, z5.h }, { z6.h, z7.h }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.famin.x2.nxv8f16(<vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_min_multi_x2_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
; CHECK-LABEL: multi_vec_min_multi_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    famin { z4.s, z5.s }, { z4.s, z5.s }, { z6.s, z7.s }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.famin.x2.nxv4f32(<vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @multi_vec_min_multi_x2_f64(<vscale x 2 x double> %unused, <vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
; CHECK-LABEL: multi_vec_min_multi_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    famin { z4.d, z5.d }, { z4.d, z5.d }, { z6.d, z7.d }
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.famin.x2.nxv2f64(<vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

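; FAMIN (Multi, x4)
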
define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @multi_vec_min_multi_x4_f16(<vscale x 8 x half> %unused, <vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zdn3, <vscale x 8 x half> %zdn4, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4) {
; CHECK-LABEL: multi_vec_min_multi_x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1h { z31.h }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    famin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> }
              @llvm.aarch64.sme.famin.x4.nxv8f16(<vscale x 8 x half> %zdn1, <vscale x 8 x half> %zdn2, <vscale x 8 x half> %zdn3, <vscale x 8 x half> %zdn4,
                                                 <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3, <vscale x 8 x half> %zm4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @multi_vec_min_multi_x4_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zdn3, <vscale x 4 x float> %zdn4, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
; CHECK-LABEL: multi_vec_min_multi_x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1w { z31.s }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    famin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> }
              @llvm.aarch64.sme.famin.x4.nxv4f32(<vscale x 4 x float> %zdn1, <vscale x 4 x float> %zdn2, <vscale x 4 x float> %zdn3, <vscale x 4 x float> %zdn4,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @multi_vec_min_multi_x4_f64(<vscale x 2 x double> %unused, <vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zdn3, <vscale x 2 x double> %zdn4, <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
; CHECK-LABEL: multi_vec_min_multi_x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z30.d, z7.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z29.d, z6.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z28.d, z5.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    ld1d { z31.d }, p0/z, [x0]
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    famin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d }
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }
              @llvm.aarch64.sme.famin.x4.nxv2f64(<vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2, <vscale x 2 x double> %zdn3, <vscale x 2 x double> %zdn4,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}