; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.

target triple = "aarch64-unknown-linux-gnu"
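
; In outline, every function below hand-builds the "high half of a widening
; multiply" idiom: both operands are sign-extended (smulh) or zero-extended
; (umulh) to twice their element width, multiplied, logically shifted right by
; the original element width, and truncated back down. For the legal
; fixed-length types this pattern is expected to select to the SVE smulh/umulh
; instructions shown in the CHECK lines.
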
; Don't use SVE for 64-bit vectors.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %insert = insertelement <8 x i16> undef, i16 8, i64 0
  %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
  %1 = sext <8 x i8> %op1 to <8 x i16>
  %2 = sext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, splat (i16 8)
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <16 x i8> %op1 to <16 x i16>
  %2 = sext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, splat (i16 8)
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

define void @smulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %1 = sext <32 x i8> %op1 to <32 x i16>
  %2 = sext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, splat (i16 8)
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, ptr %a
  ret void
}

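; When the SVE register is only 256 bits wide (VBITS_GE_256), operations wider
; than one register, such as the v64i8 case below, are expected to be split
; across two registers (two loads, two multiplies, two stores), while
; VBITS_GE_512 handles them with a single full-width register.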
define void @smulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.b, p0/m, z0.b, z2.b
; VBITS_GE_256-NEXT:    smulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %insert = insertelement <64 x i16> undef, i16 8, i64 0
  %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
  %1 = sext <64 x i8> %op1 to <64 x i16>
  %2 = sext <64 x i8> %op2 to <64 x i16>
  %mul = mul <64 x i16> %1, %2
  %shr = lshr <64 x i16> %mul, splat (i16 8)
  %res = trunc <64 x i16> %shr to <64 x i8>
  store <64 x i8> %res, ptr %a
  ret void
}

define void @smulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %1 = sext <128 x i8> %op1 to <128 x i16>
  %2 = sext <128 x i8> %op2 to <128 x i16>
  %mul = mul <128 x i16> %1, %2
  %shr = lshr <128 x i16> %mul, splat (i16 8)
  %res = trunc <128 x i16> %shr to <128 x i8>
  store <128 x i8> %res, ptr %a
  ret void
}

define void @smulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %1 = sext <256 x i8> %op1 to <256 x i16>
  %2 = sext <256 x i8> %op2 to <256 x i16>
  %mul = mul <256 x i16> %1, %2
  %shr = lshr <256 x i16> %mul, splat (i16 8)
  %res = trunc <256 x i16> %shr to <256 x i8>
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <4 x i16> %op1 to <4 x i32>
  %2 = sext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, splat (i32 16)
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <8 x i16> %op1 to <8 x i32>
  %2 = sext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, splat (i32 16)
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

define void @smulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = sext <16 x i16> %op1 to <16 x i32>
  %2 = sext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, splat (i32 16)
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

define void @smulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.h, p0/m, z0.h, z2.h
; VBITS_GE_256-NEXT:    smulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %1 = sext <32 x i16> %op1 to <32 x i32>
  %2 = sext <32 x i16> %op2 to <32 x i32>
  %mul = mul <32 x i32> %1, %2
  %shr = lshr <32 x i32> %mul, splat (i32 16)
  %res = trunc <32 x i32> %shr to <32 x i16>
  store <32 x i16> %res, ptr %a
  ret void
}

define void @smulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %1 = sext <64 x i16> %op1 to <64 x i32>
  %2 = sext <64 x i16> %op2 to <64 x i32>
  %mul = mul <64 x i32> %1, %2
  %shr = lshr <64 x i32> %mul, splat (i32 16)
  %res = trunc <64 x i32> %shr to <64 x i16>
  store <64 x i16> %res, ptr %a
  ret void
}

define void @smulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %1 = sext <128 x i16> %op1 to <128 x i32>
  %2 = sext <128 x i16> %op2 to <128 x i32>
  %mul = mul <128 x i32> %1, %2
  %shr = lshr <128 x i32> %mul, splat (i32 16)
  %res = trunc <128 x i32> %shr to <128 x i16>
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <2 x i32> %op1 to <2 x i64>
  %2 = sext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, splat (i64 32)
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <4 x i32> %op1 to <4 x i64>
  %2 = sext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, splat (i64 32)
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

define void @smulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %1 = sext <8 x i32> %op1 to <8 x i64>
  %2 = sext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul, splat (i64 32)
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

define void @smulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.s, p0/m, z0.s, z2.s
; VBITS_GE_256-NEXT:    smulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %1 = sext <16 x i32> %op1 to <16 x i64>
  %2 = sext <16 x i32> %op2 to <16 x i64>
  %mul = mul <16 x i64> %1, %2
  %shr = lshr <16 x i64> %mul, splat (i64 32)
  %res = trunc <16 x i64> %shr to <16 x i32>
  store <16 x i32> %res, ptr %a
  ret void
}

define void @smulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %1 = sext <32 x i32> %op1 to <32 x i64>
  %2 = sext <32 x i32> %op2 to <32 x i64>
  %mul = mul <32 x i64> %1, %2
  %shr = lshr <32 x i64> %mul, splat (i64 32)
  %res = trunc <32 x i64> %shr to <32 x i32>
  store <32 x i32> %res, ptr %a
  ret void
}

define void @smulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %1 = sext <64 x i32> %op1 to <64 x i64>
  %2 = sext <64 x i32> %op2 to <64 x i64>
  %mul = mul <64 x i64> %1, %2
  %shr = lshr <64 x i64> %mul, splat (i64 32)
  %res = trunc <64 x i64> %shr to <64 x i32>
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %insert = insertelement <1 x i128> undef, i128 64, i128 0
  %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
  %1 = sext <1 x i64> %op1 to <1 x i128>
  %2 = sext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, %splat
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = sext <2 x i64> %op1 to <2 x i128>
  %2 = sext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, splat (i128 64)
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

define void @smulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %1 = sext <4 x i64> %op1 to <4 x i128>
  %2 = sext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  %shr = lshr <4 x i128> %mul, splat (i128 64)
  %res = trunc <4 x i128> %shr to <4 x i64>
  store <4 x i64> %res, ptr %a
  ret void
}

define void @smulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    smulh z0.d, p0/m, z0.d, z2.d
; VBITS_GE_256-NEXT:    smulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: smulh_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %1 = sext <8 x i64> %op1 to <8 x i128>
  %2 = sext <8 x i64> %op2 to <8 x i128>
  %mul = mul <8 x i128> %1, %2
  %shr = lshr <8 x i128> %mul, splat (i128 64)
  %res = trunc <8 x i128> %shr to <8 x i64>
  store <8 x i64> %res, ptr %a
  ret void
}

define void @smulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: smulh_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %1 = sext <16 x i64> %op1 to <16 x i128>
  %2 = sext <16 x i64> %op2 to <16 x i128>
  %mul = mul <16 x i128> %1, %2
  %shr = lshr <16 x i128> %mul, splat (i128 64)
  %res = trunc <16 x i128> %shr to <16 x i64>
  store <16 x i64> %res, ptr %a
  ret void
}

define void @smulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: smulh_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    smulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %1 = sext <32 x i64> %op1 to <32 x i128>
  %2 = sext <32 x i64> %op2 to <32 x i128>
  %mul = mul <32 x i128> %1, %2
  %shr = lshr <32 x i128> %mul, splat (i128 64)
  %res = trunc <32 x i128> %shr to <32 x i64>
  store <32 x i64> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <8 x i8> %op1 to <8 x i16>
  %2 = zext <8 x i8> %op2 to <8 x i16>
  %mul = mul <8 x i16> %1, %2
  %shr = lshr <8 x i16> %mul, splat (i16 8)
  %res = trunc <8 x i16> %shr to <8 x i8>
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <16 x i8> %op1 to <16 x i16>
  %2 = zext <16 x i8> %op2 to <16 x i16>
  %mul = mul <16 x i16> %1, %2
  %shr = lshr <16 x i16> %mul, splat (i16 8)
  %res = trunc <16 x i16> %shr to <16 x i8>
  ret <16 x i8> %res
}

define void @umulh_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %1 = zext <32 x i8> %op1 to <32 x i16>
  %2 = zext <32 x i8> %op2 to <32 x i16>
  %mul = mul <32 x i16> %1, %2
  %shr = lshr <32 x i16> %mul, splat (i16 8)
  %res = trunc <32 x i16> %shr to <32 x i8>
  store <32 x i8> %res, ptr %a
  ret void
}

define void @umulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.b, p0/m, z0.b, z2.b
; VBITS_GE_256-NEXT:    umulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %1 = zext <64 x i8> %op1 to <64 x i16>
  %2 = zext <64 x i8> %op2 to <64 x i16>
  %mul = mul <64 x i16> %1, %2
  %shr = lshr <64 x i16> %mul, splat (i16 8)
  %res = trunc <64 x i16> %shr to <64 x i8>
  store <64 x i8> %res, ptr %a
  ret void
}

define void @umulh_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %insert = insertelement <128 x i16> undef, i16 8, i64 0
  %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
  %1 = zext <128 x i8> %op1 to <128 x i16>
  %2 = zext <128 x i8> %op2 to <128 x i16>
  %mul = mul <128 x i16> %1, %2
  %shr = lshr <128 x i16> %mul, splat (i16 8)
  %res = trunc <128 x i16> %shr to <128 x i8>
  store <128 x i8> %res, ptr %a
  ret void
}

define void @umulh_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %1 = zext <256 x i8> %op1 to <256 x i16>
  %2 = zext <256 x i8> %op2 to <256 x i16>
  %mul = mul <256 x i16> %1, %2
  %shr = lshr <256 x i16> %mul, splat (i16 8)
  %res = trunc <256 x i16> %shr to <256 x i8>
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <4 x i16> %op1 to <4 x i32>
  %2 = zext <4 x i16> %op2 to <4 x i32>
  %mul = mul <4 x i32> %1, %2
  %shr = lshr <4 x i32> %mul, splat (i32 16)
  %res = trunc <4 x i32> %shr to <4 x i16>
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <8 x i16> %op1 to <8 x i32>
  %2 = zext <8 x i16> %op2 to <8 x i32>
  %mul = mul <8 x i32> %1, %2
  %shr = lshr <8 x i32> %mul, splat (i32 16)
  %res = trunc <8 x i32> %shr to <8 x i16>
  ret <8 x i16> %res
}

define void @umulh_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %1 = zext <16 x i16> %op1 to <16 x i32>
  %2 = zext <16 x i16> %op2 to <16 x i32>
  %mul = mul <16 x i32> %1, %2
  %shr = lshr <16 x i32> %mul, splat (i32 16)
  %res = trunc <16 x i32> %shr to <16 x i16>
  store <16 x i16> %res, ptr %a
  ret void
}

define void @umulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.h, p0/m, z0.h, z2.h
; VBITS_GE_256-NEXT:    umulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %1 = zext <32 x i16> %op1 to <32 x i32>
  %2 = zext <32 x i16> %op2 to <32 x i32>
  %mul = mul <32 x i32> %1, %2
  %shr = lshr <32 x i32> %mul, splat (i32 16)
  %res = trunc <32 x i32> %shr to <32 x i16>
  store <32 x i16> %res, ptr %a
  ret void
}

define void @umulh_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %1 = zext <64 x i16> %op1 to <64 x i32>
  %2 = zext <64 x i16> %op2 to <64 x i32>
  %mul = mul <64 x i32> %1, %2
  %shr = lshr <64 x i32> %mul, splat (i32 16)
  %res = trunc <64 x i32> %shr to <64 x i16>
  store <64 x i16> %res, ptr %a
  ret void
}

define void @umulh_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %1 = zext <128 x i16> %op1 to <128 x i32>
  %2 = zext <128 x i16> %op2 to <128 x i32>
  %mul = mul <128 x i32> %1, %2
  %shr = lshr <128 x i32> %mul, splat (i32 16)
  %res = trunc <128 x i32> %shr to <128 x i16>
  store <128 x i16> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <2 x i32> %op1 to <2 x i64>
  %2 = zext <2 x i32> %op2 to <2 x i64>
  %mul = mul <2 x i64> %1, %2
  %shr = lshr <2 x i64> %mul, splat (i64 32)
  %res = trunc <2 x i64> %shr to <2 x i32>
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <4 x i32> %op1 to <4 x i64>
  %2 = zext <4 x i32> %op2 to <4 x i64>
  %mul = mul <4 x i64> %1, %2
  %shr = lshr <4 x i64> %mul, splat (i64 32)
  %res = trunc <4 x i64> %shr to <4 x i32>
  ret <4 x i32> %res
}

define void @umulh_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %insert = insertelement <8 x i64> undef, i64 32, i64 0
  %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
  %1 = zext <8 x i32> %op1 to <8 x i64>
  %2 = zext <8 x i32> %op2 to <8 x i64>
  %mul = mul <8 x i64> %1, %2
  %shr = lshr <8 x i64> %mul, splat (i64 32)
  %res = trunc <8 x i64> %shr to <8 x i32>
  store <8 x i32> %res, ptr %a
  ret void
}

define void @umulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.s, p0/m, z0.s, z2.s
; VBITS_GE_256-NEXT:    umulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %1 = zext <16 x i32> %op1 to <16 x i64>
  %2 = zext <16 x i32> %op2 to <16 x i64>
  %mul = mul <16 x i64> %1, %2
  %shr = lshr <16 x i64> %mul, splat (i64 32)
  %res = trunc <16 x i64> %shr to <16 x i32>
  store <16 x i32> %res, ptr %a
  ret void
}

define void @umulh_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %1 = zext <32 x i32> %op1 to <32 x i64>
  %2 = zext <32 x i32> %op2 to <32 x i64>
  %mul = mul <32 x i64> %1, %2
  %shr = lshr <32 x i64> %mul, splat (i64 32)
  %res = trunc <32 x i64> %shr to <32 x i32>
  store <32 x i32> %res, ptr %a
  ret void
}

define void @umulh_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %1 = zext <64 x i32> %op1 to <64 x i64>
  %2 = zext <64 x i32> %op2 to <64 x i64>
  %mul = mul <64 x i64> %1, %2
  %shr = lshr <64 x i64> %mul, splat (i64 32)
  %res = trunc <64 x i64> %shr to <64 x i32>
  store <64 x i32> %res, ptr %a
  ret void
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <1 x i64> %op1 to <1 x i128>
  %2 = zext <1 x i64> %op2 to <1 x i128>
  %mul = mul <1 x i128> %1, %2
  %shr = lshr <1 x i128> %mul, splat (i128 64)
  %res = trunc <1 x i128> %shr to <1 x i64>
  ret <1 x i64> %res
}

; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %1 = zext <2 x i64> %op1 to <2 x i128>
  %2 = zext <2 x i64> %op2 to <2 x i128>
  %mul = mul <2 x i128> %1, %2
  %shr = lshr <2 x i128> %mul, splat (i128 64)
  %res = trunc <2 x i128> %shr to <2 x i64>
  ret <2 x i64> %res
}

define void @umulh_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %1 = zext <4 x i64> %op1 to <4 x i128>
  %2 = zext <4 x i64> %op2 to <4 x i128>
  %mul = mul <4 x i128> %1, %2
  %shr = lshr <4 x i128> %mul, splat (i128 64)
  %res = trunc <4 x i128> %shr to <4 x i64>
  store <4 x i64> %res, ptr %a
  ret void
}

define void @umulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    umulh z0.d, p0/m, z0.d, z2.d
; VBITS_GE_256-NEXT:    umulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: umulh_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %1 = zext <8 x i64> %op1 to <8 x i128>
  %2 = zext <8 x i64> %op2 to <8 x i128>
  %mul = mul <8 x i128> %1, %2
  %shr = lshr <8 x i128> %mul, splat (i128 64)
  %res = trunc <8 x i128> %shr to <8 x i64>
  store <8 x i64> %res, ptr %a
  ret void
}

define void @umulh_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: umulh_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %1 = zext <16 x i64> %op1 to <16 x i128>
  %2 = zext <16 x i64> %op2 to <16 x i128>
  %mul = mul <16 x i128> %1, %2
  %shr = lshr <16 x i128> %mul, splat (i128 64)
  %res = trunc <16 x i128> %shr to <16 x i64>
  store <16 x i64> %res, ptr %a
  ret void
}

define void @umulh_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: umulh_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    umulh z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %1 = zext <32 x i64> %op1 to <32 x i128>
  %2 = zext <32 x i64> %op2 to <32 x i128>
  %mul = mul <32 x i128> %1, %2
  %shr = lshr <32 x i128> %mul, splat (i128 64)
  %res = trunc <32 x i128> %shr to <32 x i64>
  store <32 x i64> %res, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }