llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll

   1 ; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | FileCheck %s -D#VBYTES=16  -check-prefix=NO_SVE
   2 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256,VBITS_EQ_256
   3 ; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -D#VBYTES=32  -check-prefixes=CHECK,VBITS_GE_256
   4 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_512
   5 ; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
   6 ; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
   7 ; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -D#VBYTES=64  -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
   8 ; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_1024
   9 ; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  10 ; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  11 ; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  12 ; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  13 ; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  14 ; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  15 ; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  16 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
  17
  18 ; VBYTES represents the useful byte size of a vector register from the code
  19 ; generator's point of view. It is clamped to power-of-2 values because
  20 ; only power-of-2 vector lengths are considered legal, regardless of the
  21 ; user specified vector length.
  22
  23 ; This test only tests the legal types for a given vector width, as mulh nodes
  24 ; do not get generated for non-legal types.
  25
  26 target triple = "aarch64-unknown-linux-gnu"
  27
  28 ; Don't use SVE when its registers are no bigger than NEON.
     ; This is only enforced by the first RUN line (128-bit minimum), the sole
     ; one that selects this prefix.
  29 ; NO_SVE-NOT: ptrue
  30
  31 ;
  32 ; SMULH
  33 ;
  34
  35 ; Don't use SVE for 64-bit vectors.
  36 define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
     ; Signed multiply-high of two <8 x i8> register operands: both inputs are
     ; sign-extended to i16, multiplied, shifted right by 8 (%splat holds 8 in
     ; every lane) and truncated back to i8.
     ; The checks expect NEON code: a widening smull, a ushr by 8, then
     ; per-lane umov/mov transfers (lanes 0 to 3 are checked).
  37 ; CHECK-LABEL: smulh_v8i8:
  38 ; CHECK: smull v0.8h, v0.8b, v1.8b
  39 ; CHECK: ushr v1.8h, v0.8h, #8
  40 ; CHECK: umov w8, v1.h[0]
  41 ; CHECK: fmov s0, w8
  42 ; CHECK: umov w8, v1.h[1]
  43 ; CHECK: mov v0.b[1], w8
  44 ; CHECK: umov w8, v1.h[2]
  45 ; CHECK: mov v0.b[2], w8
  46 ; CHECK: umov w8, v1.h[3]
  47 ; CHECK: mov v0.b[3], w8
  48 ; CHECK: ret
  49   %insert = insertelement <8 x i16> undef, i16 8, i64 0
  50   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
  51   %1 = sext <8 x i8> %op1 to <8 x i16>
  52   %2 = sext <8 x i8> %op2 to <8 x i16>
  53   %mul = mul <8 x i16> %1, %2
  54   %shr = lshr <8 x i16> %mul, %splat
  55   %res = trunc <8 x i16> %shr to <8 x i8>
  56   ret <8 x i8> %res
  57 }
  58
  59 ; Don't use SVE for 128-bit vectors.
  60 define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
     ; Signed multiply-high of two <16 x i8> register operands, written as
     ; sext to i16, mul, lshr by 8 (%splat is 8 in every lane) and trunc.
     ; The checks expect NEON only: smull2 and smull for the two halves, then
     ; a uzp2 that combines them into the result.
  61 ; CHECK-LABEL: smulh_v16i8:
  62 ; CHECK: smull2 v2.8h, v0.16b, v1.16b
  63 ; CHECK: smull v0.8h, v0.8b, v1.8b
  64 ; CHECK: uzp2 v0.16b, v0.16b, v2.16b
  65 ; CHECK: ret
  66   %insert = insertelement <16 x i16> undef, i16 8, i64 0
  67   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
  68   %1 = sext <16 x i8> %op1 to <16 x i16>
  69   %2 = sext <16 x i8> %op2 to <16 x i16>
  70   %mul = mul <16 x i16> %1, %2
  71   %shr = lshr <16 x i16> %mul, %splat
  72   %res = trunc <16 x i16> %shr to <16 x i8>
  73   ret <16 x i8> %res
  74 }
  75
  76 define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
     ; Signed multiply-high of the <32 x i8> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i16, multiplies, shifts right by 8 (%splat
     ; is 8 in every lane) and truncates.
     ; Runs with exactly 256 bits expect one predicated smulh on .b lanes; runs
     ; with 512 bits or more expect sign-extending loads, mul and lsr on .h
     ; lanes. Both variants verify the st1b that writes the result to [x0].
  77 ; CHECK-LABEL: smulh_v32i8:
  78 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
  79 ; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
  80 ; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
  81 ; VBITS_EQ_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
     ; VBITS_EQ_256: st1b { [[RES]].b }, [[PG]], [x0]
  82 ; VBITS_EQ_256: ret
  83
  84 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
  85 ; VBITS_GE_512-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
  86 ; VBITS_GE_512-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
  87 ; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
  88 ; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
  89 ; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
  90 ; VBITS_GE_512: ret
  91   %op1 = load <32 x i8>, <32 x i8>* %a
  92   %op2 = load <32 x i8>, <32 x i8>* %b
  93   %insert = insertelement <32 x i16> undef, i16 8, i64 0
  94   %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
  95   %1 = sext <32 x i8> %op1 to <32 x i16>
  96   %2 = sext <32 x i8> %op2 to <32 x i16>
  97   %mul = mul <32 x i16> %1, %2
  98   %shr = lshr <32 x i16> %mul, %splat
  99   %res = trunc <32 x i16> %shr to <32 x i8>
 100   store <32 x i8> %res, <32 x i8>* %a
 101   ret void
 102 }
 103
 104 define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
     ; Signed multiply-high of the <64 x i8> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i16, multiplies, shifts right by 8 (%splat
     ; is 8 in every lane) and truncates.
     ; Runs with exactly 512 bits expect one predicated smulh on .b lanes; runs
     ; with 1024 bits or more expect sign-extending loads, mul and lsr on .h
     ; lanes. Each variant starts with its own ptrue check so that [[PG]] is
     ; captured in this function and not inherited from an earlier one.
 105 ; CHECK-LABEL: smulh_v64i8:
     ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
 106 ; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 107 ; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
 108 ; VBITS_EQ_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
 109 ; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0]
 110 ; VBITS_EQ_512: ret
 111
 112 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
 113 ; VBITS_GE_1024-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 114 ; VBITS_GE_1024-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 115 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 116 ; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
 117 ; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
 118 ; VBITS_GE_1024: ret
 119   %op1 = load <64 x i8>, <64 x i8>* %a
 120   %op2 = load <64 x i8>, <64 x i8>* %b
 121   %insert = insertelement <64 x i16> undef, i16 8, i64 0
 122   %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
 123   %1 = sext <64 x i8> %op1 to <64 x i16>
 124   %2 = sext <64 x i8> %op2 to <64 x i16>
 125   %mul = mul <64 x i16> %1, %2
 126   %shr = lshr <64 x i16> %mul, %splat
 127   %res = trunc <64 x i16> %shr to <64 x i8>
 128   store <64 x i8> %res, <64 x i8>* %a
 129   ret void
 130 }
 131
 132 define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
     ; Signed multiply-high of the <128 x i8> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i16, multiplies, shifts right by 8 (%splat
     ; is 8 in every lane) and truncates.
     ; Runs with exactly 1024 bits expect one predicated smulh on .b lanes; the
     ; 2048-bit run expects sign-extending loads, mul and lsr on .h lanes.
     ; Each variant starts with its own ptrue check so that [[PG]] is captured
     ; in this function and not inherited from an earlier one.
 133 ; CHECK-LABEL: smulh_v128i8:
     ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
 134 ; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 135 ; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
 136 ; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
 137 ; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
 138 ; VBITS_EQ_1024: ret
 139
 140 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
 141 ; VBITS_GE_2048-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 142 ; VBITS_GE_2048-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 143 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 144 ; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
 145 ; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
 146 ; VBITS_GE_2048: ret
 147   %op1 = load <128 x i8>, <128 x i8>* %a
 148   %op2 = load <128 x i8>, <128 x i8>* %b
 149   %insert = insertelement <128 x i16> undef, i16 8, i64 0
 150   %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
 151   %1 = sext <128 x i8> %op1 to <128 x i16>
 152   %2 = sext <128 x i8> %op2 to <128 x i16>
 153   %mul = mul <128 x i16> %1, %2
 154   %shr = lshr <128 x i16> %mul, %splat
 155   %res = trunc <128 x i16> %shr to <128 x i8>
 156   store <128 x i8> %res, <128 x i8>* %a
 157   ret void
 158 }
 159
 160 define void @smulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
     ; Signed multiply-high of the <256 x i8> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i16, multiplies, shifts right by 8 (%splat
     ; is 8 in every lane) and truncates.
     ; Only the 2048-bit run has checks here: one predicated smulh on .b lanes
     ; between the ld1b loads and the st1b store.
 161 ; CHECK-LABEL: smulh_v256i8:
 162 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
 163 ; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 164 ; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
 165 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
 166 ; VBITS_GE_2048: st1b { [[RES]].b }, [[PG]], [x0]
 167 ; VBITS_GE_2048: ret
 168   %op1 = load <256 x i8>, <256 x i8>* %a
 169   %op2 = load <256 x i8>, <256 x i8>* %b
 170   %insert = insertelement <256 x i16> undef, i16 8, i64 0
 171   %splat = shufflevector <256 x i16> %insert, <256 x i16> undef, <256 x i32> zeroinitializer
 172   %1 = sext <256 x i8> %op1 to <256 x i16>
 173   %2 = sext <256 x i8> %op2 to <256 x i16>
 174   %mul = mul <256 x i16> %1, %2
 175   %shr = lshr <256 x i16> %mul, %splat
 176   %res = trunc <256 x i16> %shr to <256 x i8>
 177   store <256 x i8> %res, <256 x i8>* %a
 178   ret void
 179 }
 180
 181 ; Don't use SVE for 64-bit vectors.
 182 define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
     ; Signed multiply-high of two <4 x i16> register operands: sext to i32,
     ; mul, lshr by 16 (%splat holds 16 in every lane) and trunc back to i16.
     ; The checks expect NEON code: a widening smull, a ushr by 16, then
     ; per-lane mov instructions that repack the result.
 183 ; CHECK-LABEL: smulh_v4i16:
 184 ; CHECK: smull v0.4s, v0.4h, v1.4h
 185 ; CHECK: ushr v0.4s, v0.4s, #16
 186 ; CHECK: mov w8, v0.s[1]
 187 ; CHECK: mov w9, v0.s[2]
 188 ; CHECK: mov w10, v0.s[3]
 189 ; CHECK: mov v0.h[1], w8
 190 ; CHECK: mov v0.h[2], w9
 191 ; CHECK: mov v0.h[3], w10
 192 ; CHECK: ret
 193   %insert = insertelement <4 x i32> undef, i32 16, i64 0
 194   %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer
 195   %1 = sext <4 x i16> %op1 to <4 x i32>
 196   %2 = sext <4 x i16> %op2 to <4 x i32>
 197   %mul = mul <4 x i32> %1, %2
 198   %shr = lshr <4 x i32> %mul, %splat
 199   %res = trunc <4 x i32> %shr to <4 x i16>
 200   ret <4 x i16> %res
 201 }
 202
 203 ; Don't use SVE for 128-bit vectors.
 204 define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
     ; Signed multiply-high of two <8 x i16> register operands, written as
     ; sext to i32, mul, lshr by 16 (%splat is 16 in every lane) and trunc.
     ; The checks expect NEON only: smull2 and smull for the two halves, then
     ; a uzp2 that combines them into the result.
 205 ; CHECK-LABEL: smulh_v8i16:
 206 ; CHECK: smull2 v2.4s, v0.8h, v1.8h
 207 ; CHECK: smull v0.4s, v0.4h, v1.4h
 208 ; CHECK: uzp2 v0.8h, v0.8h, v2.8h
 209 ; CHECK: ret
 210   %insert = insertelement <8 x i32> undef, i32 16, i64 0
 211   %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer
 212   %1 = sext <8 x i16> %op1 to <8 x i32>
 213   %2 = sext <8 x i16> %op2 to <8 x i32>
 214   %mul = mul <8 x i32> %1, %2
 215   %shr = lshr <8 x i32> %mul, %splat
 216   %res = trunc <8 x i32> %shr to <8 x i16>
 217   ret <8 x i16> %res
 218 }
 219
 220 define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
     ; Signed multiply-high of the <16 x i16> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i32, multiplies, shifts right by 16
     ; (%splat is 16 in every lane) and truncates.
     ; Runs with exactly 256 bits expect one predicated smulh on .h lanes; runs
     ; with 512 bits or more expect sign-extending loads, mul and lsr on .s
     ; lanes followed by a truncating st1h.
 221 ; CHECK-LABEL: smulh_v16i16:
 222 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
 223 ; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 224 ; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 225 ; VBITS_EQ_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 226 ; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
 227 ; VBITS_EQ_256: ret
 228
 229 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
 230 ; VBITS_GE_512-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 231 ; VBITS_GE_512-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 232 ; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 233 ; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
 234 ; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
 235 ; VBITS_GE_512: ret
 236   %op1 = load <16 x i16>, <16 x i16>* %a
 237   %op2 = load <16 x i16>, <16 x i16>* %b
 238   %insert = insertelement <16 x i32> undef, i32 16, i64 0
 239   %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
 240   %1 = sext <16 x i16> %op1 to <16 x i32>
 241   %2 = sext <16 x i16> %op2 to <16 x i32>
 242   %mul = mul <16 x i32> %1, %2
 243   %shr = lshr <16 x i32> %mul, %splat
 244   %res = trunc <16 x i32> %shr to <16 x i16>
 245   store <16 x i16> %res, <16 x i16>* %a
 246   ret void
 247 }
 248
 249 define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
     ; Signed multiply-high of the <32 x i16> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i32, multiplies, shifts right by 16
     ; (%splat is 16 in every lane) and truncates.
     ; Runs with exactly 512 bits expect one predicated smulh on .h lanes; runs
     ; with 1024 bits or more expect sign-extending loads, mul and lsr on .s
     ; lanes followed by a truncating st1h.
 250 ; CHECK-LABEL: smulh_v32i16:
 251 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
 252 ; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 253 ; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 254 ; VBITS_EQ_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 255 ; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
 256 ; VBITS_EQ_512: ret
 257
 258 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
 259 ; VBITS_GE_1024-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 260 ; VBITS_GE_1024-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 261 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 262 ; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
 263 ; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
 264 ; VBITS_GE_1024: ret
 265   %op1 = load <32 x i16>, <32 x i16>* %a
 266   %op2 = load <32 x i16>, <32 x i16>* %b
 267   %insert = insertelement <32 x i32> undef, i32 16, i64 0
 268   %splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
 269   %1 = sext <32 x i16> %op1 to <32 x i32>
 270   %2 = sext <32 x i16> %op2 to <32 x i32>
 271   %mul = mul <32 x i32> %1, %2
 272   %shr = lshr <32 x i32> %mul, %splat
 273   %res = trunc <32 x i32> %shr to <32 x i16>
 274   store <32 x i16> %res, <32 x i16>* %a
 275   ret void
 276 }
 277
 278 define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
     ; Signed multiply-high of the <64 x i16> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i32, multiplies, shifts right by 16
     ; (%splat is 16 in every lane) and truncates.
     ; Runs with exactly 1024 bits expect one predicated smulh on .h lanes; the
     ; 2048-bit run expects sign-extending loads, mul and lsr on .s lanes
     ; followed by a truncating st1h.
 279 ; CHECK-LABEL: smulh_v64i16:
 280 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
 281 ; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 282 ; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 283 ; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 284 ; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0]
 285 ; VBITS_EQ_1024: ret
 286
 287 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
 288 ; VBITS_GE_2048-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 289 ; VBITS_GE_2048-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 290 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 291 ; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
 292 ; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
 293 ; VBITS_GE_2048: ret
 294   %op1 = load <64 x i16>, <64 x i16>* %a
 295   %op2 = load <64 x i16>, <64 x i16>* %b
 296   %insert = insertelement <64 x i32> undef, i32 16, i64 0
 297   %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
 298   %1 = sext <64 x i16> %op1 to <64 x i32>
 299   %2 = sext <64 x i16> %op2 to <64 x i32>
 300   %mul = mul <64 x i32> %1, %2
 301   %shr = lshr <64 x i32> %mul, %splat
 302   %res = trunc <64 x i32> %shr to <64 x i16>
 303   store <64 x i16> %res, <64 x i16>* %a
 304   ret void
 305 }
 306
 307 define void @smulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
     ; Signed multiply-high of the <128 x i16> vectors at %a and %b, stored
     ; back to %a. The IR sign-extends to i32, multiplies, shifts right by 16
     ; (%splat is 16 in every lane) and truncates.
     ; Only the 2048-bit run has checks here: one predicated smulh on .h lanes
     ; between the ld1h loads and the st1h store.
 308 ; CHECK-LABEL: smulh_v128i16:
 309 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
 310 ; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 311 ; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 312 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 313 ; VBITS_GE_2048: st1h { [[RES]].h }, [[PG]], [x0]
 314 ; VBITS_GE_2048: ret
 315   %op1 = load <128 x i16>, <128 x i16>* %a
 316   %op2 = load <128 x i16>, <128 x i16>* %b
 317   %insert = insertelement <128 x i32> undef, i32 16, i64 0
 318   %splat = shufflevector <128 x i32> %insert, <128 x i32> undef, <128 x i32> zeroinitializer
 319   %1 = sext <128 x i16> %op1 to <128 x i32>
 320   %2 = sext <128 x i16> %op2 to <128 x i32>
 321   %mul = mul <128 x i32> %1, %2
 322   %shr = lshr <128 x i32> %mul, %splat
 323   %res = trunc <128 x i32> %shr to <128 x i16>
 324   store <128 x i16> %res, <128 x i16>* %a
 325   ret void
 326 }
 327
 328 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
 329 define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
     ; Signed multiply-high of two <2 x i32> register operands: sext to i64,
     ; mul, lshr by 32 (%splat holds 32 in every lane) and trunc back to i32.
     ; The checks expect a mixed sequence: NEON sshll to widen both inputs, an
     ; SVE mul on .d lanes under a vl2 predicate, then a NEON shrn by 32.
 330 ; CHECK-LABEL: smulh_v2i32:
 331 ; CHECK: sshll v0.2d, v0.2s, #0
 332 ; CHECK: sshll v1.2d, v1.2s, #0
 333 ; CHECK: ptrue p0.d, vl2
 334 ; CHECK: mul z0.d, p0/m, z0.d, z1.d
 335 ; CHECK: shrn v0.2s, v0.2d, #32
 336 ; CHECK: ret
 337   %insert = insertelement <2 x i64> undef, i64 32, i64 0
 338   %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer
 339   %1 = sext <2 x i32> %op1 to <2 x i64>
 340   %2 = sext <2 x i32> %op2 to <2 x i64>
 341   %mul = mul <2 x i64> %1, %2
 342   %shr = lshr <2 x i64> %mul, %splat
 343   %res = trunc <2 x i64> %shr to <2 x i32>
 344   ret <2 x i32> %res
 345 }
 346
 347 ; Don't use SVE for 128-bit vectors.
 348 define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
     ; Signed multiply-high of two <4 x i32> register operands, written as
     ; sext to i64, mul, lshr by 32 (%splat is 32 in every lane) and trunc.
     ; The checks expect NEON only: smull2 and smull for the two halves, then
     ; a uzp2 that combines them into the result.
 349 ; CHECK-LABEL: smulh_v4i32:
 350 ; CHECK: smull2 v2.2d, v0.4s, v1.4s
 351 ; CHECK: smull v0.2d, v0.2s, v1.2s
 352 ; CHECK: uzp2 v0.4s, v0.4s, v2.4s
 353 ; CHECK: ret
 354   %insert = insertelement <4 x i64> undef, i64 32, i64 0
 355   %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer
 356   %1 = sext <4 x i32> %op1 to <4 x i64>
 357   %2 = sext <4 x i32> %op2 to <4 x i64>
 358   %mul = mul <4 x i64> %1, %2
 359   %shr = lshr <4 x i64> %mul, %splat
 360   %res = trunc <4 x i64> %shr to <4 x i32>
 361   ret <4 x i32> %res
 362 }
 363
 364 define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
     ; Signed multiply-high of the <8 x i32> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i64, multiplies, shifts right by 32
     ; (%splat is 32 in every lane) and truncates.
     ; Runs with exactly 256 bits expect one predicated smulh on .s lanes; runs
     ; with 512 bits or more expect sign-extending loads, mul and lsr on .d
     ; lanes followed by a truncating st1w.
 365 ; CHECK-LABEL: smulh_v8i32:
 366 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
 367 ; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 368 ; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 369 ; VBITS_EQ_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 370 ; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
 371 ; VBITS_EQ_256: ret
 372
 373 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
 374 ; VBITS_GE_512-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 375 ; VBITS_GE_512-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 376 ; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 377 ; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
 378 ; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
 379 ; VBITS_GE_512: ret
 380   %op1 = load <8 x i32>, <8 x i32>* %a
 381   %op2 = load <8 x i32>, <8 x i32>* %b
 382   %insert = insertelement <8 x i64> undef, i64 32, i64 0
 383   %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
 384   %1 = sext <8 x i32> %op1 to <8 x i64>
 385   %2 = sext <8 x i32> %op2 to <8 x i64>
 386   %mul = mul <8 x i64> %1, %2
 387   %shr = lshr <8 x i64> %mul, %splat
 388   %res = trunc <8 x i64> %shr to <8 x i32>
 389   store <8 x i32> %res, <8 x i32>* %a
 390   ret void
 391 }
 392
 393 define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
     ; Signed multiply-high of the <16 x i32> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i64, multiplies, shifts right by 32
     ; (%splat is 32 in every lane) and truncates.
     ; Runs with exactly 512 bits expect one predicated smulh on .s lanes; runs
     ; with 1024 bits or more expect sign-extending loads, mul and lsr on .d
     ; lanes followed by a truncating st1w. The lsr by 32 is what selects the
     ; high half, so the widened variant verifies it explicitly.
 394 ; CHECK-LABEL: smulh_v16i32:
 395 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
 396 ; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 397 ; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 398 ; VBITS_EQ_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 399 ; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
 400 ; VBITS_EQ_512: ret
 401
 402 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
 403 ; VBITS_GE_1024-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 404 ; VBITS_GE_1024-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 405 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
     ; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
 406 ; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
 407 ; VBITS_GE_1024: ret
 408   %op1 = load <16 x i32>, <16 x i32>* %a
 409   %op2 = load <16 x i32>, <16 x i32>* %b
 410   %insert = insertelement <16 x i64> undef, i64 32, i64 0
 411   %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
 412   %1 = sext <16 x i32> %op1 to <16 x i64>
 413   %2 = sext <16 x i32> %op2 to <16 x i64>
 414   %mul = mul <16 x i64> %1, %2
 415   %shr = lshr <16 x i64> %mul, %splat
 416   %res = trunc <16 x i64> %shr to <16 x i32>
 417   store <16 x i32> %res, <16 x i32>* %a
 418   ret void
 419 }
 420
 421 define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
     ; Signed multiply-high of the <32 x i32> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i64, multiplies, shifts right by 32
     ; (%splat is 32 in every lane) and truncates.
     ; Runs with exactly 1024 bits expect one predicated smulh on .s lanes; the
     ; 2048-bit run expects sign-extending loads, mul and lsr on .d lanes
     ; followed by a truncating st1w.
 422 ; CHECK-LABEL: smulh_v32i32:
 423 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
 424 ; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 425 ; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 426 ; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 427 ; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
 428 ; VBITS_EQ_1024: ret
 429
 430 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
 431 ; VBITS_GE_2048-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 432 ; VBITS_GE_2048-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 433 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 434 ; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
 435 ; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
 436 ; VBITS_GE_2048: ret
 437   %op1 = load <32 x i32>, <32 x i32>* %a
 438   %op2 = load <32 x i32>, <32 x i32>* %b
 439   %insert = insertelement <32 x i64> undef, i64 32, i64 0
 440   %splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
 441   %1 = sext <32 x i32> %op1 to <32 x i64>
 442   %2 = sext <32 x i32> %op2 to <32 x i64>
 443   %mul = mul <32 x i64> %1, %2
 444   %shr = lshr <32 x i64> %mul, %splat
 445   %res = trunc <32 x i64> %shr to <32 x i32>
 446   store <32 x i32> %res, <32 x i32>* %a
 447   ret void
 448 }
 449
 450 define void @smulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
     ; Signed multiply-high of the <64 x i32> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i64, multiplies, shifts right by 32
     ; (%splat is 32 in every lane) and truncates.
     ; Only the 2048-bit run has checks here: one predicated smulh on .s lanes
     ; between the ld1w loads and the st1w store.
 451 ; CHECK-LABEL: smulh_v64i32:
 452 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
 453 ; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 454 ; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 455 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 456 ; VBITS_GE_2048: st1w { [[RES]].s }, [[PG]], [x0]
 457 ; VBITS_GE_2048: ret
 458   %op1 = load <64 x i32>, <64 x i32>* %a
 459   %op2 = load <64 x i32>, <64 x i32>* %b
 460   %insert = insertelement <64 x i64> undef, i64 32, i64 0
 461   %splat = shufflevector <64 x i64> %insert, <64 x i64> undef, <64 x i32> zeroinitializer
 462   %1 = sext <64 x i32> %op1 to <64 x i64>
 463   %2 = sext <64 x i32> %op2 to <64 x i64>
 464   %mul = mul <64 x i64> %1, %2
 465   %shr = lshr <64 x i64> %mul, %splat
 466   %res = trunc <64 x i64> %shr to <64 x i32>
 467   store <64 x i32> %res, <64 x i32>* %a
 468   ret void
 469 }
 470
 471 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
 472 define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
     ; Signed multiply-high of two <1 x i64> register operands: sext to i128,
     ; mul, lshr by 64 (%splat holds 64 in its single lane) and trunc to i64.
     ; The checks expect a single SVE smulh on .d lanes under a vl1 predicate.
 473 ; CHECK-LABEL: smulh_v1i64:
 474 ; CHECK: ptrue p0.d, vl1
 475 ; CHECK: smulh z0.d, p0/m, z0.d, z1.d
 476 ; CHECK: ret
 477   %insert = insertelement <1 x i128> undef, i128 64, i128 0
 478   %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer
 479   %1 = sext <1 x i64> %op1 to <1 x i128>
 480   %2 = sext <1 x i64> %op2 to <1 x i128>
 481   %mul = mul <1 x i128> %1, %2
 482   %shr = lshr <1 x i128> %mul, %splat
 483   %res = trunc <1 x i128> %shr to <1 x i64>
 484   ret <1 x i64> %res
 485 }
 486
 487 ; Vector i64 multiplications are not legal for NEON so use SVE when available.
 488 define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
     ; Signed multiply-high of two <2 x i64> register operands: sext to i128,
     ; mul, lshr by 64 (%splat holds 64 in every lane) and trunc back to i64.
     ; The checks expect a single SVE smulh on .d lanes under a vl2 predicate.
 489 ; CHECK-LABEL: smulh_v2i64:
 490 ; CHECK: ptrue p0.d, vl2
 491 ; CHECK: smulh z0.d, p0/m, z0.d, z1.d
 492 ; CHECK: ret
 493   %insert = insertelement <2 x i128> undef, i128 64, i128 0
 494   %splat = shufflevector <2 x i128> %insert, <2 x i128> undef, <2 x i32> zeroinitializer
 495   %1 = sext <2 x i64> %op1 to <2 x i128>
 496   %2 = sext <2 x i64> %op2 to <2 x i128>
 497   %mul = mul <2 x i128> %1, %2
 498   %shr = lshr <2 x i128> %mul, %splat
 499   %res = trunc <2 x i128> %shr to <2 x i64>
 500   ret <2 x i64> %res
 501 }
 502
 503 define void @smulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
     ; Signed multiply-high of the <4 x i64> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i128, multiplies, shifts right by 64
     ; (%splat is 64 in every lane) and truncates.
     ; Every run with 256 bits or more expects one predicated smulh on .d lanes
     ; between the ld1d loads and the st1d store.
 504 ; CHECK-LABEL: smulh_v4i64:
 505 ; VBITS_GE_256: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,4)]]
 506 ; VBITS_GE_256-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 507 ; VBITS_GE_256-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 508 ; VBITS_GE_256: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 509 ; VBITS_GE_256: st1d { [[RES]].d }, [[PG]], [x0]
 510 ; VBITS_GE_256: ret
 511   %op1 = load <4 x i64>, <4 x i64>* %a
 512   %op2 = load <4 x i64>, <4 x i64>* %b
 513   %insert = insertelement <4 x i128> undef, i128 64, i128 0
 514   %splat = shufflevector <4 x i128> %insert, <4 x i128> undef, <4 x i32> zeroinitializer
 515   %1 = sext <4 x i64> %op1 to <4 x i128>
 516   %2 = sext <4 x i64> %op2 to <4 x i128>
 517   %mul = mul <4 x i128> %1, %2
 518   %shr = lshr <4 x i128> %mul, %splat
 519   %res = trunc <4 x i128> %shr to <4 x i64>
 520   store <4 x i64> %res, <4 x i64>* %a
 521   ret void
 522 }
 523
 524 define void @smulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
     ; Signed multiply-high of the <8 x i64> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i128, multiplies, shifts right by 64
     ; (%splat is 64 in every lane) and truncates.
     ; Every run with 512 bits or more expects one predicated smulh on .d lanes
     ; between the ld1d loads and the st1d store.
 525 ; CHECK-LABEL: smulh_v8i64:
 526 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
 527 ; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 528 ; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 529 ; VBITS_GE_512: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 530 ; VBITS_GE_512: st1d { [[RES]].d }, [[PG]], [x0]
 531 ; VBITS_GE_512: ret
 532   %op1 = load <8 x i64>, <8 x i64>* %a
 533   %op2 = load <8 x i64>, <8 x i64>* %b
 534   %insert = insertelement <8 x i128> undef, i128 64, i128 0
 535   %splat = shufflevector <8 x i128> %insert, <8 x i128> undef, <8 x i32> zeroinitializer
 536   %1 = sext <8 x i64> %op1 to <8 x i128>
 537   %2 = sext <8 x i64> %op2 to <8 x i128>
 538   %mul = mul <8 x i128> %1, %2
 539   %shr = lshr <8 x i128> %mul, %splat
 540   %res = trunc <8 x i128> %shr to <8 x i64>
 541   store <8 x i64> %res, <8 x i64>* %a
 542   ret void
 543 }
 544
 545 define void @smulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
     ; Signed multiply-high of the <16 x i64> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i128, multiplies, shifts right by 64
     ; (%splat is 64 in every lane) and truncates.
     ; Every run with 1024 bits or more expects one predicated smulh on .d
     ; lanes between the ld1d loads and the st1d store.
 546 ; CHECK-LABEL: smulh_v16i64:
 547 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
 548 ; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 549 ; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 550 ; VBITS_GE_1024: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 551 ; VBITS_GE_1024: st1d { [[RES]].d }, [[PG]], [x0]
 552 ; VBITS_GE_1024: ret
 553   %op1 = load <16 x i64>, <16 x i64>* %a
 554   %op2 = load <16 x i64>, <16 x i64>* %b
 555   %insert = insertelement <16 x i128> undef, i128 64, i128 0
 556   %splat = shufflevector <16 x i128> %insert, <16 x i128> undef, <16 x i32> zeroinitializer
 557   %1 = sext <16 x i64> %op1 to <16 x i128>
 558   %2 = sext <16 x i64> %op2 to <16 x i128>
 559   %mul = mul <16 x i128> %1, %2
 560   %shr = lshr <16 x i128> %mul, %splat
 561   %res = trunc <16 x i128> %shr to <16 x i64>
 562   store <16 x i64> %res, <16 x i64>* %a
 563   ret void
 564 }
 565
 566 define void @smulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
     ; Signed multiply-high of the <32 x i64> vectors at %a and %b, stored back
     ; to %a. The IR sign-extends to i128, multiplies, shifts right by 64
     ; (%splat is 64 in every lane) and truncates.
     ; Only the 2048-bit run has checks here: one predicated smulh on .d lanes
     ; between the ld1d loads and the st1d store.
 567 ; CHECK-LABEL: smulh_v32i64:
 568 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
 569 ; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 570 ; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 571 ; VBITS_GE_2048: smulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 572 ; VBITS_GE_2048: st1d { [[RES]].d }, [[PG]], [x0]
 573 ; VBITS_GE_2048: ret
 574   %op1 = load <32 x i64>, <32 x i64>* %a
 575   %op2 = load <32 x i64>, <32 x i64>* %b
 576   %insert = insertelement <32 x i128> undef, i128 64, i128 0
 577   %splat = shufflevector <32 x i128> %insert, <32 x i128> undef, <32 x i32> zeroinitializer
 578   %1 = sext <32 x i64> %op1 to <32 x i128>
 579   %2 = sext <32 x i64> %op2 to <32 x i128>
 580   %mul = mul <32 x i128> %1, %2
 581   %shr = lshr <32 x i128> %mul, %splat
 582   %res = trunc <32 x i128> %shr to <32 x i64>
 583   store <32 x i64> %res, <32 x i64>* %a
 584   ret void
 585 }
 586
 587 ;
 588 ; UMULH
 589 ;
 590
 591 ; Don't use SVE for 64-bit vectors.
 592 define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
     ; Unsigned multiply-high of two <8 x i8> register operands: both inputs
     ; are zero-extended to i16, multiplied, shifted right by 8 (%splat holds 8
     ; in every lane) and truncated back to i8.
     ; The checks expect NEON code: a widening umull, a ushr by 8, then
     ; per-lane umov/mov transfers (lanes 0 to 3 are checked).
 593 ; CHECK-LABEL: umulh_v8i8:
 594 ; CHECK: umull v0.8h, v0.8b, v1.8b
 595 ; CHECK: ushr v1.8h, v0.8h, #8
 596 ; CHECK: umov w8, v1.h[0]
 597 ; CHECK: fmov s0, w8
 598 ; CHECK: umov w8, v1.h[1]
 599 ; CHECK: mov v0.b[1], w8
 600 ; CHECK: umov w8, v1.h[2]
 601 ; CHECK: mov v0.b[2], w8
 602 ; CHECK: umov w8, v1.h[3]
 603 ; CHECK: mov v0.b[3], w8
 604 ; CHECK: ret
 605   %insert = insertelement <8 x i16> undef, i16 8, i64 0
 606   %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
 607   %1 = zext <8 x i8> %op1 to <8 x i16>
 608   %2 = zext <8 x i8> %op2 to <8 x i16>
 609   %mul = mul <8 x i16> %1, %2
 610   %shr = lshr <8 x i16> %mul, %splat
 611   %res = trunc <8 x i16> %shr to <8 x i8>
 612   ret <8 x i8> %res
 613 }
 614
 615 ; Don't use SVE for 128-bit vectors.
 616 define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 {
     ; Unsigned multiply-high of two <16 x i8> register operands, written as
     ; zext to i16, mul, lshr by 8 (%splat is 8 in every lane) and trunc.
     ; The checks expect NEON only: umull2 and umull for the two halves, then
     ; a uzp2 that combines them into the result.
 617 ; CHECK-LABEL: umulh_v16i8:
 618 ; CHECK: umull2 v2.8h, v0.16b, v1.16b
 619 ; CHECK: umull v0.8h, v0.8b, v1.8b
 620 ; CHECK: uzp2 v0.16b, v0.16b, v2.16b
 621 ; CHECK: ret
 622   %insert = insertelement <16 x i16> undef, i16 8, i64 0
 623   %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer
 624   %1 = zext <16 x i8> %op1 to <16 x i16>
 625   %2 = zext <16 x i8> %op2 to <16 x i16>
 626   %mul = mul <16 x i16> %1, %2
 627   %shr = lshr <16 x i16> %mul, %splat
 628   %res = trunc <16 x i16> %shr to <16 x i8>
 629   ret <16 x i8> %res
 630 }
 631
 632 define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
     ; Unsigned multiply-high of the <32 x i8> vectors at %a and %b, stored
     ; back to %a. The IR zero-extends to i16, multiplies, shifts right by 8
     ; (%splat is 8 in every lane) and truncates.
     ; Runs with exactly 256 bits expect one predicated umulh on .b lanes; runs
     ; with 512 bits or more expect widening ld1b loads, mul and lsr on .h
     ; lanes followed by a truncating st1b. The lsr line uses the same prefix
     ; as its neighbours so that it is actually matched.
 633 ; CHECK-LABEL: umulh_v32i8:
 634 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
 635 ; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 636 ; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
 637 ; VBITS_EQ_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
 638 ; VBITS_EQ_256: st1b { [[RES]].b }, [[PG]], [x0]
 639 ; VBITS_EQ_256: ret
 640
 641 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
 642 ; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 643 ; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 644 ; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 645 ; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
 646 ; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
 647 ; VBITS_GE_512: ret
 648   %op1 = load <32 x i8>, <32 x i8>* %a
 649   %op2 = load <32 x i8>, <32 x i8>* %b
 650   %insert = insertelement <32 x i16> undef, i16 8, i64 0
 651   %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 652   %1 = zext <32 x i8> %op1 to <32 x i16>
 653   %2 = zext <32 x i8> %op2 to <32 x i16>
 654   %mul = mul <32 x i16> %1, %2
 655   %shr = lshr <32 x i16> %mul, %splat
 656   %res = trunc <32 x i16> %shr to <32 x i8>
 657   store <32 x i8> %res, <32 x i8>* %a
 658   ret void
 659 }
 660
 661 define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
     ; Unsigned multiply-high of the <64 x i8> vectors at %a and %b, stored
     ; back to %a. The IR zero-extends to i16, multiplies, shifts right by 8
     ; (%splat is 8 in every lane) and truncates.
     ; Runs with exactly 512 bits expect one predicated umulh on .b lanes; runs
     ; with 1024 bits or more expect widening ld1b loads, mul and lsr on .h
     ; lanes followed by a truncating st1b.
     ; Each variant captures [[PG]] with its own ptrue check, verifies the
     ; store to [x0], and spells its prefix identically on every line so that
     ; no expectation is silently skipped.
 662 ; CHECK-LABEL: umulh_v64i8:
 663 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
 664 ; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 665 ; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
 666 ; VBITS_EQ_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
     ; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0]
 667 ; VBITS_EQ_512: ret
 668
     ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
 669 ; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 670 ; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 671 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 672 ; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
 673 ; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
 674 ; VBITS_GE_1024: ret
 675   %op1 = load <64 x i8>, <64 x i8>* %a
 676   %op2 = load <64 x i8>, <64 x i8>* %b
 677   %insert = insertelement <64 x i16> undef, i16 8, i64 0
 678   %splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
 679   %1 = zext <64 x i8> %op1 to <64 x i16>
 680   %2 = zext <64 x i8> %op2 to <64 x i16>
 681   %mul = mul <64 x i16> %1, %2
 682   %shr = lshr <64 x i16> %mul, %splat
 683   %res = trunc <64 x i16> %shr to <64 x i8>
 684   store <64 x i8> %res, <64 x i8>* %a
 685   ret void
 686 }
 687
 688 define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
     ; Unsigned multiply-high of two <128 x i8> vectors: the operands are loaded
     ; from %a and %b, zero-extended to i16, multiplied, shifted right by 8 and
     ; truncated back to i8; the result overwrites %a.
 689 ; CHECK-LABEL: umulh_v128i8:
     ; 1024-bit minimum vector length only: a single byte-element umulh is expected.
 690 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
 691 ; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 692 ; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
 693 ; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
 694 ; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
 695 ; VBITS_EQ_1024: ret
 696
     ; 2048-bit configuration: halfword mul followed by a logical shift right by 8.
     ; NOTE(review): no RUN line visible in this excerpt enables the VBITS_GE_2048
     ; prefix; presumably one exists further down the file header - confirm.
 697 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
 698 ; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 699 ; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 700 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 701 ; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
 702 ; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
 703 ; VBITS_GE_2048: ret
 704   %op1 = load <128 x i8>, <128 x i8>* %a
 705   %op2 = load <128 x i8>, <128 x i8>* %b
     ; Splat of the shift amount (8 = width of the narrow element type).
 706   %insert = insertelement <128 x i16> undef, i16 8, i64 0
 707   %splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
 708   %1 = zext <128 x i8> %op1 to <128 x i16>
 709   %2 = zext <128 x i8> %op2 to <128 x i16>
 710   %mul = mul <128 x i16> %1, %2
 711   %shr = lshr <128 x i16> %mul, %splat
 712   %res = trunc <128 x i16> %shr to <128 x i8>
 713   store <128 x i8> %res, <128 x i8>* %a
 714   ret void
 715 }
 716
 717 define void @umulh_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 {
     ; High bytes of the unsigned i16 products of two <256 x i8> vectors read
     ; from %a and %b; the narrowed result is written back through %a.
     ; NOTE(review): no RUN line visible in this excerpt enables the VBITS_GE_2048
     ; prefix; presumably one exists further down the file header - confirm.
 718 ; CHECK-LABEL: umulh_v256i8:
 719 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
 720 ; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
 721 ; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
 722 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
 723 ; VBITS_GE_2048: st1b { [[RES]].b }, [[PG]], [x0]
 724 ; VBITS_GE_2048: ret
 725   %lhs = load <256 x i8>, <256 x i8>* %a
 726   %rhs = load <256 x i8>, <256 x i8>* %b
     ; Shift amount of 8 broadcast to every i16 lane.
 727   %shamt.elt = insertelement <256 x i16> undef, i16 8, i64 0
 728   %shamt = shufflevector <256 x i16> %shamt.elt, <256 x i16> undef, <256 x i32> zeroinitializer
 729   %lhs.wide = zext <256 x i8> %lhs to <256 x i16>
 730   %rhs.wide = zext <256 x i8> %rhs to <256 x i16>
 731   %prod = mul <256 x i16> %lhs.wide, %rhs.wide
 732   %prod.hi = lshr <256 x i16> %prod, %shamt
 733   %hi = trunc <256 x i16> %prod.hi to <256 x i8>
 734   store <256 x i8> %hi, <256 x i8>* %a
 735   ret void
 736 }
 737
 738 ; 64-bit vectors are expected to be handled without SVE.
 739 define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
     ; The checks below expect a NEON umull/ushr pair followed by per-lane moves
     ; that narrow the four 32-bit lanes back to halfwords.
 740 ; CHECK-LABEL: umulh_v4i16:
 741 ; CHECK: umull v0.4s, v0.4h, v1.4h
 742 ; CHECK: ushr v0.4s, v0.4s, #16
 743 ; CHECK: mov w8, v0.s[1]
 744 ; CHECK: mov w9, v0.s[2]
 745 ; CHECK: mov w10, v0.s[3]
 746 ; CHECK: mov v0.h[1], w8
 747 ; CHECK: mov v0.h[2], w9
 748 ; CHECK: mov v0.h[3], w10
 749 ; CHECK: ret
     ; Shift amount of 16 broadcast to every i32 lane.
 750   %shamt.elt = insertelement <4 x i32> undef, i32 16, i64 0
 751   %shamt = shufflevector <4 x i32> %shamt.elt, <4 x i32> undef, <4 x i32> zeroinitializer
 752   %op1.wide = zext <4 x i16> %op1 to <4 x i32>
 753   %op2.wide = zext <4 x i16> %op2 to <4 x i32>
 754   %prod = mul <4 x i32> %op1.wide, %op2.wide
 755   %prod.hi = lshr <4 x i32> %prod, %shamt
 756   %hi = trunc <4 x i32> %prod.hi to <4 x i16>
 757   ret <4 x i16> %hi
 758 }
 759
 760 ; 128-bit vectors are expected to be handled without SVE.
 761 define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 {
     ; The checks below expect NEON umull2/umull for the two halves and a uzp2
     ; that gathers the high halfword of every 32-bit product.
 762 ; CHECK-LABEL: umulh_v8i16:
 763 ; CHECK: umull2 v2.4s, v0.8h, v1.8h
 764 ; CHECK: umull v0.4s, v0.4h, v1.4h
 765 ; CHECK: uzp2 v0.8h, v0.8h, v2.8h
 766 ; CHECK: ret
     ; Shift amount of 16 broadcast to every i32 lane.
 767   %shamt.elt = insertelement <8 x i32> undef, i32 16, i64 0
 768   %shamt = shufflevector <8 x i32> %shamt.elt, <8 x i32> undef, <8 x i32> zeroinitializer
 769   %op1.wide = zext <8 x i16> %op1 to <8 x i32>
 770   %op2.wide = zext <8 x i16> %op2 to <8 x i32>
 771   %prod = mul <8 x i32> %op1.wide, %op2.wide
 772   %prod.hi = lshr <8 x i32> %prod, %shamt
 773   %hi = trunc <8 x i32> %prod.hi to <8 x i16>
 774   ret <8 x i16> %hi
 775 }
 776
 777 define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
     ; <16 x i16> unsigned multiply-high computed through i32: zext, mul,
     ; lshr by 16, trunc; the narrowed result is stored back through %a.
 778 ; CHECK-LABEL: umulh_v16i16:
     ; 256-bit minimum vector length only: a single halfword-element umulh.
 779 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
 780 ; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 781 ; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 782 ; VBITS_EQ_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 783 ; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
 784 ; VBITS_EQ_256: ret
 785
     ; 512 bits and up: word-element mul followed by a logical shift right by 16.
 786 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
 787 ; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 788 ; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 789 ; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 790 ; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
 791 ; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
 792 ; VBITS_GE_512: ret
 793   %lhs = load <16 x i16>, <16 x i16>* %a
 794   %rhs = load <16 x i16>, <16 x i16>* %b
     ; Shift amount of 16 broadcast to every i32 lane.
 795   %shamt.elt = insertelement <16 x i32> undef, i32 16, i64 0
 796   %shamt = shufflevector <16 x i32> %shamt.elt, <16 x i32> undef, <16 x i32> zeroinitializer
 797   %lhs.wide = zext <16 x i16> %lhs to <16 x i32>
 798   %rhs.wide = zext <16 x i16> %rhs to <16 x i32>
 799   %prod = mul <16 x i32> %lhs.wide, %rhs.wide
 800   %prod.hi = lshr <16 x i32> %prod, %shamt
 801   %hi = trunc <16 x i32> %prod.hi to <16 x i16>
 802   store <16 x i16> %hi, <16 x i16>* %a
 803   ret void
 804 }
 805
 806 define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
     ; <32 x i16> unsigned multiply-high computed through i32: zext, mul,
     ; lshr by 16, trunc; the narrowed result is stored back through %a.
 807 ; CHECK-LABEL: umulh_v32i16:
     ; 512-bit minimum vector length only: a single halfword-element umulh.
 808 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
 809 ; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 810 ; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 811 ; VBITS_EQ_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 812 ; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
 813 ; VBITS_EQ_512: ret
 814
     ; 1024 bits and up: word-element mul followed by a logical shift right by 16.
 815 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
 816 ; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 817 ; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 818 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 819 ; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
 820 ; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
 821 ; VBITS_GE_1024: ret
 822   %lhs = load <32 x i16>, <32 x i16>* %a
 823   %rhs = load <32 x i16>, <32 x i16>* %b
     ; Shift amount of 16 broadcast to every i32 lane.
 824   %shamt.elt = insertelement <32 x i32> undef, i32 16, i64 0
 825   %shamt = shufflevector <32 x i32> %shamt.elt, <32 x i32> undef, <32 x i32> zeroinitializer
 826   %lhs.wide = zext <32 x i16> %lhs to <32 x i32>
 827   %rhs.wide = zext <32 x i16> %rhs to <32 x i32>
 828   %prod = mul <32 x i32> %lhs.wide, %rhs.wide
 829   %prod.hi = lshr <32 x i32> %prod, %shamt
 830   %hi = trunc <32 x i32> %prod.hi to <32 x i16>
 831   store <32 x i16> %hi, <32 x i16>* %a
 832   ret void
 833 }
 834
 835 define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
     ; Unsigned multiply-high of two <64 x i16> vectors: the operands are loaded
     ; from %a and %b, zero-extended to i32, multiplied, shifted right by 16 and
     ; truncated back to i16; the result overwrites %a.
 836 ; CHECK-LABEL: umulh_v64i16:
     ; 1024-bit minimum vector length only: a single halfword-element umulh.
 837 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
 838 ; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 839 ; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 840 ; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
     ; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0]
 841 ; VBITS_EQ_1024: ret
 842
     ; 2048-bit configuration: word-element mul followed by a logical shift right by 16.
     ; NOTE(review): no RUN line visible in this excerpt enables the VBITS_GE_2048
     ; prefix; presumably one exists further down the file header - confirm.
 843 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
 844 ; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 845 ; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 846 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 847 ; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
 848 ; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
 849 ; VBITS_GE_2048: ret
 850   %op1 = load <64 x i16>, <64 x i16>* %a
 851   %op2 = load <64 x i16>, <64 x i16>* %b
     ; Splat of the shift amount (16 = width of the narrow element type).
 852   %insert = insertelement <64 x i32> undef, i32 16, i64 0
 853   %splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
 854   %1 = zext <64 x i16> %op1 to <64 x i32>
 855   %2 = zext <64 x i16> %op2 to <64 x i32>
 856   %mul = mul <64 x i32> %1, %2
 857   %shr = lshr <64 x i32> %mul, %splat
 858   %res = trunc <64 x i32> %shr to <64 x i16>
 859   store <64 x i16> %res, <64 x i16>* %a
 860   ret void
 861 }
 862
 863 define void @umulh_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 {
     ; High halfwords of the unsigned i32 products of two <128 x i16> vectors
     ; read from %a and %b; the narrowed result is written back through %a.
     ; NOTE(review): no RUN line visible in this excerpt enables the VBITS_GE_2048
     ; prefix; presumably one exists further down the file header - confirm.
 864 ; CHECK-LABEL: umulh_v128i16:
 865 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
 866 ; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
 867 ; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
 868 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
 869 ; VBITS_GE_2048: st1h { [[RES]].h }, [[PG]], [x0]
 870 ; VBITS_GE_2048: ret
 871   %lhs = load <128 x i16>, <128 x i16>* %a
 872   %rhs = load <128 x i16>, <128 x i16>* %b
     ; Shift amount of 16 broadcast to every i32 lane.
 873   %shamt.elt = insertelement <128 x i32> undef, i32 16, i64 0
 874   %shamt = shufflevector <128 x i32> %shamt.elt, <128 x i32> undef, <128 x i32> zeroinitializer
 875   %lhs.wide = zext <128 x i16> %lhs to <128 x i32>
 876   %rhs.wide = zext <128 x i16> %rhs to <128 x i32>
 877   %prod = mul <128 x i32> %lhs.wide, %rhs.wide
 878   %prod.hi = lshr <128 x i32> %prod, %shamt
 879   %hi = trunc <128 x i32> %prod.hi to <128 x i16>
 880   store <128 x i16> %hi, <128 x i16>* %a
 881   ret void
 882 }
 883
 884 ; NEON has no legal vector i64 multiply, so SVE is expected whenever it is available.
 885 define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
     ; The checks below expect NEON ushll widening, a predicated SVE mul on
     ; doubleword lanes, and a NEON shrn by 32 to narrow the result.
 886 ; CHECK-LABEL: umulh_v2i32:
 887 ; CHECK: ushll v0.2d, v0.2s, #0
 888 ; CHECK: ushll v1.2d, v1.2s, #0
 889 ; CHECK: ptrue p0.d, vl2
 890 ; CHECK: mul z0.d, p0/m, z0.d, z1.d
 891 ; CHECK: shrn v0.2s, v0.2d, #32
 892 ; CHECK: ret
     ; Shift amount of 32 broadcast to every i64 lane.
 893   %shamt.elt = insertelement <2 x i64> undef, i64 32, i64 0
 894   %shamt = shufflevector <2 x i64> %shamt.elt, <2 x i64> undef, <2 x i32> zeroinitializer
 895   %op1.wide = zext <2 x i32> %op1 to <2 x i64>
 896   %op2.wide = zext <2 x i32> %op2 to <2 x i64>
 897   %prod = mul <2 x i64> %op1.wide, %op2.wide
 898   %prod.hi = lshr <2 x i64> %prod, %shamt
 899   %hi = trunc <2 x i64> %prod.hi to <2 x i32>
 900   ret <2 x i32> %hi
 901 }
 902
 903 ; 128-bit vectors are expected to be handled without SVE.
 904 define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
     ; The checks below expect NEON umull2/umull for the two halves and a uzp2
     ; that gathers the high word of every 64-bit product.
 905 ; CHECK-LABEL: umulh_v4i32:
 906 ; CHECK: umull2 v2.2d, v0.4s, v1.4s
 907 ; CHECK: umull v0.2d, v0.2s, v1.2s
 908 ; CHECK: uzp2 v0.4s, v0.4s, v2.4s
 909 ; CHECK: ret
     ; Shift amount of 32 broadcast to every i64 lane.
 910   %shamt.elt = insertelement <4 x i64> undef, i64 32, i64 0
 911   %shamt = shufflevector <4 x i64> %shamt.elt, <4 x i64> undef, <4 x i32> zeroinitializer
 912   %op1.wide = zext <4 x i32> %op1 to <4 x i64>
 913   %op2.wide = zext <4 x i32> %op2 to <4 x i64>
 914   %prod = mul <4 x i64> %op1.wide, %op2.wide
 915   %prod.hi = lshr <4 x i64> %prod, %shamt
 916   %hi = trunc <4 x i64> %prod.hi to <4 x i32>
 917   ret <4 x i32> %hi
 918 }
 919
 920 define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
     ; <8 x i32> unsigned multiply-high computed through i64: zext, mul,
     ; lshr by 32, trunc; the narrowed result is stored back through %a.
 921 ; CHECK-LABEL: umulh_v8i32:
     ; 256-bit minimum vector length only: a single word-element umulh.
 922 ; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
 923 ; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 924 ; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 925 ; VBITS_EQ_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 926 ; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
 927 ; VBITS_EQ_256: ret
 928
     ; 512 bits and up: doubleword-element mul followed by a logical shift right by 32.
 929 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
 930 ; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 931 ; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 932 ; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 933 ; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
 934 ; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
 935 ; VBITS_GE_512: ret
 936   %lhs = load <8 x i32>, <8 x i32>* %a
 937   %rhs = load <8 x i32>, <8 x i32>* %b
     ; Shift amount of 32 broadcast to every i64 lane.
 938   %shamt.elt = insertelement <8 x i64> undef, i64 32, i64 0
 939   %shamt = shufflevector <8 x i64> %shamt.elt, <8 x i64> undef, <8 x i32> zeroinitializer
 940   %lhs.wide = zext <8 x i32> %lhs to <8 x i64>
 941   %rhs.wide = zext <8 x i32> %rhs to <8 x i64>
 942   %prod = mul <8 x i64> %lhs.wide, %rhs.wide
 943   %prod.hi = lshr <8 x i64> %prod, %shamt
 944   %hi = trunc <8 x i64> %prod.hi to <8 x i32>
 945   store <8 x i32> %hi, <8 x i32>* %a
 946   ret void
 947 }
 948
 949 define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
     ; Unsigned multiply-high of two <16 x i32> vectors: the operands are loaded
     ; from %a and %b, zero-extended to i64, multiplied, shifted right by 32 and
     ; truncated back to i32; the result overwrites %a.
 950 ; CHECK-LABEL: umulh_v16i32:
     ; 512-bit minimum vector length only: a single word-element umulh.
 951 ; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
 952 ; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 953 ; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 954 ; VBITS_EQ_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 955 ; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
     ; VBITS_EQ_512: ret
 956
     ; 1024 bits and up: doubleword-element mul followed by a logical shift right by 32.
 957 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
 958 ; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 959 ; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 960 ; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 961 ; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
 962 ; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
 963 ; VBITS_GE_1024: ret
 964   %op1 = load <16 x i32>, <16 x i32>* %a
 965   %op2 = load <16 x i32>, <16 x i32>* %b
     ; Splat of the shift amount (32 = width of the narrow element type).
 966   %insert = insertelement <16 x i64> undef, i64 32, i64 0
 967   %splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
 968   %1 = zext <16 x i32> %op1 to <16 x i64>
 969   %2 = zext <16 x i32> %op2 to <16 x i64>
 970   %mul = mul <16 x i64> %1, %2
 971   %shr = lshr <16 x i64> %mul, %splat
 972   %res = trunc <16 x i64> %shr to <16 x i32>
 973   store <16 x i32> %res, <16 x i32>* %a
 974   ret void
 975 }
 976
 977 define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
     ; <32 x i32> unsigned multiply-high computed through i64: zext, mul,
     ; lshr by 32, trunc; the narrowed result is stored back through %a.
     ; NOTE(review): no RUN line visible in this excerpt enables the VBITS_GE_2048
     ; prefix; presumably one exists further down the file header - confirm.
 978 ; CHECK-LABEL: umulh_v32i32:
     ; 1024-bit minimum vector length only: a single word-element umulh.
 979 ; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
 980 ; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
 981 ; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
 982 ; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
 983 ; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
 984 ; VBITS_EQ_1024: ret
 985
     ; 2048-bit configuration: doubleword-element mul followed by a logical shift right by 32.
 986 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
 987 ; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
 988 ; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
 989 ; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
 990 ; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
 991 ; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
 992 ; VBITS_GE_2048: ret
 993   %lhs = load <32 x i32>, <32 x i32>* %a
 994   %rhs = load <32 x i32>, <32 x i32>* %b
     ; Shift amount of 32 broadcast to every i64 lane.
 995   %shamt.elt = insertelement <32 x i64> undef, i64 32, i64 0
 996   %shamt = shufflevector <32 x i64> %shamt.elt, <32 x i64> undef, <32 x i32> zeroinitializer
 997   %lhs.wide = zext <32 x i32> %lhs to <32 x i64>
 998   %rhs.wide = zext <32 x i32> %rhs to <32 x i64>
 999   %prod = mul <32 x i64> %lhs.wide, %rhs.wide
1000   %prod.hi = lshr <32 x i64> %prod, %shamt
1001   %hi = trunc <32 x i64> %prod.hi to <32 x i32>
1002   store <32 x i32> %hi, <32 x i32>* %a
1003   ret void
1004 }
1005
1006 define void @umulh_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 {
     ; High words of the unsigned i64 products of two <64 x i32> vectors read
     ; from %a and %b; the narrowed result is written back through %a.
     ; NOTE(review): no RUN line visible in this excerpt enables the VBITS_GE_2048
     ; prefix; presumably one exists further down the file header - confirm.
1007 ; CHECK-LABEL: umulh_v64i32:
1008 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
1009 ; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
1010 ; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
1011 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
1012 ; VBITS_GE_2048: st1w { [[RES]].s }, [[PG]], [x0]
1013 ; VBITS_GE_2048: ret
1014   %lhs = load <64 x i32>, <64 x i32>* %a
1015   %rhs = load <64 x i32>, <64 x i32>* %b
     ; Shift amount of 32 broadcast to every i64 lane.
1016   %shamt.elt = insertelement <64 x i64> undef, i64 32, i64 0
1017   %shamt = shufflevector <64 x i64> %shamt.elt, <64 x i64> undef, <64 x i32> zeroinitializer
1018   %lhs.wide = zext <64 x i32> %lhs to <64 x i64>
1019   %rhs.wide = zext <64 x i32> %rhs to <64 x i64>
1020   %prod = mul <64 x i64> %lhs.wide, %rhs.wide
1021   %prod.hi = lshr <64 x i64> %prod, %shamt
1022   %hi = trunc <64 x i64> %prod.hi to <64 x i32>
1023   store <64 x i32> %hi, <64 x i32>* %a
1024   ret void
1025 }
1026
1027 ; NEON has no legal vector i64 multiply, so SVE is expected whenever it is available.
1028 define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
     ; The checks below expect a one-element doubleword predicate and a single
     ; predicated umulh on the argument registers.
1029 ; CHECK-LABEL: umulh_v1i64:
1030 ; CHECK: ptrue p0.d, vl1
1031 ; CHECK: umulh z0.d, p0/m, z0.d, z1.d
1032 ; CHECK: ret
     ; Shift amount of 64 placed in the single i128 lane.
1033   %shamt.elt = insertelement <1 x i128> undef, i128 64, i128 0
1034   %shamt = shufflevector <1 x i128> %shamt.elt, <1 x i128> undef, <1 x i32> zeroinitializer
1035   %op1.wide = zext <1 x i64> %op1 to <1 x i128>
1036   %op2.wide = zext <1 x i64> %op2 to <1 x i128>
1037   %prod = mul <1 x i128> %op1.wide, %op2.wide
1038   %prod.hi = lshr <1 x i128> %prod, %shamt
1039   %hi = trunc <1 x i128> %prod.hi to <1 x i64>
1040   ret <1 x i64> %hi
1041 }
1042
1043 ; NEON has no legal vector i64 multiply, so SVE is expected whenever it is available.
1044 define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
     ; The checks below expect a two-element doubleword predicate and a single
     ; predicated umulh on the argument registers.
1045 ; CHECK-LABEL: umulh_v2i64:
1046 ; CHECK: ptrue p0.d, vl2
1047 ; CHECK: umulh z0.d, p0/m, z0.d, z1.d
1048 ; CHECK: ret
     ; Shift amount of 64 broadcast to every i128 lane.
1049   %shamt.elt = insertelement <2 x i128> undef, i128 64, i128 0
1050   %shamt = shufflevector <2 x i128> %shamt.elt, <2 x i128> undef, <2 x i32> zeroinitializer
1051   %op1.wide = zext <2 x i64> %op1 to <2 x i128>
1052   %op2.wide = zext <2 x i64> %op2 to <2 x i128>
1053   %prod = mul <2 x i128> %op1.wide, %op2.wide
1054   %prod.hi = lshr <2 x i128> %prod, %shamt
1055   %hi = trunc <2 x i128> %prod.hi to <2 x i64>
1056   ret <2 x i64> %hi
1057 }
1058
1059 define void @umulh_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 {
     ; <4 x i64> unsigned multiply-high computed through i128: zext, mul,
     ; lshr by 64, trunc; the narrowed result is stored back through %a.
     ; A doubleword-element umulh is expected for every minimum vector length
     ; of 256 bits and above.
1060 ; CHECK-LABEL: umulh_v4i64:
1061 ; VBITS_GE_256: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,4)]]
1062 ; VBITS_GE_256-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1063 ; VBITS_GE_256-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1064 ; VBITS_GE_256: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1065 ; VBITS_GE_256: st1d { [[RES]].d }, [[PG]], [x0]
1066 ; VBITS_GE_256: ret
1067   %lhs = load <4 x i64>, <4 x i64>* %a
1068   %rhs = load <4 x i64>, <4 x i64>* %b
     ; Shift amount of 64 broadcast to every i128 lane.
1069   %shamt.elt = insertelement <4 x i128> undef, i128 64, i128 0
1070   %shamt = shufflevector <4 x i128> %shamt.elt, <4 x i128> undef, <4 x i32> zeroinitializer
1071   %lhs.wide = zext <4 x i64> %lhs to <4 x i128>
1072   %rhs.wide = zext <4 x i64> %rhs to <4 x i128>
1073   %prod = mul <4 x i128> %lhs.wide, %rhs.wide
1074   %prod.hi = lshr <4 x i128> %prod, %shamt
1075   %hi = trunc <4 x i128> %prod.hi to <4 x i64>
1076   store <4 x i64> %hi, <4 x i64>* %a
1077   ret void
1078 }
1079
1080 define void @umulh_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 {
     ; <8 x i64> unsigned multiply-high computed through i128: zext, mul,
     ; lshr by 64, trunc; the narrowed result is stored back through %a.
     ; A doubleword-element umulh is expected for every minimum vector length
     ; of 512 bits and above.
1081 ; CHECK-LABEL: umulh_v8i64:
1082 ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
1083 ; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1084 ; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1085 ; VBITS_GE_512: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1086 ; VBITS_GE_512: st1d { [[RES]].d }, [[PG]], [x0]
1087 ; VBITS_GE_512: ret
1088   %lhs = load <8 x i64>, <8 x i64>* %a
1089   %rhs = load <8 x i64>, <8 x i64>* %b
     ; Shift amount of 64 broadcast to every i128 lane.
1090   %shamt.elt = insertelement <8 x i128> undef, i128 64, i128 0
1091   %shamt = shufflevector <8 x i128> %shamt.elt, <8 x i128> undef, <8 x i32> zeroinitializer
1092   %lhs.wide = zext <8 x i64> %lhs to <8 x i128>
1093   %rhs.wide = zext <8 x i64> %rhs to <8 x i128>
1094   %prod = mul <8 x i128> %lhs.wide, %rhs.wide
1095   %prod.hi = lshr <8 x i128> %prod, %shamt
1096   %hi = trunc <8 x i128> %prod.hi to <8 x i64>
1097   store <8 x i64> %hi, <8 x i64>* %a
1098   ret void
1099 }
1100
1101 define void @umulh_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 {
     ; <16 x i64> unsigned multiply-high computed through i128: zext, mul,
     ; lshr by 64, trunc; the narrowed result is stored back through %a.
     ; A doubleword-element umulh is expected for every minimum vector length
     ; of 1024 bits and above.
1102 ; CHECK-LABEL: umulh_v16i64:
1103 ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
1104 ; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1105 ; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1106 ; VBITS_GE_1024: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1107 ; VBITS_GE_1024: st1d { [[RES]].d }, [[PG]], [x0]
1108 ; VBITS_GE_1024: ret
1109   %lhs = load <16 x i64>, <16 x i64>* %a
1110   %rhs = load <16 x i64>, <16 x i64>* %b
     ; Shift amount of 64 broadcast to every i128 lane.
1111   %shamt.elt = insertelement <16 x i128> undef, i128 64, i128 0
1112   %shamt = shufflevector <16 x i128> %shamt.elt, <16 x i128> undef, <16 x i32> zeroinitializer
1113   %lhs.wide = zext <16 x i64> %lhs to <16 x i128>
1114   %rhs.wide = zext <16 x i64> %rhs to <16 x i128>
1115   %prod = mul <16 x i128> %lhs.wide, %rhs.wide
1116   %prod.hi = lshr <16 x i128> %prod, %shamt
1117   %hi = trunc <16 x i128> %prod.hi to <16 x i64>
1118   store <16 x i64> %hi, <16 x i64>* %a
1119   ret void
1120 }
1121
1122 define void @umulh_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 {
     ; <32 x i64> unsigned multiply-high computed through i128: zext, mul,
     ; lshr by 64, trunc; the narrowed result is stored back through %a.
     ; NOTE(review): no RUN line visible in this excerpt enables the VBITS_GE_2048
     ; prefix; presumably one exists further down the file header - confirm.
1123 ; CHECK-LABEL: umulh_v32i64:
1124 ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
1125 ; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
1126 ; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
1127 ; VBITS_GE_2048: umulh [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
1128 ; VBITS_GE_2048: st1d { [[RES]].d }, [[PG]], [x0]
1129 ; VBITS_GE_2048: ret
1130   %lhs = load <32 x i64>, <32 x i64>* %a
1131   %rhs = load <32 x i64>, <32 x i64>* %b
     ; Shift amount of 64 broadcast to every i128 lane.
1132   %shamt.elt = insertelement <32 x i128> undef, i128 64, i128 0
1133   %shamt = shufflevector <32 x i128> %shamt.elt, <32 x i128> undef, <32 x i32> zeroinitializer
1134   %lhs.wide = zext <32 x i64> %lhs to <32 x i128>
1135   %rhs.wide = zext <32 x i64> %rhs to <32 x i128>
1136   %prod = mul <32 x i128> %lhs.wide, %rhs.wide
1137   %prod.hi = lshr <32 x i128> %prod, %shamt
1138   %hi = trunc <32 x i128> %prod.hi to <32 x i64>
1139   store <32 x i64> %hi, <32 x i64>* %a
1140   ret void
1141 }
1142 attributes #0 = { "target-features"="+sve" } ; attribute group #0: turns on the SVE target feature for every function tagged #0