; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
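
; Each RUN line pins a different minimum SVE register size via
; -aarch64-sve-vector-bits-min and selects the FileCheck prefixes valid at that
; size. Prefixes accumulate, so a 1024-bit run also verifies the VBITS_GE_512
; patterns. VBYTES is the byte width of the largest power-of-two vector length
; the minimum guarantees (a 384-bit minimum still only guarantees 256-bit
; vectors, hence VBYTES=32).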

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; FADDA
;
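
; fadda is SVE's strictly-ordered floating-point add reduction. The plain
; (non-fast) llvm.vector.reduce.fadd calls below must accumulate in element
; order, which NEON cannot do in a single instruction, so SVE is used even
; for NEON-sized vectors.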

; No single instruction NEON support. Use SVE.
define half @fadda_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: fadda_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define half @fadda_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: fadda_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @fadda_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: fadda_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @fadda_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: fadda_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[LO]].h
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[HI]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}
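
; Note on the VBITS_EQ_256 checks above: with 256-bit registers a 512-bit
; operand is legalised as two halves. The high half is loaded with reg+reg
; addressing, scaling the element count by the element size (lsl #1 for
; halves, lsl #2 for words, lsl #3 for doubles), and the two fadda
; instructions stay chained through h0 to preserve the strict accumulation
; order. The same pattern repeats for the wider types below.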

define half @fadda_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: fadda_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @fadda_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: fadda_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: fadda_v2f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: fadda_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @fadda_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: fadda_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @fadda_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: fadda_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[LO]].s
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[HI]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @fadda_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: fadda_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @fadda_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: fadda_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: fadda_v1f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: fadda_v2f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @fadda_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: fadda_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @fadda_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: fadda_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[LO]].d
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[HI]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @fadda_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: fadda_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: fadda_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FADDV
;
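
; The faddv tests mark their reductions 'fast': reassociation is what allows
; the compiler to use SVE's unordered faddv (or NEON pairwise adds) instead
; of the ordered fadda, with the %start operand folded in by a final scalar
; fadd.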

; No single instruction NEON support for 4 element vectors.
define half @faddv_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: faddv_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support for 8 element vectors.
define half @faddv_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: faddv_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @faddv_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: faddv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @faddv_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: faddv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: faddv [[RDX:h[0-9]+]], [[PG]], [[ADD]].h
; VBITS_EQ_256-DAG: fadd h0, h0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}

define half @faddv_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: faddv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @faddv_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: faddv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; Don't use SVE for 2 element vectors.
define float @faddv_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: faddv_v2f32:
; CHECK: faddp s1, v1.2s
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support for 4 element vectors.
define float @faddv_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: faddv_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], z1.s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @faddv_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: faddv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @faddv_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: faddv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: faddv [[RDX:s[0-9]+]], [[PG]], [[ADD]].s
; VBITS_EQ_256-DAG: fadd s0, s0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @faddv_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: faddv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @faddv_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: faddv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; Don't use SVE for 1 element vectors.
define double @faddv_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: faddv_v1f64:
; CHECK: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; Don't use SVE for 2 element vectors.
define double @faddv_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: faddv_v2f64:
; CHECK: faddp d1, v1.2d
; CHECK-NEXT: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @faddv_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: faddv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fadd d0, d0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @faddv_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: faddv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: faddv [[RDX:d[0-9]+]], [[PG]], [[ADD]].d
; VBITS_EQ_256-DAG: fadd d0, d0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @faddv_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: faddv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: faddv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FMAXV
;
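
; llvm.vector.reduce.fmax has maxnum (NaN-discarding) semantics, so these
; reductions lower to fmaxnm/fmaxnmv rather than fmax/fmaxv. The f16 cases
; can stay on NEON because +sve implies full FP16 support.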

; NEON has 16-bit FMAXNMV when fullfp16 is available (implied by +sve).
define half @fmaxv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v4f16:
; CHECK: fmaxnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
  ret half %res
}

; NEON has 16-bit FMAXNMV when fullfp16 is available (implied by +sve).
define half @fmaxv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v8f16:
; CHECK: fmaxnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
  ret half %res
}

define half @fmaxv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
  ret half %res
}

define half @fmaxv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fmaxnmv h0, [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
  ret half %res
}

define half @fmaxv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
  ret half %res
}

define half @fmaxv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fmaxv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v2f32:
; CHECK: fmaxnmp s0, v0.2s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fmaxv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v4f32:
; CHECK: fmaxnmv s0, v0.4s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  ret float %res
}

define float @fmaxv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
  ret float %res
}

define float @fmaxv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fmaxnmv s0, [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
  ret float %res
}

define float @fmaxv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
  ret float %res
}

define float @fmaxv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fmaxv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v1f64:
; CHECK-NOT: fmax
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fmaxv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v2f64:
; CHECK: fmaxnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
  ret double %res
}

define double @fmaxv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
  ret double %res
}

define double @fmaxv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fmaxnmv d0, [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
  ret double %res
}

define double @fmaxv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
  ret double %res
}

define double @fmaxv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
  ret double %res
}

;
; FMINV
;
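
; The fminv tests mirror fmaxv: llvm.vector.reduce.fmin has minnum semantics,
; so the lowering uses fminnm/fminnmv throughout.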

; NEON has 16-bit FMINNMV when fullfp16 is available (implied by +sve).
define half @fminv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fminv_v4f16:
; CHECK: fminnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
  ret half %res
}

; NEON has 16-bit FMINNMV when fullfp16 is available (implied by +sve).
define half @fminv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fminv_v8f16:
; CHECK: fminnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
  ret half %res
}

define half @fminv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fminv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
  ret half %res
}

define half @fminv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fminv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fminnmv h0, [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
  ret half %res
}

define half @fminv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fminv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
  ret half %res
}

define half @fminv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fminv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fminv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fminv_v2f32:
; CHECK: fminnmp s0, v0.2s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fminv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fminv_v4f32:
; CHECK: fminnmv s0, v0.4s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  ret float %res
}

define float @fminv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fminv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
  ret float %res
}

define float @fminv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fminv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fminnmv s0, [[PG]], [[MIN]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
  ret float %res
}

define float @fminv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fminv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
  ret float %res
}

define float @fminv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fminv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fminv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fminv_v1f64:
; CHECK-NOT: fmin
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fminv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fminv_v2f64:
; CHECK: fminnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
  ret double %res
}

define double @fminv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fminv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
  ret double %res
}

define double @fminv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fminv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fminnmv d0, [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
  ret double %res
}

define double @fminv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fminv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
  ret double %res
}

define double @fminv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fminv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
  ret double %res
}

attributes #0 = { "target-features"="+sve" }
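
; Note: "+sve" transitively enables full FP16 support, which the 16-bit NEON
; fmaxnmv/fminnmv checks above rely on.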

declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)

declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)

declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)

declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)

declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)