; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; FMAXNM
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fmaxnm_v4f16:
; CHECK: fmaxnm v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fmaxnm_v8f16:
; CHECK: fmaxnm v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}
define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fmaxnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.maxnum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.maxnum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.maxnum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fmaxnm_v2f32:
; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fmaxnm_v4f32:
; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fmaxnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.maxnum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.maxnum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fmaxnm_v1f64:
; CHECK: fmaxnm d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fmaxnm_v2f64:
; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fmaxnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.maxnum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

;
; FMINNM
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fminnm_v4f16:
; CHECK: fminnm v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fminnm_v8f16:
; CHECK: fminnm v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}

define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fminnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.minnum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.minnum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.minnum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fminnm_v2f32:
; CHECK: fminnm v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fminnm_v4f32:
; CHECK: fminnm v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fminnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.minnum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.minnum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.minnum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fminnm_v1f64:
; CHECK: fminnm d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fminnm_v2f64:
; CHECK: fminnm v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fminnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.minnum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.minnum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.minnum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

;
; FMAX
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fmax_v4f16:
; CHECK: fmax v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fmax_v8f16:
; CHECK: fmax v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}

define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fmax_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fmax_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fmax_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.maximum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fmax_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.maximum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fmax_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.maximum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fmax_v2f32:
; CHECK: fmax v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fmax_v4f32:
; CHECK: fmax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fmax_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fmax_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fmax_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.maximum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fmax_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.maximum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fmax_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.maximum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fmax_v1f64:
; CHECK: fmax d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fmax_v2f64:
; CHECK: fmax v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fmax_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fmax_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fmax_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.maximum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fmax_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.maximum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fmax_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.maximum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

;
; FMIN
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fmin_v4f16:
; CHECK: fmin v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fmin_v8f16:
; CHECK: fmin v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}

define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fmin_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fmin_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fmin_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.minimum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fmin_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.minimum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fmin_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.minimum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fmin_v2f32:
; CHECK: fmin v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fmin_v4f32:
; CHECK: fmin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fmin_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fmin_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fmin_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.minimum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fmin_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.minimum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fmin_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.minimum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fmin_v1f64:
; CHECK: fmin d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fmin_v2f64:
; CHECK: fmin v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fmin_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fmin_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fmin_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.minimum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fmin_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.minimum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fmin_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fmin_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.minimum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

attributes #0 = { "target-features"="+sve" }
declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.minnum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.minnum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.minnum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.minnum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.minnum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.minnum.v32f64(<32 x double>, <32 x double>)

declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.maxnum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.maxnum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.maxnum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.maxnum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.maxnum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.maxnum.v32f64(<32 x double>, <32 x double>)

declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.minimum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.minimum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.minimum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.minimum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.minimum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.minimum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.minimum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.minimum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.minimum.v32f64(<32 x double>, <32 x double>)

declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.maximum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.maximum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.maximum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.maximum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.maximum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.maximum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.maximum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.maximum.v32f64(<32 x double>, <32 x double>)