llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll

   1 ; RUN: llc -aarch64-sve-vector-bits-min=128  -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
   2 ; RUN: llc -aarch64-sve-vector-bits-min=256  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
   3 ; RUN: llc -aarch64-sve-vector-bits-min=384  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
   4 ; RUN: llc -aarch64-sve-vector-bits-min=512  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
   5 ; RUN: llc -aarch64-sve-vector-bits-min=640  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
   6 ; RUN: llc -aarch64-sve-vector-bits-min=768  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
   7 ; RUN: llc -aarch64-sve-vector-bits-min=896  -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
   8 ; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
   9 ; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  10 ; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  11 ; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  12 ; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  13 ; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  14 ; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  15 ; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
  16 ; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
  17
  18 target triple = "aarch64-unknown-linux-gnu" ; module-level target; the llc invocations above pass no -mtriple of their own
  19
  20 ; Don't use SVE when its registers are no bigger than NEON.
  21 ; NO_SVE-NOT: ptrue
  22
  23 ;
  24 ; FCVT H -> S
  25 ;
  26
  27 ; Don't use SVE for 64-bit vectors: the <2 x float> result is 2 x 32 = 64 bits and the checks below expect a single fcvtl on v registers.
  28 define <2 x float> @fcvt_v2f16_v2f32(<2 x half> %op1) #0 {
  29 ; CHECK-LABEL: fcvt_v2f16_v2f32:
  30 ; CHECK: fcvtl v0.4s, v0.4h
  31 ; CHECK-NEXT: ret
  32   %res = fpext <2 x half> %op1 to <2 x float> ; widen each half lane to float
  33   ret <2 x float> %res
  34 }
  35
  36 ; Don't use SVE for 128-bit vectors: the <4 x float> result is 4 x 32 = 128 bits and the checks below expect a single fcvtl on v registers.
  37 define <4 x float> @fcvt_v4f16_v4f32(<4 x half> %op1) #0 {
  38 ; CHECK-LABEL: fcvt_v4f16_v4f32:
  39 ; CHECK: fcvtl v0.4s, v0.4h
  40 ; CHECK-NEXT: ret
  41   %res = fpext <4 x half> %op1 to <4 x float> ; widen each half lane to float
  42   ret <4 x float> %res
  43 }
  44
  45 define void @fcvt_v8f16_v8f32(<8 x half>* %a, <8 x float>* %b) #0 { ; half -> float on 8 lanes; the 256-bit result is expected to use SVE (ptrue/uunpklo/fcvt/st1w)
  46 ; CHECK-LABEL: fcvt_v8f16_v8f32:
  47 ; CHECK: ldr q[[OP:[0-9]+]], [x0]
  48 ; CHECK-NEXT: ptrue [[PG:p[0-9]+]].s, vl8
  49 ; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].s, z[[OP]].h
  50 ; CHECK-NEXT: fcvt [[RES:z[0-9]+]].s, [[PG]]/m, [[UPK]].h
  51 ; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x1]
  52 ; CHECK-NEXT: ret
  53   %op1 = load <8 x half>, <8 x half>* %a ; source vector comes from memory at %a
  54   %res = fpext <8 x half> %op1 to <8 x float> ; the operation under test
  55   store <8 x float> %res, <8 x float>* %b ; result is written back through %b
  56   ret void
  57 }
  58
  59 define void @fcvt_v16f16_v16f32(<16 x half>* %a, <16 x float>* %b) #0 { ; half -> float on 16 lanes (512-bit result); single SVE op at >=512 bits, split into LO/HI halves at 256 bits
  60 ; CHECK-LABEL: fcvt_v16f16_v16f32:
  61 ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16
  62 ; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
  63 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
  64 ; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
  65 ; VBITS_GE_512-NEXT: fcvt [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
  66 ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG2]], [x1]
  67 ; VBITS_GE_512-NEXT: ret
  68
  69 ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently.
  70 ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl16
  71 ; VBITS_EQ_256-DAG: ld1h { [[VEC:z[0-9]+]].h }, [[PG1]]/z, [x0]
  72 ; VBITS_EQ_256-DAG: st1h { [[VEC]].h }, [[PG1]], [x8]
  73 ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp]
  74 ; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl8
  75 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
  76 ; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].s, z[[LO]].h
  77 ; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].s, z[[HI]].h
  78 ; VBITS_EQ_256-DAG: fcvt [[RES_LO:z[0-9]+]].s, [[PG2]]/m, [[UPK_LO]].h
  79 ; VBITS_EQ_256-DAG: fcvt [[RES_HI:z[0-9]+]].s, [[PG2]]/m, [[UPK_HI]].h
  80 ; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG2]], [x1]
  81 ; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG2]], [x1, x[[NUMELTS]], lsl #2]
  82   %op1 = load <16 x half>, <16 x half>* %a ; source vector comes from memory at %a
  83   %res = fpext <16 x half> %op1 to <16 x float> ; the operation under test
  84   store <16 x float> %res, <16 x float>* %b ; stored with the .s predicate (PG2), which matches the float element size
  85   ret void
  86 }
  87
  88 define void @fcvt_v32f16_v32f32(<32 x half>* %a, <32 x float>* %b) #0 { ; half -> float on 32 lanes (1024-bit result); only checked when the minimum SVE width is >=1024 bits
  89 ; CHECK-LABEL: fcvt_v32f16_v32f32:
  90 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32
  91 ; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
  92 ; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s, vl32
  93 ; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
  94 ; VBITS_GE_1024-NEXT: fcvt [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
  95 ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG2]], [x1]
  96 ; VBITS_GE_1024-NEXT: ret
  97   %op1 = load <32 x half>, <32 x half>* %a ; source vector comes from memory at %a
  98   %res = fpext <32 x half> %op1 to <32 x float> ; the operation under test
  99   store <32 x float> %res, <32 x float>* %b ; stored with the .s predicate (PG2), which matches the float element size
 100   ret void
 101 }
 102
 103 define void @fcvt_v64f16_v64f32(<64 x half>* %a, <64 x float>* %b) #0 { ; half -> float on 64 lanes (2048-bit result); only checked when the minimum SVE width is 2048 bits
 104 ; CHECK-LABEL: fcvt_v64f16_v64f32:
 105 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64
 106 ; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
 107 ; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s, vl64
 108 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[OP]].h
 109 ; VBITS_GE_2048-NEXT: fcvt [[RES:z[0-9]+]].s, [[PG2]]/m, [[UPK]].h
 110 ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG2]], [x1]
 111 ; VBITS_GE_2048-NEXT: ret
 112   %op1 = load <64 x half>, <64 x half>* %a ; source vector comes from memory at %a
 113   %res = fpext <64 x half> %op1 to <64 x float> ; the operation under test
 114   store <64 x float> %res, <64 x float>* %b ; stored with the .s predicate (PG2), which matches the float element size
 115   ret void
 116 }
 117
 118 ;
 119 ; FCVT H -> D
 120 ;
 121
 122 ; Don't use SVE for 64-bit vectors: the single-lane <1 x double> result is expected to lower to a scalar fcvt (d0 from h0).
 123 define <1 x double> @fcvt_v1f16_v1f64(<1 x half> %op1) #0 {
 124 ; CHECK-LABEL: fcvt_v1f16_v1f64:
 125 ; CHECK: fcvt d0, h0
 126 ; CHECK-NEXT: ret
 127   %res = fpext <1 x half> %op1 to <1 x double> ; widen the one half lane to double
 128   ret <1 x double> %res
 129 }
 130
 131 ; v2f16 is not legal for NEON, so use SVE: the checks expect two uunpklo steps (.h -> .s -> .d) followed by a predicated fcvt.
 132 define <2 x double> @fcvt_v2f16_v2f64(<2 x half> %op1) #0 {
 133 ; CHECK-LABEL: fcvt_v2f16_v2f64:
 134 ; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
 135 ; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z0.h
 136 ; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
 137 ; CHECK-NEXT: fcvt z0.d, [[PG]]/m, [[UPK2]].h
 138 ; CHECK-NEXT: ret
 139   %res = fpext <2 x half> %op1 to <2 x double> ; widen each half lane to double
 140   ret <2 x double> %res
 141 }
 142
 143 define void @fcvt_v4f16_v4f64(<4 x half>* %a, <4 x double>* %b) #0 { ; half -> double on 4 lanes (256-bit result); expected as two unpacks plus one predicated fcvt
 144 ; CHECK-LABEL: fcvt_v4f16_v4f64:
 145 ; CHECK: ldr d[[OP:[0-9]+]], [x0]
 146 ; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
 147 ; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
 148 ; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
 149 ; CHECK-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
 150 ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
 151 ; CHECK-NEXT: ret
 152   %op1 = load <4 x half>, <4 x half>* %a ; source vector comes from memory at %a
 153   %res = fpext <4 x half> %op1 to <4 x double> ; the operation under test
 154   store <4 x double> %res, <4 x double>* %b ; result is written back through %b
 155   ret void
 156 }
 157
 158 define void @fcvt_v8f16_v8f64(<8 x half>* %a, <8 x double>* %b) #0 { ; half -> double on 8 lanes (512-bit result); single SVE op at >=512 bits, split into LO/HI halves at 256 bits
 159 ; CHECK-LABEL: fcvt_v8f16_v8f64:
 160 ; VBITS_GE_512: ldr q[[OP:[0-9]+]], [x0]
 161 ; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
 162 ; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[OP]].h
 163 ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
 164 ; VBITS_GE_512-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK2]].h
 165 ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
 166 ; VBITS_GE_512-NEXT: ret
 167
 168 ; Ensure sensible type legalisation.
 169 ; VBITS_EQ_256-DAG: ldr q[[OP:[0-9]+]], [x0]
 170 ; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
 171 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
 172 ; VBITS_EQ_256-DAG: ext v[[HI:[0-9]+]].16b, v[[OP]].16b, v[[OP]].16b, #8
 173 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[OP]].h
 174 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[HI]].h
 175 ; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
 176 ; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
 177 ; VBITS_EQ_256-DAG: fcvt [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[UPK2_LO]].h
 178 ; VBITS_EQ_256-DAG: fcvt [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[UPK2_HI]].h
 179 ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x1]
 180 ; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x1, x[[NUMELTS]], lsl #3]
 181   %op1 = load <8 x half>, <8 x half>* %a ; source vector comes from memory at %a
 182   %res = fpext <8 x half> %op1 to <8 x double> ; the operation under test
 183   store <8 x double> %res, <8 x double>* %b ; result is written back through %b
 184   ret void
 185 }
 186
 187 define void @fcvt_v16f16_v16f64(<16 x half>* %a, <16 x double>* %b) #0 { ; half -> double on 16 lanes (1024-bit result); only checked when the minimum SVE width is >=1024 bits
 188 ; CHECK-LABEL: fcvt_v16f16_v16f64:
 189 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl16
 190 ; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
 191 ; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
 192 ; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
 193 ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
 194 ; VBITS_GE_1024-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
 195 ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG2]], [x1]
 196 ; VBITS_GE_1024-NEXT: ret
 197   %op1 = load <16 x half>, <16 x half>* %a ; source vector comes from memory at %a
 198   %res = fpext <16 x half> %op1 to <16 x double> ; the operation under test
 199   store <16 x double> %res, <16 x double>* %b ; stored with the .d predicate (PG2), which matches the double element size
 200   ret void
 201 }
 202
 203 define void @fcvt_v32f16_v32f64(<32 x half>* %a, <32 x double>* %b) #0 { ; half -> double on 32 lanes (2048-bit result); only checked when the minimum SVE width is 2048 bits
 204 ; CHECK-LABEL: fcvt_v32f16_v32f64:
 205 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl32
 206 ; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG1]]/z, [x0]
 207 ; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
 208 ; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[OP]].h
 209 ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
 210 ; VBITS_GE_2048-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK2]].h
 211 ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG2]], [x1]
 212 ; VBITS_GE_2048-NEXT: ret
 213   %op1 = load <32 x half>, <32 x half>* %a ; source vector comes from memory at %a
 214   %res = fpext <32 x half> %op1 to <32 x double> ; the operation under test
 215   store <32 x double> %res, <32 x double>* %b ; stored with the .d predicate (PG2), which matches the double element size
 216   ret void
 217 }
 218
 219 ;
 220 ; FCVT S -> D
 221 ;
 222
 223 ; Don't use SVE for 64-bit vectors: the <1 x double> result is 64 bits and the checks below expect a single fcvtl on v registers.
 224 define <1 x double> @fcvt_v1f32_v1f64(<1 x float> %op1) #0 {
 225 ; CHECK-LABEL: fcvt_v1f32_v1f64:
 226 ; CHECK: fcvtl v0.2d, v0.2s
 227 ; CHECK-NEXT: ret
 228   %res = fpext <1 x float> %op1 to <1 x double> ; widen the one float lane to double
 229   ret <1 x double> %res
 230 }
 231
 232 ; Don't use SVE for 128-bit vectors: the <2 x double> result is 2 x 64 = 128 bits and the checks below expect a single fcvtl on v registers.
 233 define <2 x double> @fcvt_v2f32_v2f64(<2 x float> %op1) #0 {
 234 ; CHECK-LABEL: fcvt_v2f32_v2f64:
 235 ; CHECK: fcvtl v0.2d, v0.2s
 236 ; CHECK-NEXT: ret
 237   %res = fpext <2 x float> %op1 to <2 x double> ; widen each float lane to double
 238   ret <2 x double> %res
 239 }
 240
 241 define void @fcvt_v4f32_v4f64(<4 x float>* %a, <4 x double>* %b) #0 { ; float -> double on 4 lanes (256-bit result); expected as one unpack plus one predicated fcvt
 242 ; CHECK-LABEL: fcvt_v4f32_v4f64:
 243 ; CHECK: ldr q[[OP:[0-9]+]], [x0]
 244 ; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
 245 ; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[OP]].s
 246 ; CHECK-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG]]/m, [[UPK]].s
 247 ; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x1]
 248 ; CHECK-NEXT: ret
 249   %op1 = load <4 x float>, <4 x float>* %a ; source vector comes from memory at %a
 250   %res = fpext <4 x float> %op1 to <4 x double> ; the operation under test
 251   store <4 x double> %res, <4 x double>* %b ; result is written back through %b
 252   ret void
 253 }
 254
 255 define void @fcvt_v8f32_v8f64(<8 x float>* %a, <8 x double>* %b) #0 { ; float -> double on 8 lanes (512-bit result); single SVE op at >=512 bits, split into LO/HI halves at 256 bits
 256 ; CHECK-LABEL: fcvt_v8f32_v8f64:
 257 ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl8
 258 ; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
 259 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
 260 ; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
 261 ; VBITS_GE_512-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
 262 ; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG2]], [x1]
 263 ; VBITS_GE_512-NEXT: ret
 264
 265 ; Ensure sensible type legalisation - fixed type extract_subvector codegen is poor currently.
 266 ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
 267 ; VBITS_EQ_256-DAG: ld1w { [[VEC:z[0-9]+]].s }, [[PG1]]/z, [x0]
 268 ; VBITS_EQ_256-DAG: st1w { [[VEC]].s }, [[PG1]], [x8]
 269 ; VBITS_EQ_256-DAG: ldp q[[LO:[0-9]+]], q[[HI:[0-9]+]], [sp]
 270 ; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d, vl4
 271 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
 272 ; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, z[[LO]].s
 273 ; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, z[[HI]].s
 274 ; VBITS_EQ_256-DAG: fcvt [[RES_LO:z[0-9]+]].d, [[PG2]]/m, [[UPK_LO]].s
 275 ; VBITS_EQ_256-DAG: fcvt [[RES_HI:z[0-9]+]].d, [[PG2]]/m, [[UPK_HI]].s
 276 ; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG2]], [x1]
 277 ; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG2]], [x1, x[[NUMELTS]], lsl #3]
 278   %op1 = load <8 x float>, <8 x float>* %a ; source vector comes from memory at %a
 279   %res = fpext <8 x float> %op1 to <8 x double> ; the operation under test
 280   store <8 x double> %res, <8 x double>* %b ; stored with the .d predicate (PG2), which matches the double element size
 281   ret void
 282 }
 283
 284 define void @fcvt_v16f32_v16f64(<16 x float>* %a, <16 x double>* %b) #0 { ; float -> double on 16 lanes (1024-bit result); only checked when the minimum SVE width is >=1024 bits
 285 ; CHECK-LABEL: fcvt_v16f32_v16f64:
 286 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl16
 287 ; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
 288 ; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d, vl16
 289 ; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
 290 ; VBITS_GE_1024-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
 291 ; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG2]], [x1]
 292 ; VBITS_GE_1024-NEXT: ret
 293   %op1 = load <16 x float>, <16 x float>* %a ; source vector comes from memory at %a
 294   %res = fpext <16 x float> %op1 to <16 x double> ; the operation under test
 295   store <16 x double> %res, <16 x double>* %b ; stored with the .d predicate (PG2), which matches the double element size
 296   ret void
 297 }
 298
 299 define void @fcvt_v32f32_v32f64(<32 x float>* %a, <32 x double>* %b) #0 { ; float -> double on 32 lanes (2048-bit result); only checked when the minimum SVE width is 2048 bits
 300 ; CHECK-LABEL: fcvt_v32f32_v32f64:
 301 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl32
 302 ; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
 303 ; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
 304 ; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[OP]].s
 305 ; VBITS_GE_2048-NEXT: fcvt [[RES:z[0-9]+]].d, [[PG2]]/m, [[UPK]].s
 306 ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG2]], [x1]
 307 ; VBITS_GE_2048-NEXT: ret
 308   %op1 = load <32 x float>, <32 x float>* %a ; source vector comes from memory at %a
 309   %res = fpext <32 x float> %op1 to <32 x double> ; the operation under test
 310   store <32 x double> %res, <32 x double>* %b ; stored with the .d predicate (PG2), which matches the double element size
 311   ret void
 312 }
 313
 314 ;
 315 ; FCVT S -> H
 316 ;
 317
 318 ; Don't use SVE for 64-bit vectors: the <2 x float> source is 64 bits and the checks below expect a single fcvtn on v registers.
 319 define <2 x half> @fcvt_v2f32_v2f16(<2 x float> %op1) #0 {
 320 ; CHECK-LABEL: fcvt_v2f32_v2f16:
 321 ; CHECK: fcvtn v0.4h, v0.4s
 322 ; CHECK-NEXT: ret
 323   %res = fptrunc <2 x float> %op1 to <2 x half> ; narrow each float lane to half
 324   ret <2 x half> %res
 325 }
 326
 327 ; Don't use SVE for 128-bit vectors: the <4 x float> source is 128 bits and the checks below expect a single fcvtn on v registers.
 328 define <4 x half> @fcvt_v4f32_v4f16(<4 x float> %op1) #0 {
 329 ; CHECK-LABEL: fcvt_v4f32_v4f16:
 330 ; CHECK: fcvtn v0.4h, v0.4s
 331 ; CHECK-NEXT: ret
 332   %res = fptrunc <4 x float> %op1 to <4 x half> ; narrow each float lane to half
 333   ret <4 x half> %res
 334 }
 335
 336 define <8 x half> @fcvt_v8f32_v8f16(<8 x float>* %a) #0 { ; float -> half on 8 lanes (256-bit source); expected as a predicated fcvt followed by uzp1 to pack the .h results
 337 ; CHECK-LABEL: fcvt_v8f32_v8f16:
 338 ; CHECK: ptrue [[PG1:p[0-9]+]].s, vl8
 339 ; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
 340 ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
 341 ; CHECK-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].s
 342 ; CHECK-NEXT: uzp1 z0.h, [[CVT]].h, [[CVT]].h
 343 ; CHECK-NEXT: ret
 344   %op1 = load <8 x float>, <8 x float>* %a ; source vector comes from memory at %a
 345   %res = fptrunc <8 x float> %op1 to <8 x half> ; the operation under test
 346   ret <8 x half> %res
 347 }
 348
 349 define void @fcvt_v16f32_v16f16(<16 x float>* %a, <16 x half>* %b) #0 { ; float -> half on 16 lanes (512-bit source); single SVE op at >=512 bits, two halves joined with splice at 256 bits
 350 ; CHECK-LABEL: fcvt_v16f32_v16f16:
 351 ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl16
 352 ; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
 353 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
 354 ; VBITS_GE_512-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].s
 355 ; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
 356 ; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
 357 ; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
 358 ; VBITS_GE_512-NEXT: ret
 359
 360 ; Ensure sensible type legalisation
 361 ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
 362 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
 363 ; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG1]]/z, [x0]
 364 ; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #2]
 365 ; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s
 366 ; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].h, vl8
 367 ; VBITS_EQ_256-DAG: fcvt [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].s
 368 ; VBITS_EQ_256-DAG: fcvt [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].s
 369 ; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].h, [[CVT_LO]].h, [[CVT_LO]].h
 370 ; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].h, [[CVT_HI]].h, [[CVT_HI]].h
 371 ; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].h, [[PG3]], [[RES_LO]].h, [[RES_HI]].h
 372 ; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].h, vl16
 373 ; VBITS_EQ_256-DAG: st1h { [[RES]].h }, [[PG4]], [x1]
 374   %op1 = load <16 x float>, <16 x float>* %a ; source vector comes from memory at %a
 375   %res = fptrunc <16 x float> %op1 to <16 x half> ; the operation under test
 376   store <16 x half> %res, <16 x half>* %b ; result is written back through %b
 377   ret void
 378 }
 379
 380 define void @fcvt_v32f32_v32f16(<32 x float>* %a, <32 x half>* %b) #0 { ; float -> half on 32 lanes (1024-bit source); only checked when the minimum SVE width is >=1024 bits
 381 ; CHECK-LABEL: fcvt_v32f32_v32f16:
 382 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl32
 383 ; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
 384 ; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
 385 ; VBITS_GE_1024-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].s
 386 ; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
 387 ; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
 388 ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
 389 ; VBITS_GE_1024-NEXT: ret
 390   %op1 = load <32 x float>, <32 x float>* %a ; source vector comes from memory at %a
 391   %res = fptrunc <32 x float> %op1 to <32 x half> ; the operation under test
 392   store <32 x half> %res, <32 x half>* %b ; result is written back through %b
 393   ret void
 394 }
 395
 396 define void @fcvt_v64f32_v64f16(<64 x float>* %a, <64 x half>* %b) #0 { ; float -> half on 64 lanes (2048-bit source); only checked when the minimum SVE width is 2048 bits
 397 ; CHECK-LABEL: fcvt_v64f32_v64f16:
 398 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl64
 399 ; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG1]]/z, [x0]
 400 ; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
 401 ; VBITS_GE_2048-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].s
 402 ; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[CVT]].h, [[CVT]].h
 403 ; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl64
 404 ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
 405 ; VBITS_GE_2048-NEXT: ret
 406   %op1 = load <64 x float>, <64 x float>* %a ; source vector comes from memory at %a
 407   %res = fptrunc <64 x float> %op1 to <64 x half> ; the operation under test
 408   store <64 x half> %res, <64 x half>* %b ; result is written back through %b
 409   ret void
 410 }
 411
 412 ;
 413 ; FCVT D -> H
 414 ;
 415
 416 ; Don't use SVE for 64-bit vectors: the single-lane <1 x double> source is expected to lower to a scalar fcvt (h0 from d0).
 417 define <1 x half> @fcvt_v1f64_v1f16(<1 x double> %op1) #0 {
 418 ; CHECK-LABEL: fcvt_v1f64_v1f16:
 419 ; CHECK: fcvt h0, d0
 420 ; CHECK-NEXT: ret
 421   %res = fptrunc <1 x double> %op1 to <1 x half> ; narrow the one double lane to half
 422   ret <1 x half> %res
 423 }
 424
 425 ; v2f16 is not legal for NEON, so use SVE: the checks expect a predicated fcvt followed by two uzp1 steps (.s then .h) to pack the result.
 426 define <2 x half> @fcvt_v2f64_v2f16(<2 x double> %op1) #0 {
 427 ; CHECK-LABEL: fcvt_v2f64_v2f16:
 428 ; CHECK: ptrue [[PG:p[0-9]+]].d
 429 ; CHECK-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG]]/m, z0.d
 430 ; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 431 ; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
 432 ; CHECK-NEXT: ret
 433   %res = fptrunc <2 x double> %op1 to <2 x half> ; narrow each double lane to half
 434   ret <2 x half> %res
 435 }
 436
 437 define <4 x half> @fcvt_v4f64_v4f16(<4 x double>* %a) #0 { ; double -> half on 4 lanes (256-bit source); expected as a predicated fcvt plus two uzp1 packing steps
 438 ; CHECK-LABEL: fcvt_v4f64_v4f16:
 439 ; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
 440 ; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 441 ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
 442 ; CHECK-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].d
 443 ; CHECK-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 444 ; CHECK-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
 445 ; CHECK-NEXT: ret
 446   %op1 = load <4 x double>, <4 x double>* %a ; source vector comes from memory at %a
 447   %res = fptrunc <4 x double> %op1 to <4 x half> ; the operation under test
 448   ret <4 x half> %res
 449 }
 450
 451 define <8 x half> @fcvt_v8f64_v8f16(<8 x double>* %a) #0 { ; double -> half on 8 lanes (512-bit source); single SVE op at >=512 bits, two halves merged with a lane mov at 256 bits
 452 ; CHECK-LABEL: fcvt_v8f64_v8f16:
 453 ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
 454 ; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 455 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
 456 ; VBITS_GE_512-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].d
 457 ; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 458 ; VBITS_GE_512-NEXT: uzp1 z0.h, [[UZP]].h, [[UZP]].h
 459 ; VBITS_GE_512-NEXT: ret
 460
 461 ; Ensure sensible type legalisation
 462 ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
 463 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
 464 ; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
 465 ; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
 466 ; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
 467 ; VBITS_EQ_256-DAG: fcvt [[CVT_LO:z[0-9]+]].h, [[PG2]]/m, [[LO]].d
 468 ; VBITS_EQ_256-DAG: fcvt [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d
 469 ; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
 470 ; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
 471 ; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
 472 ; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
 473 ; VBITS_EQ_256-DAG: mov v0.d[1], v[[RES_HI]].d[0]
 474   %op1 = load <8 x double>, <8 x double>* %a ; source vector comes from memory at %a
 475   %res = fptrunc <8 x double> %op1 to <8 x half> ; the operation under test
 476   ret <8 x half> %res
 477 }
 478
 479 define void @fcvt_v16f64_v16f16(<16 x double>* %a, <16 x half>* %b) #0 { ; double -> half on 16 lanes (1024-bit source); only checked when the minimum SVE width is >=1024 bits
 480 ; CHECK-LABEL: fcvt_v16f64_v16f16:
 481 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
 482 ; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 483 ; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
 484 ; VBITS_GE_1024-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].d
 485 ; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 486 ; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
 487 ; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].h, vl16
 488 ; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
 489 ; VBITS_GE_1024-NEXT: ret
 490   %op1 = load <16 x double>, <16 x double>* %a ; source vector comes from memory at %a
 491   %res = fptrunc <16 x double> %op1 to <16 x half> ; the operation under test
 492   store <16 x half> %res, <16 x half>* %b ; result is written back through %b
 493   ret void
 494 }
 495
 496 define void @fcvt_v32f64_v32f16(<32 x double>* %a, <32 x half>* %b) #0 { ; double -> half on 32 lanes (2048-bit source); only checked when the minimum SVE width is 2048 bits
 497 ; CHECK-LABEL: fcvt_v32f64_v32f16:
 498 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
 499 ; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 500 ; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
 501 ; VBITS_GE_2048-NEXT: fcvt [[CVT:z[0-9]+]].h, [[PG2]]/m, [[OP]].d
 502 ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 503 ; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].h, [[UZP]].h, [[UZP]].h
 504 ; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].h, vl32
 505 ; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG3]], [x1]
 506 ; VBITS_GE_2048-NEXT: ret
 507   %op1 = load <32 x double>, <32 x double>* %a ; source vector comes from memory at %a
 508   %res = fptrunc <32 x double> %op1 to <32 x half> ; the operation under test
 509   store <32 x half> %res, <32 x half>* %b ; result is written back through %b
 510   ret void
 511 }
 512
 513 ;
 514 ; FCVT D -> S
 515 ;
 516
 517 ; Don't use SVE for 64-bit vectors: the <1 x double> source is 64 bits and the checks below expect a single fcvtn on v registers.
 518 define <1 x float> @fcvt_v1f64_v1f32(<1 x double> %op1) #0 {
 519 ; CHECK-LABEL: fcvt_v1f64_v1f32:
 520 ; CHECK: fcvtn v0.2s, v0.2d
 521 ; CHECK-NEXT: ret
 522   %res = fptrunc <1 x double> %op1 to <1 x float> ; narrow the one double lane to float
 523   ret <1 x float> %res
 524 }
 525
 526 ; Don't use SVE for 128-bit vectors: the <2 x double> source is 2 x 64 = 128 bits and the checks below expect a single fcvtn on v registers.
 527 define <2 x float> @fcvt_v2f64_v2f32(<2 x double> %op1) #0 {
 528 ; CHECK-LABEL: fcvt_v2f64_v2f32:
 529 ; CHECK: fcvtn v0.2s, v0.2d
 530 ; CHECK-NEXT: ret
 531   %res = fptrunc <2 x double> %op1 to <2 x float> ; narrow each double lane to float
 532   ret <2 x float> %res
 533 }
 534
 535 define <4 x float> @fcvt_v4f64_v4f32(<4 x double>* %a) #0 { ; double -> float on 4 lanes (256-bit source); expected as a predicated fcvt followed by uzp1 to pack the .s results
 536 ; CHECK-LABEL: fcvt_v4f64_v4f32:
 537 ; CHECK: ptrue [[PG1:p[0-9]+]].d, vl4
 538 ; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 539 ; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
 540 ; CHECK-NEXT: fcvt [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].d
 541 ; CHECK-NEXT: uzp1 z0.s, [[CVT]].s, [[CVT]].s
 542 ; CHECK-NEXT: ret
 543   %op1 = load <4 x double>, <4 x double>* %a ; source vector comes from memory at %a
 544   %res = fptrunc <4 x double> %op1 to <4 x float> ; the operation under test
 545   ret <4 x float> %res
 546 }
 547
 548 define void @fcvt_v8f64_v8f32(<8 x double>* %a, <8 x float>* %b) #0 { ; double -> float on 8 lanes (512-bit source); single SVE op at >=512 bits, two halves joined with splice at 256 bits
 549 ; CHECK-LABEL: fcvt_v8f64_v8f32:
 550 ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl8
 551 ; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 552 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
 553 ; VBITS_GE_512-NEXT: fcvt [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].d
 554 ; VBITS_GE_512-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 555 ; VBITS_GE_512-NEXT: ptrue [[PG3:p[0-9]+]].s, vl8
 556 ; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
 557 ; VBITS_GE_512-NEXT: ret
 558
 559 ; Ensure sensible type legalisation
 560 ; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].d, vl4
 561 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
 562 ; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG1]]/z, [x0]
 563 ; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG1]]/z, [x0, x[[NUMELTS]], lsl #3]
 564 ; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].d
 565 ; VBITS_EQ_256-DAG: ptrue [[PG3:p[0-9]+]].s, vl4
 566 ; VBITS_EQ_256-DAG: fcvt [[CVT_LO:z[0-9]+]].s, [[PG2]]/m, [[LO]].d
 567 ; VBITS_EQ_256-DAG: fcvt [[CVT_HI:z[0-9]+]].s, [[PG2]]/m, [[HI]].d
 568 ; VBITS_EQ_256-DAG: uzp1 [[RES_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
 569 ; VBITS_EQ_256-DAG: uzp1 [[RES_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
 570 ; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG3]], [[RES_LO]].s, [[RES_HI]].s
 571 ; VBITS_EQ_256-DAG: ptrue [[PG4:p[0-9]+]].s, vl8
 572 ; VBITS_EQ_256-NEXT: st1w { [[RES]].s }, [[PG4]], [x1]
 573 ; VBITS_EQ_256-NEXT: ret
 574   %op1 = load <8 x double>, <8 x double>* %a ; source vector comes from memory at %a
 575   %res = fptrunc <8 x double> %op1 to <8 x float> ; the operation under test
 576   store <8 x float> %res, <8 x float>* %b ; result is written back through %b
 577   ret void
 578 }
 579
 580 define void @fcvt_v16f64_v16f32(<16 x double>* %a, <16 x float>* %b) #0 { ; double -> float on 16 lanes (1024-bit source); only checked when the minimum SVE width is >=1024 bits
 581 ; CHECK-LABEL: fcvt_v16f64_v16f32:
 582 ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl16
 583 ; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 584 ; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
 585 ; VBITS_GE_1024-NEXT: fcvt [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].d
 586 ; VBITS_GE_1024-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 587 ; VBITS_GE_1024-NEXT: ptrue [[PG3:p[0-9]+]].s, vl16
 588 ; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
 589 ; VBITS_GE_1024-NEXT: ret
 590   %op1 = load <16 x double>, <16 x double>* %a ; source vector comes from memory at %a
 591   %res = fptrunc <16 x double> %op1 to <16 x float> ; the operation under test
 592   store <16 x float> %res, <16 x float>* %b ; result is written back through %b
 593   ret void
 594 }
 595
 596 define void @fcvt_v32f64_v32f32(<32 x double>* %a, <32 x float>* %b) #0 { ; double -> float on 32 lanes (2048-bit source); only checked when the minimum SVE width is 2048 bits
 597 ; CHECK-LABEL: fcvt_v32f64_v32f32:
 598 ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl32
 599 ; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG1]]/z, [x0]
 600 ; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
 601 ; VBITS_GE_2048-NEXT: fcvt [[CVT:z[0-9]+]].s, [[PG2]]/m, [[OP]].d
 602 ; VBITS_GE_2048-NEXT: uzp1 [[RES:z[0-9]+]].s, [[CVT]].s, [[CVT]].s
 603 ; VBITS_GE_2048-NEXT: ptrue [[PG3:p[0-9]+]].s, vl32
 604 ; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG3]], [x1]
 605 ; VBITS_GE_2048-NEXT: ret
 606   %op1 = load <32 x double>, <32 x double>* %a ; source vector comes from memory at %a
 607   %res = fptrunc <32 x double> %op1 to <32 x float> ; the operation under test
 608   store <32 x float> %res, <32 x float>* %b ; result is written back through %b
 609   ret void
 610 }
 611
 612 attributes #0 = { "target-features"="+sve" } ; attribute group referenced as #0 by every define in this file; turns on the SVE target feature