; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sme < %s | FileCheck %s

; NOTE: invalid, upper and lower bound immediate values of the reg+imm
; addressing mode are checked only for the byte version of each
; instruction (`ld<N>b`), as the code for detecting the immediate is
; common to all instructions, and varies only in the number of
; elements of the structure load, which is <N> = 2, 3, 4.
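;
; For reference, the reg+imm form takes a signed immediate that must be
; a multiple of <N> in the range [-8*<N>, 7*<N>]:
;   ld2b: multiples of 2 in [-16, 14]
;   ld3b: multiples of 3 in [-24, 21]
;   ld4b: multiples of 4 in [-32, 28]
; Offsets that are out of range or not a multiple of <N> are expected to
; be materialized with RDVL and use the reg+reg form instead, as the
; *_not_multiple_* and *_outside_* tests below verify.
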
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 2
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 14
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_not_multiple_of_2(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_not_multiple_of_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #3
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_outside_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_outside_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #-18
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -18
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld2.nxv32i8_outside_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld2.nxv32i8_outside_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #16
; CHECK-NEXT:    ld2b { z0.b, z1.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 16
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld2.nxv16i16(<vscale x 8 x i1> %Pg, <vscale x 8 x i16>* %addr) {
; CHECK-LABEL: ld2.nxv16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 14
  %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> %Pg, i16 *%base_ptr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @ld2.nxv16f16(<vscale x 8 x i1> %Pg, <vscale x 8 x half>* %addr) {
; CHECK-LABEL: ld2.nxv16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 8 x half>* %base to half *
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> %Pg, half *%base_ptr)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld2.nxv16bf16(<vscale x 8 x i1> %Pg, <vscale x 8 x bfloat>* %addr) #0 {
; CHECK-LABEL: ld2.nxv16bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2h { z0.h, z1.h }, p0/z, [x0, #12, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 12
  %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> %Pg, bfloat *%base_ptr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld2.nxv8i32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32>* %addr) {
; CHECK-LABEL: ld2.nxv8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2w { z0.s, z1.s }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 14
  %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> %Pg, i32 *%base_ptr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @ld2.nxv8f32(<vscale x 4 x i1> %Pg, <vscale x 4 x float>* %addr) {
; CHECK-LABEL: ld2.nxv8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2w { z0.s, z1.s }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 4 x float>* %base to float *
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> %Pg, float *%base_ptr)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @ld2.nxv4i64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64>* %addr) {
; CHECK-LABEL: ld2.nxv4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [x0, #14, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 14
  %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> %Pg, i64 *%base_ptr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @ld2.nxv4f64(<vscale x 2 x i1> %Pg, <vscale x 2 x double>* %addr) {
; CHECK-LABEL: ld2.nxv4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld2d { z0.d, z1.d }, p0/z, [x0, #-16, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -16
  %base_ptr = bitcast <vscale x 2 x double>* %base to double *
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> %Pg, double *%base_ptr)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 3
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 21
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_not_multiple_of_3_01(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_01:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #4
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_not_multiple_of_3_02(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_not_multiple_of_3_02:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #5
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_outside_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_outside_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #-27
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -27
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld3.nxv48i8_outside_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld3.nxv48i8_outside_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #24
; CHECK-NEXT:    ld3b { z0.b - z2.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 24
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld3.nxv24i16(<vscale x 8 x i1> %Pg, <vscale x 8 x i16> *%addr) {
; CHECK-LABEL: ld3.nxv24i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 21
  %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3.sret.nxv8i16(<vscale x 8 x i1> %Pg, i16 *%base_ptr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld3.nxv24f16(<vscale x 8 x i1> %Pg, <vscale x 8 x half> *%addr) {
; CHECK-LABEL: ld3.nxv24f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 21
  %base_ptr = bitcast <vscale x 8 x half>* %base to half *
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3.sret.nxv8f16(<vscale x 8 x i1> %Pg, half *%base_ptr)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld3.nxv24bf16(<vscale x 8 x i1> %Pg, <vscale x 8 x bfloat> *%addr) #0 {
; CHECK-LABEL: ld3.nxv24bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3h { z0.h - z2.h }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3.sret.nxv8bf16(<vscale x 8 x i1> %Pg, bfloat *%base_ptr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld3.nxv12i32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> *%addr) {
; CHECK-LABEL: ld3.nxv12i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 21
  %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> %Pg, i32 *%base_ptr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld3.nxv12f32(<vscale x 4 x i1> %Pg, <vscale x 4 x float> *%addr) {
; CHECK-LABEL: ld3.nxv12f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3w { z0.s - z2.s }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 4 x float>* %base to float *
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> %Pg, float *%base_ptr)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld3.nxv6i64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> *%addr) {
; CHECK-LABEL: ld3.nxv6i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [x0, #21, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 21
  %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> %Pg, i64 *%base_ptr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld3.nxv6f64(<vscale x 2 x i1> %Pg, <vscale x 2 x double> *%addr) {
; CHECK-LABEL: ld3.nxv6f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld3d { z0.d - z2.d }, p0/z, [x0, #-24, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -24
  %base_ptr = bitcast <vscale x 2 x double>* %base to double *
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3.sret.nxv2f64(<vscale x 2 x i1> %Pg, double *%base_ptr)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 4
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 28
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_not_multiple_of_4_01(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_01:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #5
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 5
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_not_multiple_of_4_02(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_02:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #6
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 6
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_not_multiple_of_4_03(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_not_multiple_of_4_03:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #7
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 7
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_outside_lower_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_outside_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    mov x9, #-576
; CHECK-NEXT:    lsr x8, x8, #4
; CHECK-NEXT:    mul x8, x8, x9
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #4) #-9)
; x9 = -9 * 2^6 = -576
; x8 = RDVL * 2^-4
; xOFFSET = RDVL * 2^-4 * -9 * 2^6 = RDVL * -36
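;
; A possible lowering for that FIXME (a sketch, not the current output):
;   rdvl x8, #4      ; x8 = 4 * VL
;   mov  x9, #-9
;   mul  x8, x8, x9  ; x8 = -36 * VL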
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 -36
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @ld4.nxv64i8_outside_upper_bound(<vscale x 16 x i1> %Pg, <vscale x 16 x i8> *%addr) {
; CHECK-LABEL: ld4.nxv64i8_outside_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    mov w9, #512
; CHECK-NEXT:    lsr x8, x8, #4
; CHECK-NEXT:    mul x8, x8, x9
; CHECK-NEXT:    ld4b { z0.b - z3.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
; FIXME: optimize OFFSET computation so that xOFFSET = (mul (RDVL #16) #2)
; x9 = 2^9 = 512
; x8 = RDVL * 2^-4
; xOFFSET = RDVL * 2^-4 * 2^9 = RDVL * 32
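;
; A possible lowering for that FIXME (a sketch, not the current output):
;   rdvl x8, #16     ; x8 = 16 * VL
;   lsl  x8, x8, #1  ; x8 = 32 * VL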
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %addr, i64 32
  %base_ptr = bitcast <vscale x 16 x i8>* %base to i8 *
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1> %Pg, i8 *%base_ptr)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @ld4.nxv32i16(<vscale x 8 x i1> %Pg, <vscale x 8 x i16> *%addr) {
; CHECK-LABEL: ld4.nxv32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4h { z0.h - z3.h }, p0/z, [x0, #8, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %addr, i64 8
  %base_ptr = bitcast <vscale x 8 x i16>* %base to i16 *
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4.sret.nxv8i16(<vscale x 8 x i1> %Pg, i16 *%base_ptr)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @ld4.nxv32f16(<vscale x 8 x i1> %Pg, <vscale x 8 x half> *%addr) {
; CHECK-LABEL: ld4.nxv32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4h { z0.h - z3.h }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %addr, i64 28
  %base_ptr = bitcast <vscale x 8 x half>* %base to half *
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4.sret.nxv8f16(<vscale x 8 x i1> %Pg, half *%base_ptr)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @ld4.nxv32bf16(<vscale x 8 x i1> %Pg, <vscale x 8 x bfloat> *%addr) #0 {
; CHECK-LABEL: ld4.nxv32bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4h { z0.h - z3.h }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 8 x bfloat>* %base to bfloat *
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4.sret.nxv8bf16(<vscale x 8 x i1> %Pg, bfloat *%base_ptr)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @ld4.nxv16i32(<vscale x 4 x i1> %Pg, <vscale x 4 x i32> *%addr) {
; CHECK-LABEL: ld4.nxv16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %addr, i64 28
  %base_ptr = bitcast <vscale x 4 x i32>* %base to i32 *
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> %Pg, i32 *%base_ptr)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @ld4.nxv16f32(<vscale x 4 x i1> %Pg, <vscale x 4 x float>* %addr) {
; CHECK-LABEL: ld4.nxv16f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 4 x float>* %base to float *
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4.sret.nxv4f32(<vscale x 4 x i1> %Pg, float *%base_ptr)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @ld4.nxv8i64(<vscale x 2 x i1> %Pg, <vscale x 2 x i64> *%addr) {
; CHECK-LABEL: ld4.nxv8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4d { z0.d - z3.d }, p0/z, [x0, #28, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %addr, i64 28
  %base_ptr = bitcast <vscale x 2 x i64>* %base to i64 *
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> %Pg, i64 *%base_ptr)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @ld4.nxv8f64(<vscale x 2 x i1> %Pg, <vscale x 2 x double> *%addr) {
; CHECK-LABEL: ld4.nxv8f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld4d { z0.d - z3.d }, p0/z, [x0, #-32, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %addr, i64 -32
  %base_ptr = bitcast <vscale x 2 x double>* %base to double *
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> %Pg, double *%base_ptr)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1>, i8*)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1>, i16*)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1>, i32*)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1>, i64*)
declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1>, half*)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1>, float*)
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1>, double*)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld3.sret.nxv16i8(<vscale x 16 x i1>, i8*)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld3.sret.nxv8i16(<vscale x 8 x i1>, i16*)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1>, i32*)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1>, i64*)
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld3.sret.nxv8f16(<vscale x 8 x i1>, half*)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld3.sret.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1>, float*)
declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld3.sret.nxv2f64(<vscale x 2 x i1>, double*)

declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld4.sret.nxv16i8(<vscale x 16 x i1>, i8*)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld4.sret.nxv8i16(<vscale x 8 x i1>, i16*)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1>, i32*)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1>, i64*)
declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld4.sret.nxv8f16(<vscale x 8 x i1>, half*)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld4.sret.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld4.sret.nxv4f32(<vscale x 4 x i1>, float*)
declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat versions.
attributes #0 = { "target-features"="+bf16" }