clang/test/CodeGen/AArch64/pure-scalable-args.c

   1 // RUN: %clang_cc1 -O3 -triple aarch64                                  -target-feature +sve -target-feature +sve2p1 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-AAPCS
   2 // RUN: %clang_cc1 -O3 -triple arm64-apple-ios7.0 -target-abi darwinpcs -target-feature +sve -target-feature +sve2p1 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DARWIN
   3 // RUN: %clang_cc1 -O3 -triple aarch64-linux-gnu                        -target-feature +sve -target-feature +sve2p1 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-AAPCS
   4
   5 // REQUIRES: aarch64-registered-target
   6
   7 #include <arm_neon.h>
   8 #include <arm_sve.h>
   9 #include <stdarg.h>
  10
  11 typedef svfloat32_t fvec32 __attribute__((arm_sve_vector_bits(128))); // 128-bit fixed-length float vector; <4 x float> in the expected IR
  12 typedef svfloat64_t fvec64 __attribute__((arm_sve_vector_bits(128))); // 128-bit fixed-length double vector; <2 x double> in the expected IR
  13 typedef svbool_t bvec __attribute__((arm_sve_vector_bits(128)));      // fixed-length predicate; <2 x i8> in the expected IR
  14 typedef svmfloat8_t mfvec8 __attribute__((arm_sve_vector_bits(128))); // 128-bit fixed-length mfloat8 vector; <16 x i8> in the expected IR
  15
  16 typedef struct { // Homogeneous Floating-point Aggregate (HFA): four floats.
  17     float f[4];
  18 } HFA;
  19
  20 typedef struct { // Homogeneous Short-Vector Aggregate (HVA): four 128-bit vectors.
  21     mfloat8x16_t f[4];
  22 } HVA;
  23
  24 // Pure Scalable Type (PST): needs 4 Z-regs (x, y[0], y[1], z) and 2 P-regs (a, b).
  25 typedef struct {
  26      bvec a;
  27      fvec64 x;
  28      fvec32 y[2];
  29      mfvec8 z;
  30      bvec b;
  31 } PST;
  32
  33 // Pure Scalable Type with a single vector member, needs 1 Z-reg.
  34 typedef struct {
  35     fvec32 x;
  36 } SmallPST;
  37
  38 // Big PST (9 vector members, 2 predicate members), does not fit in registers.
  39 typedef struct {
  40     struct {
  41         bvec a;
  42         fvec32 x[4];
  43     } u[2];
  44     fvec64 v;
  45 } BigPST;
  46
  47 // A small (16-byte) aggregate that is not a PST; coerced to [2 x i64] in the expected IR.
  48 typedef struct  {
  49     char data[16];
  50 } SmallAgg;
  51
  52 // CHECK: %struct.PST = type { <2 x i8>, <2 x double>, [2 x <4 x float>], <16 x i8>, <2 x i8> }
  53
  54 // Test argument passing of Pure Scalable Types by examining the generated
  55 // LLVM IR function declarations. A PST argument in C/C++ should map to:
  56 //   a) a `ptr` argument, if passed indirectly through memory
  57 //   b) a series of scalable vector arguments, if passed via registers
  58
  59 // Simple argument passing, PST expanded into registers (the darwinpcs run expects a pointer instead).
  60 //   a    -> p0
  61 //   b    -> p1
  62 //   x    -> q0
  63 //   y[0] -> q1
  64 //   y[1] -> q2
  65 //   z    -> q3
  66 void test_argpass_simple(PST *p) {
  67     void argpass_simple_callee(PST);
  68     argpass_simple_callee(*p);
  69 }
  70 // CHECK-AAPCS:      define dso_local void @test_argpass_simple(ptr nocapture noundef readonly %p)
  71 // CHECK-AAPCS-NEXT: entry:
  72 // CHECK-AAPCS-NEXT: %0 = load <2 x i8>, ptr %p, align 16
  73 // CHECK-AAPCS-NEXT: %cast.scalable = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %0, i64 0)
  74 // CHECK-AAPCS-NEXT: %1 = bitcast <vscale x 2 x i8> %cast.scalable to <vscale x 16 x i1>
  75 // CHECK-AAPCS-NEXT: %2 = getelementptr inbounds nuw i8, ptr %p, i64 16
  76 // CHECK-AAPCS-NEXT: %3 = load <2 x double>, ptr %2, align 16
  77 // CHECK-AAPCS-NEXT: %cast.scalable1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %3, i64 0)
  78 // CHECK-AAPCS-NEXT: %4 = getelementptr inbounds nuw i8, ptr %p, i64 32
  79 // CHECK-AAPCS-NEXT: %5 = load <4 x float>, ptr %4, align 16
  80 // CHECK-AAPCS-NEXT: %cast.scalable2 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %5, i64 0)
  81 // CHECK-AAPCS-NEXT: %6 = getelementptr inbounds nuw i8, ptr %p, i64 48
  82 // CHECK-AAPCS-NEXT: %7 = load <4 x float>, ptr %6, align 16
  83 // CHECK-AAPCS-NEXT: %cast.scalable3 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %7, i64 0)
  84 // CHECK-AAPCS-NEXT: %8 = getelementptr inbounds nuw i8, ptr %p, i64 64
  85 // CHECK-AAPCS-NEXT: %9 = load <16 x i8>, ptr %8, align 16
  86 // CHECK-AAPCS-NEXT: %cast.scalable4 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %9, i64 0)
  87 // CHECK-AAPCS-NEXT: %10 = getelementptr inbounds nuw i8, ptr %p, i64 80
  88 // CHECK-AAPCS-NEXT: %11 = load <2 x i8>, ptr %10, align 16
  89 // CHECK-AAPCS-NEXT: %cast.scalable5 = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %11, i64 0)
  90 // CHECK-AAPCS-NEXT: %12 = bitcast <vscale x 2 x i8> %cast.scalable5 to <vscale x 16 x i1>
  91 // CHECK-AAPCS-NEXT: tail call void @argpass_simple_callee(<vscale x 16 x i1> %1, <vscale x 2 x double> %cast.scalable1, <vscale x 4 x float> %cast.scalable2, <vscale x 4 x float> %cast.scalable3, <vscale x 16 x i8> %cast.scalable4, <vscale x 16 x i1> %12)
  92 // CHECK-AAPCS-NEXT: ret void
  93
  94 // CHECK-AAPCS:  declare void @argpass_simple_callee(<vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
  95 // CHECK-DARWIN: declare void @argpass_simple_callee(ptr noundef)
  96
  97 // Boundary case of using the last available Z-reg (q7), PST expanded.
  98 //   0.0  -> d0-d3 (the four double arguments)
  99 //   a    -> p0
 100 //   b    -> p1
 101 //   x    -> q4
 102 //   y[0] -> q5
 103 //   y[1] -> q6
 104 //   z    -> q7
 105 void test_argpass_last_z(PST *p) {
 106     void argpass_last_z_callee(double, double, double, double, PST);
 107     argpass_last_z_callee(.0, .0, .0, .0, *p);
 108 }
 109 // CHECK-AAPCS:  declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
 110 // CHECK-DARWIN: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, ptr noundef)
 111
 112
 113 // Like the above, but using a tuple type to occupy some registers.
 114 //   x    -> z0.d-z3.d (the svfloat64x4_t argument)
 115 //   a    -> p0
 116 //   b    -> p1
 117 //   x    -> q4 (PST member p->x)
 118 //   y[0] -> q5
 119 //   y[1] -> q6
 120 //   z    -> q7
 121 void test_argpass_last_z_tuple(PST *p, svfloat64x4_t x) {
 122   void argpass_last_z_tuple_callee(svfloat64x4_t, PST);
 123   argpass_last_z_tuple_callee(x, *p);
 124 }
 125 // CHECK-AAPCS:  declare void @argpass_last_z_tuple_callee(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
 126 // CHECK-DARWIN: declare void @argpass_last_z_tuple_callee(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, ptr noundef)
 127
 128
 129 // Boundary case of using the last available P-reg (p3), PST expanded.
 130 //   false -> p0-p1 (one svbool_t, one svcount_t)
 131 //   a     -> p2
 132 //   b     -> p3
 133 //   x     -> q0
 134 //   y[0]  -> q1
 135 //   y[1]  -> q2
 136 //   z     -> q3
 137 void test_argpass_last_p(PST *p) {
 138     void argpass_last_p_callee(svbool_t, svcount_t, PST);
 139     argpass_last_p_callee(svpfalse(), svpfalse_c(), *p);
 140 }
 141 // CHECK-AAPCS:  declare void @argpass_last_p_callee(<vscale x 16 x i1>, target("aarch64.svcount"), <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
 142 // CHECK-DARWIN: declare void @argpass_last_p_callee(<vscale x 16 x i1>, target("aarch64.svcount"), ptr noundef)
 143
 144
 145 // Not enough Z-regs (five taken before the PST, which needs four), push PST to memory and pass a pointer, Z-regs and
 146 // P-regs still available for other arguments.
 147 //   u     -> z0
 148 //   v     -> q1
 149 //   w     -> q2
 150 //   0.0   -> d3-d4
 151 //   1     -> w0
 152 //   *p    -> memory, address -> x1
 153 //   2     -> w2
 154 //   3.0   -> d5
 155 //   true  -> p0
 156 void test_argpass_no_z(PST *p, double dummy, svmfloat8_t u, int8x16_t v, mfloat8x16_t w) {
 157     void argpass_no_z_callee(svmfloat8_t, int8x16_t, mfloat8x16_t, double, double, int, PST, int, double, svbool_t);
 158     argpass_no_z_callee(u, v, w, .0, .0, 1, *p, 2, 3.0, svptrue_b64());
 159 }
 160 // CHECK: declare void @argpass_no_z_callee(<vscale x 16 x i8>, <16 x i8> noundef, <16 x i8>, double noundef, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)
 161
 162
 163 // Like the above, using an svfloat64x4_t tuple to occupy z0-z3.
 164 //   x     -> z0.d-z3.d
 165 //   0.0   -> d4
 166 //   1     -> w0
 167 //   *p    -> memory, address -> x1
 168 //   2     -> w2
 169 //   3.0   -> d5
 170 //   true  -> p0
 171 void test_argpass_no_z_tuple_f64(PST *p, float dummy, svfloat64x4_t x) {
 172   void argpass_no_z_tuple_f64_callee(svfloat64x4_t, double, int, PST, int,
 173                                      double, svbool_t);
 174   argpass_no_z_tuple_f64_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64());
 175 }
 176 // CHECK: declare void @argpass_no_z_tuple_f64_callee(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)
 177
 178
 179 // Likewise, using an svmfloat8x4_t tuple (8-bit elements).
 180 //   x     -> z0.b-z3.b
 181 //   0.0   -> d4
 182 //   1     -> w0
 183 //   *p    -> memory, address -> x1
 184 //   2     -> w2
 185 //   3.0   -> d5
 186 //   true  -> p0
 187 void test_argpass_no_z_tuple_mfp8(PST *p, float dummy, svmfloat8x4_t x) {
 188   void argpass_no_z_tuple_mfp8_callee(svmfloat8x4_t, double, int, PST, int,
 189                                       double, svbool_t);
 190   argpass_no_z_tuple_mfp8_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64());
 191 }
 192 // CHECK: declare void @argpass_no_z_tuple_mfp8_callee(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)
 193
 194
 195 // Not enough Z-regs (five consumed by a double and an HFA), PST passed indirectly.
 196 //   0.0  -> d0
 197 //   *h   -> s1-s4
 198 //   1    -> w0
 199 //   *p   -> memory, address -> x1
 200 //   p    -> x1
 201 //   2    -> w2
 202 //   true -> p0
 203 void test_argpass_no_z_hfa(HFA *h, PST *p) {
 204     void argpass_no_z_hfa_callee(double, HFA, int, PST, int, svbool_t);
 205     argpass_no_z_hfa_callee(.0, *h, 1, *p, 2, svptrue_b64());
 206 }
 207 // CHECK-AAPCS:  declare void @argpass_no_z_hfa_callee(double noundef, [4 x float] alignstack(8), i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)
 208 // CHECK-DARWIN: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float], i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)
 209
 210 // Not enough Z-regs (five consumed by a double and an HVA), PST passed indirectly.
 211 //   0.0  -> d0
 212 //   *h   -> q1-q4
 213 //   1    -> w0
 214 //   *p   -> memory, address -> x1
 215 //   p    -> x1
 216 //   2    -> w2
 217 //   true -> p0
 218 void test_argpass_no_z_hva(HVA *h, PST *p) {
 219     void argpass_no_z_hva_callee(double, HVA, int, PST, int, svbool_t);
 220     argpass_no_z_hva_callee(.0, *h, 1, *p, 2, svptrue_b64());
 221 }
 222 // CHECK-AAPCS:  declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>] alignstack(16), i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)
 223 // CHECK-DARWIN: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>], i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)
 224
 225 // Not enough P-regs (three taken before the PST, which needs two), PST passed indirectly, Z-regs and P-regs still available.
 226 //   true -> p0-p2
 227 //   1    -> w0
 228 //   *p   -> memory, address -> x1
 229 //   2    -> w2
 230 //   3.0  -> d0
 231 //   true -> p3
 232 void test_argpass_no_p(PST *p) {
 233     void argpass_no_p_callee(svbool_t, svbool_t, svbool_t, int, PST, int, double, svbool_t);
 234     argpass_no_p_callee(svptrue_b8(), svptrue_b16(), svptrue_b32(), 1, *p, 2, 3.0, svptrue_b64());
 235 }
 236 // CHECK: declare void @argpass_no_p_callee(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)
 237
 238
 239 // Like above, using an svboolx2_t tuple to occupy p0-p1.
 240 // Z-regs and P-regs still available.
 241 //   v    -> p0-p1
 242 //   u    -> p2
 243 //   1    -> w0
 244 //   *p   -> memory, address -> x1
 245 //   2    -> w2
 246 //   3.0  -> d0
 247 //   true -> p3
 248 void test_argpass_no_p_tuple(PST *p, svbool_t u, svboolx2_t v) {
 249   void argpass_no_p_tuple_callee(svboolx2_t, svbool_t, int, PST, int, double,
 250                                  svbool_t);
 251   argpass_no_p_tuple_callee(v, u, 1, *p, 2, 3.0, svptrue_b64());
 252 }
 253 // CHECK: declare void @argpass_no_p_tuple_callee(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)
 254
 255
 256 // HFAs go back-to-back to memory, afterwards Z-regs not available, PST passed indirectly.
 257 //   0.0   -> d0-d4 (five double arguments)
 258 //   *h    -> memory
 259 //   *p    -> memory, address -> x0
 260 //   *h    -> memory
 261 //   false -> p0
 262 void test_after_hfa(HFA *h, PST *p) {
 263     void after_hfa_callee(double, double, double, double, double, HFA, PST, HFA, svbool_t);
 264     after_hfa_callee(.0, .0, .0, .0, .0, *h, *p, *h, svpfalse());
 265 }
 266 // CHECK-AAPCS:  declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float] alignstack(8), ptr noundef, [4 x float] alignstack(8), <vscale x 16 x i1>)
 267 // CHECK-DARWIN: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float], ptr noundef, [4 x float], <vscale x 16 x i1>)
 268
 269 // Small PST, no Z-regs left, passed indirectly, unlike other small
 270 // aggregates (the darwinpcs run expects an i128 instead).
 271 //   *s  -> x0-x1
 272 //   0.0 -> d0-d7
 273 //   *p  -> memory, address -> x2
 274 //   1.0 -> memory
 275 //   2.0 -> memory (next to the above)
 276 void test_small_pst(SmallPST *p, SmallAgg *s) {
 277     void small_pst_callee(SmallAgg, double, double, double, double, double, double, double, double, double, SmallPST, double);
 278     small_pst_callee(*s, .0, .0, .0, .0, .0, .0, .0, .0, 1.0, *p, 2.0);
 279 }
 280 // CHECK-AAPCS:  declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, ptr noundef, double noundef)
 281 // CHECK-DARWIN: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, i128, double noundef)
 282
 283
 284 // Simple return, PST expanded to registers (the darwinpcs run expects an sret return instead).
 285 //   p->a    -> p0
 286 //   p->x    -> q0
 287 //   p->y[0] -> q1
 288 //   p->y[1] -> q2
 289 //   p->z    -> q3
 290 //   p->b    -> p1
 291 PST test_return(PST *p) {
 292     return *p;
 293 }
 294 // CHECK-AAPCS:  define dso_local <{ <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1> }> @test_return(ptr
 295 // CHECK-DARWIN: define void @test_return(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.PST) align 16 %agg.result, ptr nocapture noundef readonly %p)
 296
 297 // Corner case of 1-element aggregate, returned as a single scalable vector (i128 on the darwinpcs run).
 298 //   p->x -> q0
 299 SmallPST test_return_small_pst(SmallPST *p) {
 300     return *p;
 301 }
 302 // CHECK-AAPCS:  define dso_local <vscale x 4 x float> @test_return_small_pst(ptr
 303 // CHECK-DARWIN: define i128 @test_return_small_pst(ptr nocapture noundef readonly %p)
 304
 305
 306 // Big PST, returned indirectly (sret) on every run line.
 307 //   *p -> *x8
 308 BigPST test_return_big_pst(BigPST *p) {
 309     return *p;
 310 }
 311 // CHECK-AAPCS:  define dso_local void @test_return_big_pst(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.BigPST) align 16 %agg.result, ptr nocapture noundef readonly %p)
 312 // CHECK-DARWIN: define void @test_return_big_pst(ptr dead_on_unwind noalias nocapture writable writeonly sret(%struct.BigPST) align 16 %agg.result, ptr nocapture noundef readonly %p)
 313
 314 // Variadic arguments are unnamed, PST passed indirectly; the named PST is still expanded.
 315 // (Passing SVE types to a variadic function is currently unsupported by
 316 // the AArch64 backend.)
 317 //   p->a    -> p0
 318 //   p->x    -> q0
 319 //   p->y[0] -> q1
 320 //   p->y[1] -> q2
 321 //   p->z    -> q3
 322 //   p->b    -> p1
 323 //   *q -> memory, address -> x1
 324 void test_pass_variadic(PST *p, PST *q) {
 325     void pass_variadic_callee(PST, ...);
 326     pass_variadic_callee(*p, *q);
 327 }
 328 // CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false)
 329 // CHECK-AAPCS: call void (<vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>, ...) @pass_variadic_callee(<vscale x 16 x i1> %1, <vscale x 2 x double> %cast.scalable1, <vscale x 4 x float> %cast.scalable2, <vscale x 4 x float> %cast.scalable3, <vscale x 16 x i8> %cast.scalable4, <vscale x 16 x i1> %12, ptr noundef nonnull %byval-temp)
 330
 331 // CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %p, i64 96, i1 false)
 332 // CHECK-DARWIN: call void @llvm.lifetime.start.p0(i64 96, ptr nonnull %byval-temp1)
 333 // CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp1, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false)
 334 // CHECK-DARWIN: call void (ptr, ...) @pass_variadic_callee(ptr noundef nonnull %byval-temp, ptr noundef nonnull %byval-temp1)
 335
 336
 337 // Test passing a small PST as a variadic argument: still passed indirectly, despite being <= 128 bits (i128 on the darwinpcs run).
 338 void test_small_pst_variadic(SmallPST *p) {
 339     void small_pst_variadic_callee(int, ...);
 340     small_pst_variadic_callee(0, *p);
 341 }
 342 // CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) %byval-temp, ptr noundef nonnull align 16 dereferenceable(16) %p, i64 16, i1 false)
 343 // CHECK-AAPCS: call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, ptr noundef nonnull %byval-temp)
 344
 345 // CHECK-DARWIN: %0 = load i128, ptr %p, align 16
 346 // CHECK-DARWIN: tail call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, i128 %0)
 347
 348 // Test handling of a PST argument when passed in registers, from the callee side: each incoming value is stored into the local PST.
 349 void test_argpass_callee_side(PST v) {
 350     void use(PST *p);
 351     use(&v);
 352 }
 353 // CHECK-AAPCS:      define dso_local void @test_argpass_callee_side(<vscale x 16 x i1> %0, <vscale x 2 x double> %.coerce1, <vscale x 4 x float> %.coerce3, <vscale x 4 x float> %.coerce5, <vscale x 16 x i8> %.coerce7, <vscale x 16 x i1> %1)
 354 // CHECK-AAPCS-NEXT: entry:
 355 // CHECK-AAPCS-NEXT:   %v = alloca %struct.PST, align 16
 356 // CHECK-AAPCS-NEXT:   %.coerce = bitcast <vscale x 16 x i1> %0 to <vscale x 2 x i8>
 357 // CHECK-AAPCS-NEXT:   %cast.fixed = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %.coerce, i64 0)
 358 // CHECK-AAPCS-NEXT:   store <2 x i8> %cast.fixed, ptr %v, align 16
 359 // CHECK-AAPCS-NEXT:   %2 = getelementptr inbounds nuw i8, ptr %v, i64 16
 360 // CHECK-AAPCS-NEXT:   %cast.fixed2 = tail call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %.coerce1, i64 0)
 361 // CHECK-AAPCS-NEXT:   store <2 x double> %cast.fixed2, ptr %2, align 16
 362 // CHECK-AAPCS-NEXT:   %3 = getelementptr inbounds nuw i8, ptr %v, i64 32
 363 // CHECK-AAPCS-NEXT:   %cast.fixed4 = tail call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %.coerce3, i64 0)
 364 // CHECK-AAPCS-NEXT:   store <4 x float> %cast.fixed4, ptr %3, align 16
 365 // CHECK-AAPCS-NEXT:   %4 = getelementptr inbounds nuw i8, ptr %v, i64 48
 366 // CHECK-AAPCS-NEXT:   %cast.fixed6 = tail call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %.coerce5, i64 0)
 367 // CHECK-AAPCS-NEXT:   store <4 x float> %cast.fixed6, ptr %4, align 16
 368 // CHECK-AAPCS-NEXT:   %5 = getelementptr inbounds nuw i8, ptr %v, i64 64
 369 // CHECK-AAPCS-NEXT:   %cast.fixed8 = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> %.coerce7, i64 0)
 370 // CHECK-AAPCS-NEXT:   store <16 x i8> %cast.fixed8, ptr %5, align 16
 371 // CHECK-AAPCS-NEXT:   %6 = getelementptr inbounds nuw i8, ptr %v, i64 80
 372 // CHECK-AAPCS-NEXT:   %.coerce9 = bitcast <vscale x 16 x i1> %1 to <vscale x 2 x i8>
 373 // CHECK-AAPCS-NEXT:   %cast.fixed10 = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %.coerce9, i64 0)
 374 // CHECK-AAPCS-NEXT:   store <2 x i8> %cast.fixed10, ptr %6, align 16
 375 // CHECK-AAPCS-NEXT:   call void @use(ptr noundef nonnull %v)
 376 // CHECK-AAPCS-NEXT:   ret void
 377 // CHECK-AAPCS-NEXT: }
 378
 379 // Test va_arg operation on a PST: the va_list slot holds a pointer to the value, not the value itself.
 380 #ifdef __cplusplus
 381  extern "C"
 382 #endif
 383 void test_va_arg(int n, ...) {
 384      va_list ap;
 385      va_start(ap, n);
 386      PST v = va_arg(ap, PST);
 387      va_end(ap);
 388
 389      void use1(bvec, fvec32);
 390      use1(v.a, v.y[1]);
 391 }
 392 // CHECK-AAPCS: define dso_local void @test_va_arg(i32 noundef %n, ...)
 393 // CHECK-AAPCS-NEXT: entry:
 394 // CHECK-AAPCS-NEXT:   %ap = alloca %struct.__va_list, align 8
 395 // CHECK-AAPCS-NEXT:   call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %ap)
 396 // CHECK-AAPCS-NEXT:   call void @llvm.va_start.p0(ptr nonnull %ap)
 397 // CHECK-AAPCS-NEXT:   %gr_offs_p = getelementptr inbounds nuw i8, ptr %ap, i64 24
 398 // CHECK-AAPCS-NEXT:   %gr_offs = load i32, ptr %gr_offs_p, align 8
 399 // CHECK-AAPCS-NEXT:   %0 = icmp sgt i32 %gr_offs, -1
 400 // CHECK-AAPCS-NEXT:   br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg
 401 // CHECK-AAPCS-EMPTY:
 402 // CHECK-AAPCS-NEXT: vaarg.maybe_reg:                                  ; preds = %entry
 403
 404 // Increment by 8, size of the pointer to the argument value, not size of the argument value itself.
 405
 406 // CHECK-AAPCS-NEXT:   %new_reg_offs = add nsw i32 %gr_offs, 8
 407 // CHECK-AAPCS-NEXT:   store i32 %new_reg_offs, ptr %gr_offs_p, align 8
 408 // CHECK-AAPCS-NEXT:   %inreg = icmp samesign ult i32 %gr_offs, -7
 409 // CHECK-AAPCS-NEXT:   br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack
 410 // CHECK-AAPCS-EMPTY:
 411 // CHECK-AAPCS-NEXT: vaarg.in_reg:                                     ; preds = %vaarg.maybe_reg
 412 // CHECK-AAPCS-NEXT:   %reg_top_p = getelementptr inbounds nuw i8, ptr %ap, i64 8
 413 // CHECK-AAPCS-NEXT:   %reg_top = load ptr, ptr %reg_top_p, align 8
 414 // CHECK-AAPCS-NEXT:   %1 = sext i32 %gr_offs to i64
 415 // CHECK-AAPCS-NEXT:   %2 = getelementptr inbounds i8, ptr %reg_top, i64 %1
 416 // CHECK-AAPCS-NEXT:   br label %vaarg.end
 417 // CHECK-AAPCS-EMPTY:
 418 // CHECK-AAPCS-NEXT: vaarg.on_stack:                                   ; preds = %vaarg.maybe_reg, %entry
 419 // CHECK-AAPCS-NEXT:   %stack = load ptr, ptr %ap, align 8
 420 // CHECK-AAPCS-NEXT:   %new_stack = getelementptr inbounds i8, ptr %stack, i64 8
 421 // CHECK-AAPCS-NEXT:   store ptr %new_stack, ptr %ap, align 8
 422 // CHECK-AAPCS-NEXT:   br label %vaarg.end
 423 // CHECK-AAPCS-EMPTY:
 424 // CHECK-AAPCS-NEXT: vaarg.end:                                        ; preds = %vaarg.on_stack, %vaarg.in_reg
 425 // CHECK-AAPCS-NEXT:   %vaargs.addr = phi ptr [ %2, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ]
 426
 427 // Extra indirection, for a composite passed indirectly.
 428 // CHECK-AAPCS-NEXT:   %vaarg.addr = load ptr, ptr %vaargs.addr, align 8
 429
 430 // CHECK-AAPCS-NEXT:   %v.sroa.0.0.copyload = load <2 x i8>, ptr %vaarg.addr, align 16
 431 // CHECK-AAPCS-NEXT:   %v.sroa.43.0.vaarg.addr.sroa_idx = getelementptr inbounds i8, ptr %vaarg.addr, i64 48
 432 // CHECK-AAPCS-NEXT:   %v.sroa.43.0.copyload = load <4 x float>, ptr %v.sroa.43.0.vaarg.addr.sroa_idx, align 16
 433 // CHECK-AAPCS-NEXT:   call void @llvm.va_end.p0(ptr nonnull %ap)
 434 // CHECK-AAPCS-NEXT:   %cast.scalable = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %v.sroa.0.0.copyload, i64 0)
 435 // CHECK-AAPCS-NEXT:   %3 = bitcast <vscale x 2 x i8> %cast.scalable to <vscale x 16 x i1>
 436 // CHECK-AAPCS-NEXT:   %cast.scalable2 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %v.sroa.43.0.copyload, i64 0)
 437 // CHECK-AAPCS-NEXT:   call void @use1(<vscale x 16 x i1> noundef %3, <vscale x 4 x float> noundef %cast.scalable2)
 438 // CHECK-AAPCS-NEXT:   call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %ap)
 439 // CHECK-AAPCS-NEXT:   ret void
 440 // CHECK-AAPCS-NEXT: }
 441
 442 // CHECK-DARWIN: define void @test_va_arg(i32 noundef %n, ...)
 443 // CHECK-DARWIN-NEXT: entry:
 444 // CHECK-DARWIN-NEXT:   %ap = alloca ptr, align 8
 445 // CHECK-DARWIN-NEXT:   call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ap)
 446 // CHECK-DARWIN-NEXT:   call void @llvm.va_start.p0(ptr nonnull %ap)
 447 // CHECK-DARWIN-NEXT:   %argp.cur = load ptr, ptr %ap, align 8
 448 // CHECK-DARWIN-NEXT:   %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 8
 449 // CHECK-DARWIN-NEXT:   store ptr %argp.next, ptr %ap, align 8
 450 // CHECK-DARWIN-NEXT:   %0 = load ptr, ptr %argp.cur, align 8
 451 // CHECK-DARWIN-NEXT:   %v.sroa.0.0.copyload = load <2 x i8>, ptr %0, align 16
 452 // CHECK-DARWIN-NEXT:   %v.sroa.43.0..sroa_idx = getelementptr inbounds i8, ptr %0, i64 48
 453 // CHECK-DARWIN-NEXT:   %v.sroa.43.0.copyload = load <4 x float>, ptr %v.sroa.43.0..sroa_idx, align 16
 454 // CHECK-DARWIN-NEXT:   call void @llvm.va_end.p0(ptr nonnull %ap)
 455 // CHECK-DARWIN-NEXT:   %cast.scalable = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %v.sroa.0.0.copyload, i64 0)
 456 // CHECK-DARWIN-NEXT:   %1 = bitcast <vscale x 2 x i8> %cast.scalable to <vscale x 16 x i1>
 457 // CHECK-DARWIN-NEXT:   %cast.scalable2 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %v.sroa.43.0.copyload, i64 0)
 458 // CHECK-DARWIN-NEXT:   call void @use1(<vscale x 16 x i1> noundef %1, <vscale x 4 x float> noundef %cast.scalable2)
 459 // CHECK-DARWIN-NEXT:   call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ap)
 460 // CHECK-DARWIN-NEXT:   ret void
 461 // CHECK-DARWIN-NEXT: }