llvm/test/CodeGen/AArch64/sched-past-vector-ldst.ll

   1 ; RUN: llc < %s -mcpu=cortex-a53 -enable-post-misched=false -enable-aa-sched-mi | FileCheck %s
   2
   3 ; Check that the vector store intrinsic does not prevent fmla instructions from
   4 ; being scheduled together.  Since the vector loads and stores generated from
   5 ; the intrinsics do not alias each other, the store can be pushed past the load.
   6 ; This allows fmla instructions to be scheduled together.
   7
   8
   9 ; CHECK: fmla
  10 ; CHECK-NEXT: mov
  11 ; CHECK-NEXT: mov
  12 ; CHECK-NEXT: fmla
  13 ; CHECK-NEXT: fmla
  14 ; CHECK-NEXT: fmla
  15 target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
  16 target triple = "aarch64--linux-gnu"
  17
     ; %Struct layout used by @func: fields 2 and 3 are two separate arrays of
     ; 16 {float, float} pairs. @func loads from field 2 and stores to field 3.
  18 %Struct = type { i64*, [9 x double], [16 x {float, float}], [16 x {float, float}], i32, i32 }
  19
  20 ; Function Attrs: nounwind
     ; @func: body of the scheduling test. It performs two independent
     ; ld2 -> fmul/fadd -> st2 groups on %this. Each group loads from field 2 of
     ; %Struct and stores to field 3 at the same element index (8, then 12).
     ; The CHECK lines at the top of the file match four fmla instructions,
     ; presumably one per fmul/fadd pair below.
     ;   %this - pointer to the %Struct that is read and written
     ;   %f    - vector used as both multiplicand and addend in every fmul/fadd pair
  21 define linkonce_odr void @func(%Struct* nocapture %this, <4 x float> %f) unnamed_addr #0 align 2 {
  22 entry:
       ; Group 1 load: ld2 from field 2, element 8, member 0 of %this.
  23   %scevgep = getelementptr %Struct, %Struct* %this, i64 0, i32 2, i64 8, i32 0
  24   %struct_ptr = bitcast float* %scevgep to i8*
  25   %vec1 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8* %struct_ptr)
       ; For each of the two loaded vectors compute %f + %f * vector.
  26   %ev1 = extractvalue { <4 x float>, <4 x float> } %vec1, 1
  27   %fm1 = fmul <4 x float> %f, %ev1
  28   %av1 = fadd <4 x float> %f, %fm1
  29   %ev2 = extractvalue { <4 x float>, <4 x float> } %vec1, 0
  30   %fm2 = fmul <4 x float> %f, %ev2
  31   %av2 = fadd <4 x float> %f, %fm2
       ; Group 1 store: st2 to field 3, element 8. %av2 (from ld2 result 0) is
       ; passed first and %av1 (from ld2 result 1) second.
  32   %scevgep2 = getelementptr %Struct, %Struct* %this, i64 0, i32 3, i64 8, i32 0
  33   %struct_ptr2 = bitcast float* %scevgep2 to i8*
  34   tail call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> %av2, <4 x float> %av1, i8* %struct_ptr2)
       ; Group 2 load: ld2 from field 2, element 12. This is the load that the
       ; file header says the preceding store may be scheduled past, since the
       ; store targets field 3 and this load reads field 2.
  35   %scevgep3 = getelementptr %Struct, %Struct* %this, i64 0, i32 2, i64 12, i32 0
  36   %struct_ptr3 = bitcast float* %scevgep3 to i8*
  37   %vec2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8* %struct_ptr3)
       ; Same %f + %f * vector computation as in group 1.
  38   %ev3 = extractvalue { <4 x float>, <4 x float> } %vec2, 1
  39   %fm3 = fmul <4 x float> %f, %ev3
  40   %av3 = fadd <4 x float> %f, %fm3
  41   %ev4 = extractvalue { <4 x float>, <4 x float> } %vec2, 0
  42   %fm4 = fmul <4 x float> %f, %ev4
  43   %av4 = fadd <4 x float> %f, %fm4
       ; Group 2 store: st2 to field 3, element 12, with the same operand order
       ; as the group 1 store.
  44   %scevgep4 = getelementptr %Struct, %Struct* %this, i64 0, i32 3, i64 12, i32 0
  45   %struct_ptr4 = bitcast float* %scevgep4 to i8*
  46   tail call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> %av4, <4 x float> %av3, i8* %struct_ptr4)
  47   ret void
  48 }
  49
  50 ; Function Attrs: nounwind readonly
     ; ld2 intrinsic: takes an i8* and returns two <4 x float> vectors.
     ; It uses attribute group #2 (nounwind readonly).
  51 declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0i8(i8*) #2
  52
  53 ; Function Attrs: nounwind
     ; st2 intrinsic: takes two <4 x float> vectors and a nocapture i8* destination.
     ; It uses attribute group #1 (nounwind).
  54 declare void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float>, <4 x float>, i8* nocapture) #1
  55
     ; Attribute groups: #0 is attached to @func, #1 to the st2 declaration and
     ; #2 to the ld2 declaration.
  56 attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
  57 attributes #1 = { nounwind }
  58 attributes #2 = { nounwind readonly }