llvm/test/CodeGen/AArch64/consecutive-stores-of-faddv.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc  -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK
   3
   4 ; Tests consecutive stores of @llvm.aarch64.sve.faddv. Within SDAG faddv is
   5 ; lowered as a FADDV + EXTRACT_VECTOR_ELT (of lane 0). Stores of extracts can
   6 ; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
   7 ; some cases as it can lead to worse codegen.
   8
   9 ; TODO: A single `stp s0, s1, [x0]` may be preferred here over the currently generated lane insert (`mov v0.s[1], v1.s[0]`) followed by `str d0, [x0]`.
  10 define void @consecutive_stores_pair(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) { ; Two faddv results stored to adjacent float slots; no streaming attribute on this function.
  11 ; CHECK-LABEL: consecutive_stores_pair:
  12 ; CHECK:       // %bb.0:
  13 ; CHECK-NEXT:    ptrue p0.s
  14 ; CHECK-NEXT:    faddv s0, p0, z0.s
  15 ; CHECK-NEXT:    faddv s1, p0, z1.s
  16 ; CHECK-NEXT:    mov v0.s[1], v1.s[0]
  17 ; CHECK-NEXT:    str d0, [x0]
  18 ; CHECK-NEXT:    ret
  19   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4 ; %dest1 = %dest0 + 4 bytes, i.e. the float slot directly after %dest0
  20   %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0) ; all-true predicate, so every lane of %vec0 takes part in the reduction
  21   %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1) ; same reduction applied to %vec1
  22   store float %reduce0, ptr %dest0, align 4 ; first of two adjacent 4-byte stores
  23   store float %reduce1, ptr %dest1, align 4 ; second store; the expected assembly above packs both results into d0 and emits a single 8-byte str
  24   ret void
  25 }
  26
  27 define void @consecutive_stores_quadruple(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) { ; Four faddv results stored to four adjacent float slots; no streaming attribute on this function.
  28 ; CHECK-LABEL: consecutive_stores_quadruple:
  29 ; CHECK:       // %bb.0:
  30 ; CHECK-NEXT:    ptrue p0.s
  31 ; CHECK-NEXT:    faddv s0, p0, z0.s
  32 ; CHECK-NEXT:    faddv s1, p0, z1.s
  33 ; CHECK-NEXT:    faddv s2, p0, z2.s
  34 ; CHECK-NEXT:    mov v0.s[1], v1.s[0]
  35 ; CHECK-NEXT:    faddv s3, p0, z3.s
  36 ; CHECK-NEXT:    mov v2.s[1], v3.s[0]
  37 ; CHECK-NEXT:    stp d0, d2, [x0]
  38 ; CHECK-NEXT:    ret
  39   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4 ; %dest0 + 4 bytes
  40   %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4 ; chained off %dest1, so %dest0 + 8 bytes
  41   %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4 ; chained off %dest2, so %dest0 + 12 bytes
  42   %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0) ; all-true predicate, so every lane of %vec0 takes part in the reduction
  43   %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1) ; same reduction applied to %vec1
  44   %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2) ; same reduction applied to %vec2
  45   %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3) ; same reduction applied to %vec3
  46   store float %reduce0, ptr %dest0, align 4 ; four adjacent 4-byte stores, in ascending address order
  47   store float %reduce1, ptr %dest1, align 4
  48   store float %reduce2, ptr %dest2, align 4
  49   store float %reduce3, ptr %dest3, align 4 ; the expected assembly above pairs the results into d0 and d2 via lane inserts and stores both with one stp
  50   ret void
  51 }
  52
  53 define void @consecutive_stores_pair_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) "aarch64_pstate_sm_enabled"  { ; Same IR body as @consecutive_stores_pair, but the function carries the "aarch64_pstate_sm_enabled" attribute.
  54 ; CHECK-LABEL: consecutive_stores_pair_streaming_function:
  55 ; CHECK:       // %bb.0:
  56 ; CHECK-NEXT:    ptrue p0.s
  57 ; CHECK-NEXT:    faddv s0, p0, z0.s
  58 ; CHECK-NEXT:    faddv s1, p0, z1.s
  59 ; CHECK-NEXT:    stp s0, s1, [x0]
  60 ; CHECK-NEXT:    ret
  61   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4 ; %dest1 = %dest0 + 4 bytes, i.e. the float slot directly after %dest0
  62   %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0) ; all-true predicate, so every lane of %vec0 takes part in the reduction
  63   %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1) ; same reduction applied to %vec1
  64   store float %reduce0, ptr %dest0, align 4 ; first of two adjacent 4-byte stores
  65   store float %reduce1, ptr %dest1, align 4 ; second store; the expected assembly above keeps both results as scalars and emits stp s0, s1 with no lane insert
  66   ret void ; NOTE(review): presumably the scalar stp is a consequence of streaming mode - confirm against the store-merging change under test
  67 }
  68
  69 define void @consecutive_stores_quadruple_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled" { ; Same IR body as @consecutive_stores_quadruple, but the function carries the "aarch64_pstate_sm_enabled" attribute.
  70 ; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
  71 ; CHECK:       // %bb.0:
  72 ; CHECK-NEXT:    ptrue p0.s
  73 ; CHECK-NEXT:    faddv s0, p0, z0.s
  74 ; CHECK-NEXT:    faddv s1, p0, z1.s
  75 ; CHECK-NEXT:    faddv s2, p0, z2.s
  76 ; CHECK-NEXT:    stp s0, s1, [x0]
  77 ; CHECK-NEXT:    faddv s3, p0, z3.s
  78 ; CHECK-NEXT:    stp s2, s3, [x0, #8]
  79 ; CHECK-NEXT:    ret
  80   %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4 ; %dest0 + 4 bytes
  81   %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4 ; chained off %dest1, so %dest0 + 8 bytes
  82   %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4 ; chained off %dest2, so %dest0 + 12 bytes
  83   %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0) ; all-true predicate, so every lane of %vec0 takes part in the reduction
  84   %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1) ; same reduction applied to %vec1
  85   %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2) ; same reduction applied to %vec2
  86   %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3) ; same reduction applied to %vec3
  87   store float %reduce0, ptr %dest0, align 4 ; four adjacent 4-byte stores, in ascending address order
  88   store float %reduce1, ptr %dest1, align 4
  89   store float %reduce2, ptr %dest2, align 4
  90   store float %reduce3, ptr %dest3, align 4 ; the expected assembly above keeps all four results as scalars: stp s0, s1 at [x0] and stp s2, s3 at [x0, #8], with no lane inserts
  91   ret void ; NOTE(review): presumably the scalar stp pairs are a consequence of streaming mode - confirm against the store-merging change under test
  92 }