llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -passes="default<O3>" -S            < %s | FileCheck %s --check-prefix=SSE
   3 ; RUN: opt -passes="default<O3>" -S -mattr=avx < %s | FileCheck %s --check-prefix=AVX
   4
   5 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   6 target triple = "x86_64--"
     ; The two target lines above pin the module to an x86-64 triple and data
     ; layout; the RUN lines then vary only the feature set (default vs. -mattr=avx).
   7
     ; Wrapper type whose single member is a <4 x float>. The by-value helpers
     ; below use it as the alloca type and as the GEP source element type.
   8 %union.ElementWiseAccess = type { <4 x float> }
   9
     ; COMDAT group for the linkonce_odr definition of @getAt further down.
  10 $getAt = comdat any
  11
  12 define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
  13 ; SSE-LABEL: @ConvertVectors_ByRef(
  14 ; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
  15 ; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  16 ; SSE-NEXT:    ret <4 x float> [[TMP3]]
  17 ;
  18 ; AVX-LABEL: @ConvertVectors_ByRef(
  19 ; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
  20 ; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  21 ; AVX-NEXT:    ret <4 x float> [[TMP3]]
  22 ;
     ; Test input: builds a <4 x float> one lane at a time. Each lane reloads the
     ; argument pointer from its stack slot, passes it through
     ; @castToElementWiseAccess_ByRef, and reads one float with @getAt.
     ; The assertions above expect the whole sequence to become one
     ; <4 x float> load followed by a shufflevector with mask <0, 1, 2, 2>,
     ; identically for both RUN configurations.
  23   %2 = alloca ptr, align 8
  24   %3 = alloca <4 x float>, align 16
  25   store ptr %0, ptr %2, align 8
       ; Lane 0 <- element 0.
  26   %4 = load ptr, ptr %2, align 8
  27   %5 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %4)
  28   %6 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %5, i32 noundef 0)
  29   %7 = insertelement <4 x float> undef, float %6, i32 0
       ; Lane 1 <- element 1.
  30   %8 = load ptr, ptr %2, align 8
  31   %9 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %8)
  32   %10 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %9, i32 noundef 1)
  33   %11 = insertelement <4 x float> %7, float %10, i32 1
       ; Lane 2 <- element 2.
  34   %12 = load ptr, ptr %2, align 8
  35   %13 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %12)
  36   %14 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %13, i32 noundef 2)
  37   %15 = insertelement <4 x float> %11, float %14, i32 2
       ; Lane 3 <- element 2 again (not element 3); this repeated index is what
       ; yields the trailing "i32 2, i32 2" in the expected shuffle mask.
  38   %16 = load ptr, ptr %2, align 8
  39   %17 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %16)
  40   %18 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %17, i32 noundef 2)
  41   %19 = insertelement <4 x float> %15, float %18, i32 3
       ; Round-trip the assembled vector through a stack slot before returning it.
  42   store <4 x float> %19, ptr %3, align 16
  43   %20 = load <4 x float>, ptr %3, align 16
  44   ret <4 x float> %20
  45 }
  46
  47 define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
  48 ; SSE-LABEL: @ConvertVectors_ByVal(
  49 ; SSE-NEXT:  entry:
  50 ; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
  51 ; SSE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 8
  52 ; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
  53 ; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
  54 ; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
  55 ; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
  56 ; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
  57 ; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
  58 ;
  59 ; AVX-LABEL: @ConvertVectors_ByVal(
  60 ; AVX-NEXT:  entry:
  61 ; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
  62 ; AVX-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 8
  63 ; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
  64 ; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
  65 ; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
  66 ; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
  67 ; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
  68 ; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
  69 ;
     ; Test input: same lane-by-lane construction as @ConvertVectors_ByRef, but
     ; each lane obtains the data by value. @castToElementWiseAccess_ByVal
     ; returns the 16 bytes as { double, double }, both halves are stored into a
     ; per-lane temporary, and @ElementWiseAccess5getAt reads one float from it.
     ; The assertions above record the current result: a full <4 x float> load
     ; from %V plus a separate i64 load at byte offset 8 whose truncated low
     ; 32 bits are bitcast to float and inserted into lanes 2 and 3.
     ; NOTE(review): presumably a less complete combine than the ByRef case
     ; (no single load + shuffle) -- confirm whether that is the intended baseline.
  70 entry:
  71   %V.addr = alloca ptr, align 8
  72   %.compoundliteral = alloca <4 x float>, align 16
       ; One 16-byte temporary per lane.
  73   %ref.tmp = alloca %union.ElementWiseAccess, align 16
  74   %ref.tmp2 = alloca %union.ElementWiseAccess, align 16
  75   %ref.tmp7 = alloca %union.ElementWiseAccess, align 16
  76   %ref.tmp12 = alloca %union.ElementWiseAccess, align 16
  77   store ptr %V, ptr %V.addr, align 8
       ; Lane 0 <- element 0 of the by-value copy held in %ref.tmp.
  78   call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp) #4
  79   %0 = load ptr, ptr %V.addr, align 8
  80   %call = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %0)
  81   %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp, i32 0, i32 0
  82   %1 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 0
  83   %2 = extractvalue { double, double } %call, 0
  84   store double %2, ptr %1, align 16
  85   %3 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 1
  86   %4 = extractvalue { double, double } %call, 1
  87   store double %4, ptr %3, align 8
  88   %call1 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp, i32 noundef 0)
  89   %vecinit = insertelement <4 x float> undef, float %call1, i32 0
       ; Lane 1 <- element 1 of the by-value copy held in %ref.tmp2.
  90   call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp2) #4
  91   %5 = load ptr, ptr %V.addr, align 8
  92   %call3 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %5)
  93   %coerce.dive4 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp2, i32 0, i32 0
  94   %6 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 0
  95   %7 = extractvalue { double, double } %call3, 0
  96   store double %7, ptr %6, align 16
  97   %8 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 1
  98   %9 = extractvalue { double, double } %call3, 1
  99   store double %9, ptr %8, align 8
 100   %call5 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp2, i32 noundef 1)
 101   %vecinit6 = insertelement <4 x float> %vecinit, float %call5, i32 1
       ; Lane 2 <- element 2 of the by-value copy held in %ref.tmp7.
 102   call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp7) #4
 103   %10 = load ptr, ptr %V.addr, align 8
 104   %call8 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %10)
 105   %coerce.dive9 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp7, i32 0, i32 0
 106   %11 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 0
 107   %12 = extractvalue { double, double } %call8, 0
 108   store double %12, ptr %11, align 16
 109   %13 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 1
 110   %14 = extractvalue { double, double } %call8, 1
 111   store double %14, ptr %13, align 8
 112   %call10 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp7, i32 noundef 2)
 113   %vecinit11 = insertelement <4 x float> %vecinit6, float %call10, i32 2
       ; Lane 3 <- element 2 again (not element 3), read from %ref.tmp12; the
       ; expected output accordingly inserts the same float into lanes 2 and 3.
 114   call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp12) #4
 115   %15 = load ptr, ptr %V.addr, align 8
 116   %call13 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %15)
 117   %coerce.dive14 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp12, i32 0, i32 0
 118   %16 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 0
 119   %17 = extractvalue { double, double } %call13, 0
 120   store double %17, ptr %16, align 16
 121   %18 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 1
 122   %19 = extractvalue { double, double } %call13, 1
 123   store double %19, ptr %18, align 8
 124   %call15 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp12, i32 noundef 2)
 125   %vecinit16 = insertelement <4 x float> %vecinit11, float %call15, i32 3
       ; Round-trip the assembled vector through %.compoundliteral, then end the
       ; temporaries' lifetimes in reverse order of their lifetime.start calls.
 126   store <4 x float> %vecinit16, ptr %.compoundliteral, align 16
 127   %20 = load <4 x float>, ptr %.compoundliteral, align 16
 128   call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp12) #4
 129   call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp7) #4
 130   call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp2) #4
 131   call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp) #4
 132   ret <4 x float> %20
 133 }
 134
 135 define internal { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %t) #1 {
     ; Helper for @ConvertVectors_ByVal: copies the 16 bytes at %t into a local
     ; %union.ElementWiseAccess and returns them as a { double, double } pair.
 136 entry:
 137   %retval = alloca %union.ElementWiseAccess, align 16
 138   %t.addr = alloca ptr, align 8
 139   store ptr %t, ptr %t.addr, align 8
 140   %0 = load ptr, ptr %t.addr, align 8
       ; 16-byte, non-volatile copy from the argument into the return slot.
 141   call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval, ptr align 16 %0, i64 16, i1 false)
 142   %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %retval, i32 0, i32 0
       ; Reinterpret the copied bytes as two doubles for the return value.
 143   %1 = load { double, double }, ptr %coerce.dive, align 16
 144   ret { double, double } %1
 145 }
 146
     ; Intrinsics used by the by-value test: lifetime markers for the per-lane
     ; temporaries in @ConvertVectors_ByVal and the memcpy in
     ; @castToElementWiseAccess_ByVal.
 147 declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
 148 declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
 149 declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3
 150
 151 define internal noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #1 {
     ; Helper for @ConvertVectors_ByRef: returns its pointer argument unchanged,
     ; after spilling it to a stack slot and reloading it.
 152   %2 = alloca ptr, align 8
 153   store ptr %0, ptr %2, align 8
 154   %3 = load ptr, ptr %2, align 8
 155   ret ptr %3
 156 }
 157
 158 define linkonce_odr dso_local noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %0, i32 noundef %1) #1 comdat align 2 {
     ; Element accessor used by @ConvertVectors_ByRef: treats %0 as a
     ; [4 x float] and returns the float at index %1. Both arguments are
     ; spilled to stack slots and reloaded before use.
 159   %3 = alloca ptr, align 8
 160   %4 = alloca i32, align 4
 161   store ptr %0, ptr %3, align 8
 162   store i32 %1, ptr %4, align 4
 163   %5 = load ptr, ptr %3, align 8
 164   %6 = load i32, ptr %4, align 4
       ; Sign-extend the i32 index to i64 for the GEP.
 165   %7 = sext i32 %6 to i64
 166   %8 = getelementptr inbounds [4 x float], ptr %5, i64 0, i64 %7
 167   %9 = load float, ptr %8, align 4
 168   ret float %9
 169 }
 170
 171 define linkonce_odr noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %this, i32 noundef %i) #1 align 2 {
     ; Element accessor used by @ConvertVectors_ByVal: treats %this as a
     ; [4 x float] and returns the float at index %i. Same instruction sequence
     ; as @getAt, written with named values.
 172 entry:
 173   %this.addr = alloca ptr, align 8
 174   %i.addr = alloca i32, align 4
 175   store ptr %this, ptr %this.addr, align 8
 176   store i32 %i, ptr %i.addr, align 4
 177   %this1 = load ptr, ptr %this.addr, align 8
 178   %0 = load i32, ptr %i.addr, align 4
       ; Sign-extend the i32 index to i64 for the GEP.
 179   %idxprom = sext i32 %0 to i64
 180   %arrayidx = getelementptr inbounds [4 x float], ptr %this1, i64 0, i64 %idxprom
 181   %1 = load float, ptr %arrayidx, align 4
 182   ret float %1
 183 }
 184
 185 ; Vector combine + SLP should form a narrow load and a vector cast
 186
 187 define void @PR51397(ptr noundef %dst, ptr noundef %srcp) {
 188 ; SSE-LABEL: @PR51397(
 189 ; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[SRCP:%.*]], align 16
 190 ; SSE-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 191 ; SSE-NEXT:    store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 16
 192 ; SSE-NEXT:    ret void
 193 ;
 194 ; AVX-LABEL: @PR51397(
 195 ; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[SRCP:%.*]], align 16
 196 ; AVX-NEXT:    [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
 197 ; AVX-NEXT:    store <4 x float> [[TMP2]], ptr [[DST:%.*]], align 16
 198 ; AVX-NEXT:    ret void
 199 ;
     ; Test input: loads an <8 x i32>, converts only elements 0..3 to float one
     ; at a time, packs them into a <4 x float>, and stores that to %dst.
     ; The assertions above expect the load to shrink to <4 x i32> and the four
     ; scalar conversions to become a single vector sitofp.
 200   %src = load <8 x i32>, ptr %srcp, align 16
       ; Lane 0.
 201   %vecext = extractelement <8 x i32> %src, i32 0
 202   %conv = sitofp i32 %vecext to float
 203   %vecinit = insertelement <4 x float> undef, float %conv, i32 0
       ; Lane 1.
 204   %vecext1 = extractelement <8 x i32> %src, i32 1
 205   %conv2 = sitofp i32 %vecext1 to float
 206   %vecinit3 = insertelement <4 x float> %vecinit, float %conv2, i32 1
       ; Lane 2.
 207   %vecext4 = extractelement <8 x i32> %src, i32 2
 208   %conv5 = sitofp i32 %vecext4 to float
 209   %vecinit6 = insertelement <4 x float> %vecinit3, float %conv5, i32 2
       ; Lane 3. Elements 4..7 of %src are never read.
 210   %vecext7 = extractelement <8 x i32> %src, i32 3
 211   %conv8 = sitofp i32 %vecext7 to float
 212   %vecinit9 = insertelement <4 x float> %vecinit6, float %conv8, i32 3
 213   store <4 x float> %vecinit9, ptr %dst, align 16
 214   ret void
 215 }