llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -vector-library=Accelerate -S %s | FileCheck %s
   3 ; RUN: opt -passes=inject-tli-mappings,slp-vectorizer -S %s | FileCheck --check-prefix NOACCELERATE %s
   4
   5 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
   6 target triple = "arm64-apple-ios14.0.0"
   7
   8 declare float @llvm.sin.f32(float)
   9
  10 ; Accelerate provides sin() for <4 x float>
     ; Input: each lane of the loaded <4 x float> is extracted, passed through a
     ; scalar llvm.sin.f32 call, and reinserted at the same index.
     ; First RUN line (-vector-library=Accelerate): the four calls are expected
     ; to collapse into one @vsinf call on the whole vector.
     ; Second RUN line (no vector library): lanes 0-1 are expected to stay
     ; scalar while lanes 2-3 become a single llvm.sin.v2f32 call.
  11 define <4 x float> @int_sin_4x(ptr %a) {
  12 ; CHECK-LABEL: @int_sin_4x(
  13 ; CHECK-NEXT:  entry:
  14 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
  15 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
  16 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
  17 ;
  18 ; NOACCELERATE-LABEL: @int_sin_4x(
  19 ; NOACCELERATE-NEXT:  entry:
  20 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
  21 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
  22 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
  23 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
  24 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
  25 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
  26 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
  27 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
  28 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
  29 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  30 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  31 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
  32 ;
  33 entry:
  34   %0 = load <4 x float>, ptr %a, align 16
  35   %vecext = extractelement <4 x float> %0, i32 0
  36   %1 = tail call fast float @llvm.sin.f32(float %vecext)
  37   %vecins = insertelement <4 x float> undef, float %1, i32 0
  38   %vecext.1 = extractelement <4 x float> %0, i32 1
  39   %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
  40   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
  41   %vecext.2 = extractelement <4 x float> %0, i32 2
  42   %3 = tail call fast float @llvm.sin.f32(float %vecext.2)
  43   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
  44   %vecext.3 = extractelement <4 x float> %0, i32 3
  45   %4 = tail call fast float @llvm.sin.f32(float %vecext.3)
  46   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
  47   ret <4 x float> %vecins.3
  48 }
  49
  50 declare float @ceilf(float) readonly nounwind willreturn
  51
     ; Input: four scalar @ceilf libcalls, one per lane of the loaded vector.
     ; Both RUN lines expect the same result: a single llvm.ceil.v4f32 intrinsic
     ; call on the whole vector (no vector-library routine appears in either
     ; expected output).
  52 define <4 x float> @ceil_4x(ptr %a) {
  53 ; CHECK-LABEL: @ceil_4x(
  54 ; CHECK-NEXT:  entry:
  55 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
  56 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
  57 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
  58 ;
  59 ; NOACCELERATE-LABEL: @ceil_4x(
  60 ; NOACCELERATE-NEXT:  entry:
  61 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
  62 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
  63 ; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
  64 ;
  65 entry:
  66   %0 = load <4 x float>, ptr %a, align 16
  67   %vecext = extractelement <4 x float> %0, i32 0
  68   %1 = tail call fast float @ceilf(float %vecext)
  69   %vecins = insertelement <4 x float> undef, float %1, i32 0
  70   %vecext.1 = extractelement <4 x float> %0, i32 1
  71   %2 = tail call fast float @ceilf(float %vecext.1)
  72   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
  73   %vecext.2 = extractelement <4 x float> %0, i32 2
  74   %3 = tail call fast float @ceilf(float %vecext.2)
  75   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
  76   %vecext.3 = extractelement <4 x float> %0, i32 3
  77   %4 = tail call fast float @ceilf(float %vecext.3)
  78   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
  79   ret <4 x float> %vecins.3
  80 }
  81
  82 declare float @fabsf(float) readonly nounwind willreturn
  83
     ; Input: four scalar @fabsf libcalls, one per lane of the loaded vector.
     ; Both RUN lines expect the same result: a single llvm.fabs.v4f32 intrinsic
     ; call on the whole vector.
  84 define <4 x float> @fabs_4x(ptr %a) {
  85 ; CHECK-LABEL: @fabs_4x(
  86 ; CHECK-NEXT:  entry:
  87 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
  88 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
  89 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
  90 ;
  91 ; NOACCELERATE-LABEL: @fabs_4x(
  92 ; NOACCELERATE-NEXT:  entry:
  93 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
  94 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
  95 ; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
  96 ;
  97 entry:
  98   %0 = load <4 x float>, ptr %a, align 16
  99   %vecext = extractelement <4 x float> %0, i32 0
 100   %1 = tail call fast float @fabsf(float %vecext)
 101   %vecins = insertelement <4 x float> undef, float %1, i32 0
 102   %vecext.1 = extractelement <4 x float> %0, i32 1
 103   %2 = tail call fast float @fabsf(float %vecext.1)
 104   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 105   %vecext.2 = extractelement <4 x float> %0, i32 2
 106   %3 = tail call fast float @fabsf(float %vecext.2)
 107   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 108   %vecext.3 = extractelement <4 x float> %0, i32 3
 109   %4 = tail call fast float @fabsf(float %vecext.3)
 110   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 111   ret <4 x float> %vecins.3
 112 }
 113 declare float @llvm.fabs.f32(float)
     ; Intrinsic counterpart of fabs_4x: the scalar calls use llvm.fabs.f32
     ; instead of the @fabsf libcall.
     ; Both RUN lines expect a single llvm.fabs.v4f32 call on the whole vector.
 114 define <4 x float> @int_fabs_4x(ptr %a) {
 115 ; CHECK-LABEL: @int_fabs_4x(
 116 ; CHECK-NEXT:  entry:
 117 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 118 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
 119 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 120 ;
 121 ; NOACCELERATE-LABEL: @int_fabs_4x(
 122 ; NOACCELERATE-NEXT:  entry:
 123 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 124 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
 125 ; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 126 ;
 127 entry:
 128   %0 = load <4 x float>, ptr %a, align 16
 129   %vecext = extractelement <4 x float> %0, i32 0
 130   %1 = tail call fast float @llvm.fabs.f32(float %vecext)
 131   %vecins = insertelement <4 x float> undef, float %1, i32 0
 132   %vecext.1 = extractelement <4 x float> %0, i32 1
 133   %2 = tail call fast float @llvm.fabs.f32(float %vecext.1)
 134   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 135   %vecext.2 = extractelement <4 x float> %0, i32 2
 136   %3 = tail call fast float @llvm.fabs.f32(float %vecext.2)
 137   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 138   %vecext.3 = extractelement <4 x float> %0, i32 3
 139   %4 = tail call fast float @llvm.fabs.f32(float %vecext.3)
 140   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 141   ret <4 x float> %vecins.3
 142 }
 143 declare float @floorf(float) readonly nounwind willreturn
     ; Input: four scalar @floorf libcalls, one per lane of the loaded vector.
     ; Both RUN lines expect the same result: a single llvm.floor.v4f32
     ; intrinsic call on the whole vector.
 144 define <4 x float> @floor_4x(ptr %a) {
 145 ; CHECK-LABEL: @floor_4x(
 146 ; CHECK-NEXT:  entry:
 147 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 148 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
 149 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 150 ;
 151 ; NOACCELERATE-LABEL: @floor_4x(
 152 ; NOACCELERATE-NEXT:  entry:
 153 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 154 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
 155 ; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 156 ;
 157 entry:
 158   %0 = load <4 x float>, ptr %a, align 16
 159   %vecext = extractelement <4 x float> %0, i32 0
 160   %1 = tail call fast float @floorf(float %vecext)
 161   %vecins = insertelement <4 x float> undef, float %1, i32 0
 162   %vecext.1 = extractelement <4 x float> %0, i32 1
 163   %2 = tail call fast float @floorf(float %vecext.1)
 164   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 165   %vecext.2 = extractelement <4 x float> %0, i32 2
 166   %3 = tail call fast float @floorf(float %vecext.2)
 167   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 168   %vecext.3 = extractelement <4 x float> %0, i32 3
 169   %4 = tail call fast float @floorf(float %vecext.3)
 170   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 171   ret <4 x float> %vecins.3
 172 }
 173 declare float @sqrtf(float) readonly nounwind willreturn
     ; Input: four scalar @sqrtf libcalls, one per lane of the loaded vector.
     ; Both RUN lines expect the same result: a single llvm.sqrt.v4f32 intrinsic
     ; call on the whole vector.
 174 define <4 x float> @sqrt_4x(ptr %a) {
 175 ; CHECK-LABEL: @sqrt_4x(
 176 ; CHECK-NEXT:  entry:
 177 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 178 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
 179 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 180 ;
 181 ; NOACCELERATE-LABEL: @sqrt_4x(
 182 ; NOACCELERATE-NEXT:  entry:
 183 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 184 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
 185 ; NOACCELERATE-NEXT:    ret <4 x float> [[TMP1]]
 186 ;
 187 entry:
 188   %0 = load <4 x float>, ptr %a, align 16
 189   %vecext = extractelement <4 x float> %0, i32 0
 190   %1 = tail call fast float @sqrtf(float %vecext)
 191   %vecins = insertelement <4 x float> undef, float %1, i32 0
 192   %vecext.1 = extractelement <4 x float> %0, i32 1
 193   %2 = tail call fast float @sqrtf(float %vecext.1)
 194   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 195   %vecext.2 = extractelement <4 x float> %0, i32 2
 196   %3 = tail call fast float @sqrtf(float %vecext.2)
 197   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 198   %vecext.3 = extractelement <4 x float> %0, i32 3
 199   %4 = tail call fast float @sqrtf(float %vecext.3)
 200   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 201   ret <4 x float> %vecins.3
 202 }
 203 declare float @expf(float) readonly nounwind willreturn
     ; Input: four scalar @expf libcalls, one per lane of the loaded vector.
     ; First RUN line (-vector-library=Accelerate): expects one @vexpf call on
     ; the whole vector.
     ; Second RUN line (no vector library): expects lanes 0-1 to remain scalar
     ; @expf calls while lanes 2-3 become a single llvm.exp.v2f32 call.
 204 define <4 x float> @exp_4x(ptr %a) {
 205 ; CHECK-LABEL: @exp_4x(
 206 ; CHECK-NEXT:  entry:
 207 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 208 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpf(<4 x float> [[TMP0]])
 209 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 210 ;
 211 ; NOACCELERATE-LABEL: @exp_4x(
 212 ; NOACCELERATE-NEXT:  entry:
 213 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 214 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 215 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]])
 216 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 217 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 218 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 219 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 220 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 221 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
 222 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 223 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 224 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 225 ;
 226 entry:
 227   %0 = load <4 x float>, ptr %a, align 16
 228   %vecext = extractelement <4 x float> %0, i32 0
 229   %1 = tail call fast float @expf(float %vecext)
 230   %vecins = insertelement <4 x float> undef, float %1, i32 0
 231   %vecext.1 = extractelement <4 x float> %0, i32 1
 232   %2 = tail call fast float @expf(float %vecext.1)
 233   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 234   %vecext.2 = extractelement <4 x float> %0, i32 2
 235   %3 = tail call fast float @expf(float %vecext.2)
 236   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 237   %vecext.3 = extractelement <4 x float> %0, i32 3
 238   %4 = tail call fast float @expf(float %vecext.3)
 239   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 240   ret <4 x float> %vecins.3
 241 }
 242 declare float @expm1f(float) readonly nounwind willreturn
     ; Input: four scalar @expm1f libcalls, one per lane of the loaded vector.
     ; First RUN line (-vector-library=Accelerate): expects one @vexpm1f call on
     ; the whole vector.
     ; Second RUN line (no vector library): expects the scalar extract / call /
     ; insert sequence to be left as-is for all four lanes.
 243 define <4 x float> @expm1_4x(ptr %a) {
 244 ; CHECK-LABEL: @expm1_4x(
 245 ; CHECK-NEXT:  entry:
 246 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 247 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vexpm1f(<4 x float> [[TMP0]])
 248 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 249 ;
 250 ; NOACCELERATE-LABEL: @expm1_4x(
 251 ; NOACCELERATE-NEXT:  entry:
 252 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 253 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 254 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @expm1f(float [[VECEXT]])
 255 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 256 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 257 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @expm1f(float [[VECEXT_1]])
 258 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 259 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 260 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @expm1f(float [[VECEXT_2]])
 261 ; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 262 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
 263 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @expm1f(float [[VECEXT_3]])
 264 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 265 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 266 ;
 267 entry:
 268   %0 = load <4 x float>, ptr %a, align 16
 269   %vecext = extractelement <4 x float> %0, i32 0
 270   %1 = tail call fast float @expm1f(float %vecext)
 271   %vecins = insertelement <4 x float> undef, float %1, i32 0
 272   %vecext.1 = extractelement <4 x float> %0, i32 1
 273   %2 = tail call fast float @expm1f(float %vecext.1)
 274   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 275   %vecext.2 = extractelement <4 x float> %0, i32 2
 276   %3 = tail call fast float @expm1f(float %vecext.2)
 277   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 278   %vecext.3 = extractelement <4 x float> %0, i32 3
 279   %4 = tail call fast float @expm1f(float %vecext.3)
 280   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 281   ret <4 x float> %vecins.3
 282 }
 283 declare float @logf(float) readonly nounwind willreturn
     ; Input: four scalar @logf libcalls, one per lane of the loaded vector.
     ; First RUN line (-vector-library=Accelerate): expects one @vlogf call on
     ; the whole vector.
     ; Second RUN line (no vector library): expects lanes 0-1 to remain scalar
     ; @logf calls while lanes 2-3 become a single llvm.log.v2f32 call.
 284 define <4 x float> @log_4x(ptr %a) {
 285 ; CHECK-LABEL: @log_4x(
 286 ; CHECK-NEXT:  entry:
 287 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 288 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogf(<4 x float> [[TMP0]])
 289 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 290 ;
 291 ; NOACCELERATE-LABEL: @log_4x(
 292 ; NOACCELERATE-NEXT:  entry:
 293 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 294 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 295 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]])
 296 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 297 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 298 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 299 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 300 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 301 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
 302 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 303 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 304 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 305 ;
 306 entry:
 307   %0 = load <4 x float>, ptr %a, align 16
 308   %vecext = extractelement <4 x float> %0, i32 0
 309   %1 = tail call fast float @logf(float %vecext)
 310   %vecins = insertelement <4 x float> undef, float %1, i32 0
 311   %vecext.1 = extractelement <4 x float> %0, i32 1
 312   %2 = tail call fast float @logf(float %vecext.1)
 313   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 314   %vecext.2 = extractelement <4 x float> %0, i32 2
 315   %3 = tail call fast float @logf(float %vecext.2)
 316   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 317   %vecext.3 = extractelement <4 x float> %0, i32 3
 318   %4 = tail call fast float @logf(float %vecext.3)
 319   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 320   ret <4 x float> %vecins.3
 321 }
 322 declare float @log1pf(float) readonly nounwind willreturn
     ; Input: four scalar @log1pf libcalls, one per lane of the loaded vector.
     ; First RUN line (-vector-library=Accelerate): expects one @vlog1pf call on
     ; the whole vector.
     ; Second RUN line (no vector library): expects the scalar extract / call /
     ; insert sequence to be left as-is for all four lanes.
 323 define <4 x float> @log1p_4x(ptr %a) {
 324 ; CHECK-LABEL: @log1p_4x(
 325 ; CHECK-NEXT:  entry:
 326 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 327 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlog1pf(<4 x float> [[TMP0]])
 328 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 329 ;
 330 ; NOACCELERATE-LABEL: @log1p_4x(
 331 ; NOACCELERATE-NEXT:  entry:
 332 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 333 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 334 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @log1pf(float [[VECEXT]])
 335 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 336 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 337 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @log1pf(float [[VECEXT_1]])
 338 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 339 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 340 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @log1pf(float [[VECEXT_2]])
 341 ; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 342 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
 343 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @log1pf(float [[VECEXT_3]])
 344 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 345 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 346 ;
 347 entry:
 348   %0 = load <4 x float>, ptr %a, align 16
 349   %vecext = extractelement <4 x float> %0, i32 0
 350   %1 = tail call fast float @log1pf(float %vecext)
 351   %vecins = insertelement <4 x float> undef, float %1, i32 0
 352   %vecext.1 = extractelement <4 x float> %0, i32 1
 353   %2 = tail call fast float @log1pf(float %vecext.1)
 354   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 355   %vecext.2 = extractelement <4 x float> %0, i32 2
 356   %3 = tail call fast float @log1pf(float %vecext.2)
 357   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 358   %vecext.3 = extractelement <4 x float> %0, i32 3
 359   %4 = tail call fast float @log1pf(float %vecext.3)
 360   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 361   ret <4 x float> %vecins.3
 362 }
 363 declare float @log10pf(float) readonly nounwind willreturn
     ; Input: four scalar @log10pf calls, one per lane of the loaded vector.
     ; Both RUN lines expect the scalar extract / call / insert sequence to be
     ; left as-is for all four lanes, i.e. no vector call is formed with or
     ; without -vector-library=Accelerate.
     ; NOTE(review): log10pf looks like a deliberately unmapped name used as a
     ; negative test -- confirm against the Accelerate mapping table.
 364 define <4 x float> @log10p_4x(ptr %a) {
 365 ; CHECK-LABEL: @log10p_4x(
 366 ; CHECK-NEXT:  entry:
 367 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 368 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 369 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
 370 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 371 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 372 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
 373 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 374 ; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 375 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
 376 ; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 377 ; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
 378 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
 379 ; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 380 ; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 381 ;
 382 ; NOACCELERATE-LABEL: @log10p_4x(
 383 ; NOACCELERATE-NEXT:  entry:
 384 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 385 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 386 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @log10pf(float [[VECEXT]])
 387 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 388 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 389 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @log10pf(float [[VECEXT_1]])
 390 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 391 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 392 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @log10pf(float [[VECEXT_2]])
 393 ; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 394 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
 395 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @log10pf(float [[VECEXT_3]])
 396 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 397 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 398 ;
 399 entry:
 400   %0 = load <4 x float>, ptr %a, align 16
 401   %vecext = extractelement <4 x float> %0, i32 0
 402   %1 = tail call fast float @log10pf(float %vecext)
 403   %vecins = insertelement <4 x float> undef, float %1, i32 0
 404   %vecext.1 = extractelement <4 x float> %0, i32 1
 405   %2 = tail call fast float @log10pf(float %vecext.1)
 406   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 407   %vecext.2 = extractelement <4 x float> %0, i32 2
 408   %3 = tail call fast float @log10pf(float %vecext.2)
 409   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 410   %vecext.3 = extractelement <4 x float> %0, i32 3
 411   %4 = tail call fast float @log10pf(float %vecext.3)
 412   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 413   ret <4 x float> %vecins.3
 414 }
 415 declare float @logbf(float) readonly nounwind willreturn
     ; Input: four scalar @logbf libcalls, one per lane of the loaded vector.
     ; First RUN line (-vector-library=Accelerate): expects one @vlogbf call on
     ; the whole vector.
     ; Second RUN line (no vector library): expects the scalar extract / call /
     ; insert sequence to be left as-is for all four lanes.
 416 define <4 x float> @logb_4x(ptr %a) {
 417 ; CHECK-LABEL: @logb_4x(
 418 ; CHECK-NEXT:  entry:
 419 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 420 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vlogbf(<4 x float> [[TMP0]])
 421 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 422 ;
 423 ; NOACCELERATE-LABEL: @logb_4x(
 424 ; NOACCELERATE-NEXT:  entry:
 425 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 426 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 427 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @logbf(float [[VECEXT]])
 428 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 429 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 430 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @logbf(float [[VECEXT_1]])
 431 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 432 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 433 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @logbf(float [[VECEXT_2]])
 434 ; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
 435 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
 436 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @logbf(float [[VECEXT_3]])
 437 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
 438 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 439 ;
 440 entry:
 441   %0 = load <4 x float>, ptr %a, align 16
 442   %vecext = extractelement <4 x float> %0, i32 0
 443   %1 = tail call fast float @logbf(float %vecext)
 444   %vecins = insertelement <4 x float> undef, float %1, i32 0
 445   %vecext.1 = extractelement <4 x float> %0, i32 1
 446   %2 = tail call fast float @logbf(float %vecext.1)
 447   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 448   %vecext.2 = extractelement <4 x float> %0, i32 2
 449   %3 = tail call fast float @logbf(float %vecext.2)
 450   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 451   %vecext.3 = extractelement <4 x float> %0, i32 3
 452   %4 = tail call fast float @logbf(float %vecext.3)
 453   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 454   ret <4 x float> %vecins.3
 455 }
 456 declare float @sinf(float) readonly nounwind willreturn
     ; Libcall counterpart of int_sin_4x: the scalar calls use @sinf instead of
     ; the llvm.sin.f32 intrinsic.
     ; First RUN line (-vector-library=Accelerate): expects one @vsinf call on
     ; the whole vector.
     ; Second RUN line (no vector library): expects lanes 0-1 to remain scalar
     ; @sinf calls while lanes 2-3 become a single llvm.sin.v2f32 call.
 457 define <4 x float> @sin_4x(ptr %a) {
 458 ; CHECK-LABEL: @sin_4x(
 459 ; CHECK-NEXT:  entry:
 460 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 461 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
 462 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 463 ;
 464 ; NOACCELERATE-LABEL: @sin_4x(
 465 ; NOACCELERATE-NEXT:  entry:
 466 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 467 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 468 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]])
 469 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 470 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 471 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 472 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 473 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 474 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
 475 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 476 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 477 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 478 ;
 479 entry:
 480   %0 = load <4 x float>, ptr %a, align 16
 481   %vecext = extractelement <4 x float> %0, i32 0
 482   %1 = tail call fast float @sinf(float %vecext)
 483   %vecins = insertelement <4 x float> undef, float %1, i32 0
 484   %vecext.1 = extractelement <4 x float> %0, i32 1
 485   %2 = tail call fast float @sinf(float %vecext.1)
 486   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 487   %vecext.2 = extractelement <4 x float> %0, i32 2
 488   %3 = tail call fast float @sinf(float %vecext.2)
 489   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 490   %vecext.3 = extractelement <4 x float> %0, i32 3
 491   %4 = tail call fast float @sinf(float %vecext.3)
 492   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 493   ret <4 x float> %vecins.3
 494 }
 495 declare float @cosf(float) readonly nounwind willreturn
     ; Input: four scalar @cosf libcalls, one per lane of the loaded vector.
     ; First RUN line (-vector-library=Accelerate): expects one @vcosf call on
     ; the whole vector.
     ; Second RUN line (no vector library): expects lanes 0-1 to remain scalar
     ; @cosf calls while lanes 2-3 become a single llvm.cos.v2f32 call.
 496 define <4 x float> @cos_4x(ptr %a) {
 497 ; CHECK-LABEL: @cos_4x(
 498 ; CHECK-NEXT:  entry:
 499 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 500 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
 501 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 502 ;
 503 ; NOACCELERATE-LABEL: @cos_4x(
 504 ; NOACCELERATE-NEXT:  entry:
 505 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 506 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 507 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]])
 508 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 509 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 510 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]])
 511 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 512 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 513 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
 514 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 515 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 516 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 517 ;
 518 entry:
 519   %0 = load <4 x float>, ptr %a, align 16
 520   %vecext = extractelement <4 x float> %0, i32 0
 521   %1 = tail call fast float @cosf(float %vecext)
 522   %vecins = insertelement <4 x float> undef, float %1, i32 0
 523   %vecext.1 = extractelement <4 x float> %0, i32 1
 524   %2 = tail call fast float @cosf(float %vecext.1)
 525   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 526   %vecext.2 = extractelement <4 x float> %0, i32 2
 527   %3 = tail call fast float @cosf(float %vecext.2)
 528   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 529   %vecext.3 = extractelement <4 x float> %0, i32 3
 530   %4 = tail call fast float @cosf(float %vecext.3)
 531   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 532   ret <4 x float> %vecins.3
 533 }
 534 declare float @tanf(float) readonly nounwind willreturn
     ; tan_4x: calls the libm function @tanf on each of the four lanes of the
     ; <4 x float> loaded from %a and rebuilds the result with insertelement.
     ; With -vector-library=Accelerate (prefix CHECK) the expected output is a
     ; single <4 x float> call to @vtanf. Without a vector library (prefix
     ; NOACCELERATE) lanes 0 and 1 are expected to stay scalar @tanf calls while
     ; lanes 2-3 become one <2 x float> @llvm.tan.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 535 define <4 x float> @tan_4x(ptr %a) {
 536 ; CHECK-LABEL: @tan_4x(
 537 ; CHECK-NEXT:  entry:
 538 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 539 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanf(<4 x float> [[TMP0]])
 540 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 541 ;
 542 ; NOACCELERATE-LABEL: @tan_4x(
 543 ; NOACCELERATE-NEXT:  entry:
 544 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 545 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 546 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @tanf(float [[VECEXT]])
 547 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 548 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 549 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @tanf(float [[VECEXT_1]])
 550 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 551 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 552 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.tan.v2f32(<2 x float> [[TMP3]])
 553 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 554 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 555 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 556 ;
 557 entry:
 558   %0 = load <4 x float>, ptr %a, align 16
 559   %vecext = extractelement <4 x float> %0, i32 0
 560   %1 = tail call fast float @tanf(float %vecext)
 561   %vecins = insertelement <4 x float> undef, float %1, i32 0
 562   %vecext.1 = extractelement <4 x float> %0, i32 1
 563   %2 = tail call fast float @tanf(float %vecext.1)
 564   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 565   %vecext.2 = extractelement <4 x float> %0, i32 2
 566   %3 = tail call fast float @tanf(float %vecext.2)
 567   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 568   %vecext.3 = extractelement <4 x float> %0, i32 3
 569   %4 = tail call fast float @tanf(float %vecext.3)
 570   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 571   ret <4 x float> %vecins.3
 572 }
 573 declare float @asinf(float) readonly nounwind willreturn
     ; asin_4x: calls the libm function @asinf on each of the four lanes of the
     ; <4 x float> loaded from %a and rebuilds the result with insertelement.
     ; With -vector-library=Accelerate (prefix CHECK) the expected output is a
     ; single <4 x float> call to @vasinf. Without a vector library (prefix
     ; NOACCELERATE) lanes 0 and 1 are expected to stay scalar @asinf calls while
     ; lanes 2-3 become one <2 x float> @llvm.asin.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 574 define <4 x float> @asin_4x(ptr %a) {
 575 ; CHECK-LABEL: @asin_4x(
 576 ; CHECK-NEXT:  entry:
 577 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 578 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
 579 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 580 ;
 581 ; NOACCELERATE-LABEL: @asin_4x(
 582 ; NOACCELERATE-NEXT:  entry:
 583 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 584 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 585 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @asinf(float [[VECEXT]])
 586 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 587 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 588 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @asinf(float [[VECEXT_1]])
 589 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 590 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 591 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
 592 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 593 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 594 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 595 ;
 596 entry:
 597   %0 = load <4 x float>, ptr %a, align 16
 598   %vecext = extractelement <4 x float> %0, i32 0
 599   %1 = tail call fast float @asinf(float %vecext)
 600   %vecins = insertelement <4 x float> undef, float %1, i32 0
 601   %vecext.1 = extractelement <4 x float> %0, i32 1
 602   %2 = tail call fast float @asinf(float %vecext.1)
 603   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 604   %vecext.2 = extractelement <4 x float> %0, i32 2
 605   %3 = tail call fast float @asinf(float %vecext.2)
 606   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 607   %vecext.3 = extractelement <4 x float> %0, i32 3
 608   %4 = tail call fast float @asinf(float %vecext.3)
 609   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 610   ret <4 x float> %vecins.3
 611 }
 612 define <4 x float> @int_asin_4x(ptr %a) {
     ; int_asin_4x: calls the intrinsic @llvm.asin.f32 on each of the four lanes
     ; of the <4 x float> loaded from %a and rebuilds the result with
     ; insertelement. With -vector-library=Accelerate (prefix CHECK) the expected
     ; output is a single <4 x float> call to @vasinf. Without a vector library
     ; (prefix NOACCELERATE) lanes 0 and 1 are expected to stay scalar
     ; @llvm.asin.f32 calls while lanes 2-3 become one @llvm.asin.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 613 ; CHECK-LABEL: @int_asin_4x(
 614 ; CHECK-NEXT:  entry:
 615 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 616 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinf(<4 x float> [[TMP0]])
 617 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 618 ;
 619 ; NOACCELERATE-LABEL: @int_asin_4x(
 620 ; NOACCELERATE-NEXT:  entry:
 621 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 622 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 623 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT]])
 624 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 625 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 626 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.asin.f32(float [[VECEXT_1]])
 627 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 628 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 629 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.asin.v2f32(<2 x float> [[TMP3]])
 630 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 631 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 632 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 633 ;
 634 entry:
 635   %0 = load <4 x float>, ptr %a, align 16
 636   %vecext = extractelement <4 x float> %0, i32 0
 637   %1 = tail call fast float @llvm.asin.f32(float %vecext)
 638   %vecins = insertelement <4 x float> undef, float %1, i32 0
 639   %vecext.1 = extractelement <4 x float> %0, i32 1
 640   %2 = tail call fast float @llvm.asin.f32(float %vecext.1)
 641   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 642   %vecext.2 = extractelement <4 x float> %0, i32 2
 643   %3 = tail call fast float @llvm.asin.f32(float %vecext.2)
 644   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 645   %vecext.3 = extractelement <4 x float> %0, i32 3
 646   %4 = tail call fast float @llvm.asin.f32(float %vecext.3)
 647   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 648   ret <4 x float> %vecins.3
 649 }
 650 declare float @acosf(float) readonly nounwind willreturn
     ; acos_4x: calls the libm function @acosf on each of the four lanes of the
     ; <4 x float> loaded from %a and rebuilds the result with insertelement.
     ; With -vector-library=Accelerate (prefix CHECK) the expected output is a
     ; single <4 x float> call to @vacosf. Without a vector library (prefix
     ; NOACCELERATE) lanes 0 and 1 are expected to stay scalar @acosf calls while
     ; lanes 2-3 become one <2 x float> @llvm.acos.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 651 define <4 x float> @acos_4x(ptr %a) {
 652 ; CHECK-LABEL: @acos_4x(
 653 ; CHECK-NEXT:  entry:
 654 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 655 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
 656 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 657 ;
 658 ; NOACCELERATE-LABEL: @acos_4x(
 659 ; NOACCELERATE-NEXT:  entry:
 660 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 661 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 662 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @acosf(float [[VECEXT]])
 663 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 664 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 665 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @acosf(float [[VECEXT_1]])
 666 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 667 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 668 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
 669 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 670 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 671 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 672 ;
 673 entry:
 674   %0 = load <4 x float>, ptr %a, align 16
 675   %vecext = extractelement <4 x float> %0, i32 0
 676   %1 = tail call fast float @acosf(float %vecext)
 677   %vecins = insertelement <4 x float> undef, float %1, i32 0
 678   %vecext.1 = extractelement <4 x float> %0, i32 1
 679   %2 = tail call fast float @acosf(float %vecext.1)
 680   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 681   %vecext.2 = extractelement <4 x float> %0, i32 2
 682   %3 = tail call fast float @acosf(float %vecext.2)
 683   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 684   %vecext.3 = extractelement <4 x float> %0, i32 3
 685   %4 = tail call fast float @acosf(float %vecext.3)
 686   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 687   ret <4 x float> %vecins.3
 688 }
 689 define <4 x float> @int_acos_4x(ptr %a) {
     ; int_acos_4x: calls the intrinsic @llvm.acos.f32 on each of the four lanes
     ; of the <4 x float> loaded from %a and rebuilds the result with
     ; insertelement. With -vector-library=Accelerate (prefix CHECK) the expected
     ; output is a single <4 x float> call to @vacosf. Without a vector library
     ; (prefix NOACCELERATE) lanes 0 and 1 are expected to stay scalar
     ; @llvm.acos.f32 calls while lanes 2-3 become one @llvm.acos.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 690 ; CHECK-LABEL: @int_acos_4x(
 691 ; CHECK-NEXT:  entry:
 692 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 693 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacosf(<4 x float> [[TMP0]])
 694 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 695 ;
 696 ; NOACCELERATE-LABEL: @int_acos_4x(
 697 ; NOACCELERATE-NEXT:  entry:
 698 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 699 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 700 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT]])
 701 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 702 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 703 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.acos.f32(float [[VECEXT_1]])
 704 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 705 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 706 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.acos.v2f32(<2 x float> [[TMP3]])
 707 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 708 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 709 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 710 ;
 711 entry:
 712   %0 = load <4 x float>, ptr %a, align 16
 713   %vecext = extractelement <4 x float> %0, i32 0
 714   %1 = tail call fast float @llvm.acos.f32(float %vecext)
 715   %vecins = insertelement <4 x float> undef, float %1, i32 0
 716   %vecext.1 = extractelement <4 x float> %0, i32 1
 717   %2 = tail call fast float @llvm.acos.f32(float %vecext.1)
 718   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 719   %vecext.2 = extractelement <4 x float> %0, i32 2
 720   %3 = tail call fast float @llvm.acos.f32(float %vecext.2)
 721   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 722   %vecext.3 = extractelement <4 x float> %0, i32 3
 723   %4 = tail call fast float @llvm.acos.f32(float %vecext.3)
 724   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 725   ret <4 x float> %vecins.3
 726 }
 727 declare float @atanf(float) readonly nounwind willreturn
     ; atan_4x: calls the libm function @atanf on each of the four lanes of the
     ; <4 x float> loaded from %a and rebuilds the result with insertelement.
     ; With -vector-library=Accelerate (prefix CHECK) the expected output is a
     ; single <4 x float> call to @vatanf. Without a vector library (prefix
     ; NOACCELERATE) lanes 0 and 1 are expected to stay scalar @atanf calls while
     ; lanes 2-3 become one <2 x float> @llvm.atan.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 728 define <4 x float> @atan_4x(ptr %a) {
 729 ; CHECK-LABEL: @atan_4x(
 730 ; CHECK-NEXT:  entry:
 731 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 732 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
 733 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 734 ;
 735 ; NOACCELERATE-LABEL: @atan_4x(
 736 ; NOACCELERATE-NEXT:  entry:
 737 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 738 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 739 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @atanf(float [[VECEXT]])
 740 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 741 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 742 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @atanf(float [[VECEXT_1]])
 743 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 744 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 745 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
 746 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 747 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 748 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 749 ;
 750 entry:
 751   %0 = load <4 x float>, ptr %a, align 16
 752   %vecext = extractelement <4 x float> %0, i32 0
 753   %1 = tail call fast float @atanf(float %vecext)
 754   %vecins = insertelement <4 x float> undef, float %1, i32 0
 755   %vecext.1 = extractelement <4 x float> %0, i32 1
 756   %2 = tail call fast float @atanf(float %vecext.1)
 757   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 758   %vecext.2 = extractelement <4 x float> %0, i32 2
 759   %3 = tail call fast float @atanf(float %vecext.2)
 760   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 761   %vecext.3 = extractelement <4 x float> %0, i32 3
 762   %4 = tail call fast float @atanf(float %vecext.3)
 763   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 764   ret <4 x float> %vecins.3
 765 }
 766 define <4 x float> @int_atan_4x(ptr %a) {
     ; int_atan_4x: calls the intrinsic @llvm.atan.f32 on each of the four lanes
     ; of the <4 x float> loaded from %a and rebuilds the result with
     ; insertelement. With -vector-library=Accelerate (prefix CHECK) the expected
     ; output is a single <4 x float> call to @vatanf. Without a vector library
     ; (prefix NOACCELERATE) lanes 0 and 1 are expected to stay scalar
     ; @llvm.atan.f32 calls while lanes 2-3 become one @llvm.atan.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 767 ; CHECK-LABEL: @int_atan_4x(
 768 ; CHECK-NEXT:  entry:
 769 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 770 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanf(<4 x float> [[TMP0]])
 771 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 772 ;
 773 ; NOACCELERATE-LABEL: @int_atan_4x(
 774 ; NOACCELERATE-NEXT:  entry:
 775 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 776 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 777 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT]])
 778 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 779 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 780 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.atan.f32(float [[VECEXT_1]])
 781 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 782 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 783 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.atan.v2f32(<2 x float> [[TMP3]])
 784 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 785 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 786 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 787 ;
 788 entry:
 789   %0 = load <4 x float>, ptr %a, align 16
 790   %vecext = extractelement <4 x float> %0, i32 0
 791   %1 = tail call fast float @llvm.atan.f32(float %vecext)
 792   %vecins = insertelement <4 x float> undef, float %1, i32 0
 793   %vecext.1 = extractelement <4 x float> %0, i32 1
 794   %2 = tail call fast float @llvm.atan.f32(float %vecext.1)
 795   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 796   %vecext.2 = extractelement <4 x float> %0, i32 2
 797   %3 = tail call fast float @llvm.atan.f32(float %vecext.2)
 798   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 799   %vecext.3 = extractelement <4 x float> %0, i32 3
 800   %4 = tail call fast float @llvm.atan.f32(float %vecext.3)
 801   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 802   ret <4 x float> %vecins.3
 803 }
 804 declare float @atan2f(float,float) readonly nounwind willreturn
     ; atan2_4x: two-operand case. Calls the libm function @atan2f on matching
     ; lanes of the <4 x float> vectors loaded from %a and %b and rebuilds the
     ; result with insertelement. With -vector-library=Accelerate (prefix CHECK)
     ; the expected output is a single <4 x float> call to @vatan2f taking both
     ; loaded vectors. Without a vector library (prefix NOACCELERATE) lanes 0 and
     ; 1 are expected to stay scalar @atan2f calls while lanes 2-3 of both
     ; operands are shuffled out and fed to one @llvm.atan2.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 805 define <4 x float> @atan2_4x(ptr %a, ptr %b) {
 806 ; CHECK-LABEL: @atan2_4x(
 807 ; CHECK-NEXT:  entry:
 808 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 809 ; CHECK-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
 810 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
 811 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 812 ;
 813 ; NOACCELERATE-LABEL: @atan2_4x(
 814 ; NOACCELERATE-NEXT:  entry:
 815 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 816 ; NOACCELERATE-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
 817 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 818 ; NOACCELERATE-NEXT:    [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
 819 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]])
 820 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 821 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 822 ; NOACCELERATE-NEXT:    [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
 823 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
 824 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 825 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 826 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 827 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
 828 ; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 829 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 830 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
 831 ;
 832 entry:
 833   %0 = load <4 x float>, ptr %a, align 16
 834   %bb = load <4 x float>, ptr %b, align 16
 835   %vecext = extractelement <4 x float> %0, i32 0
 836   %vecextb = extractelement <4 x float> %bb, i32 0
 837   %1 = tail call fast float @atan2f(float %vecext, float %vecextb)
 838   %vecins = insertelement <4 x float> undef, float %1, i32 0
 839   %vecext.1 = extractelement <4 x float> %0, i32 1
 840   %vecextb.1 = extractelement <4 x float> %bb, i32 1
 841   %2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1)
 842   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 843   %vecext.2 = extractelement <4 x float> %0, i32 2
 844   %vecextb.2 = extractelement <4 x float> %bb, i32 2
 845   %3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2)
 846   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 847   %vecext.3 = extractelement <4 x float> %0, i32 3
 848   %vecextb.3 = extractelement <4 x float> %bb, i32 3
 849   %4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3)
 850   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 851   ret <4 x float> %vecins.3
 852 }
 853 define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
     ; int_atan2_4x: two-operand case. Calls the intrinsic @llvm.atan2.f32 on
     ; matching lanes of the <4 x float> vectors loaded from %a and %b and
     ; rebuilds the result with insertelement. With -vector-library=Accelerate
     ; (prefix CHECK) the expected output is a single <4 x float> call to
     ; @vatan2f taking both loaded vectors. Without a vector library (prefix
     ; NOACCELERATE) lanes 0 and 1 are expected to stay scalar @llvm.atan2.f32
     ; calls while lanes 2-3 of both operands are shuffled out and fed to one
     ; @llvm.atan2.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 854 ; CHECK-LABEL: @int_atan2_4x(
 855 ; CHECK-NEXT:  entry:
 856 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 857 ; CHECK-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
 858 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
 859 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 860 ;
 861 ; NOACCELERATE-LABEL: @int_atan2_4x(
 862 ; NOACCELERATE-NEXT:  entry:
 863 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 864 ; NOACCELERATE-NEXT:    [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
 865 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 866 ; NOACCELERATE-NEXT:    [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
 867 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]])
 868 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 869 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 870 ; NOACCELERATE-NEXT:    [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
 871 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
 872 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 873 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 874 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 875 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
 876 ; NOACCELERATE-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 877 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 878 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 879 ;
 880 entry:
 881   %0 = load <4 x float>, ptr %a, align 16
 882   %bb = load <4 x float>, ptr %b, align 16
 883   %vecext = extractelement <4 x float> %0, i32 0
 884   %vecextb = extractelement <4 x float> %bb, i32 0
 885   %1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb)
 886   %vecins = insertelement <4 x float> undef, float %1, i32 0
 887   %vecext.1 = extractelement <4 x float> %0, i32 1
 888   %vecextb.1 = extractelement <4 x float> %bb, i32 1
 889   %2 = tail call fast float @llvm.atan2.f32(float %vecext.1, float %vecextb.1)
 890   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 891   %vecext.2 = extractelement <4 x float> %0, i32 2
 892   %vecextb.2 = extractelement <4 x float> %bb, i32 2
 893   %3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2)
 894   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 895   %vecext.3 = extractelement <4 x float> %0, i32 3
 896   %vecextb.3 = extractelement <4 x float> %bb, i32 3
 897   %4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3)
 898   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 899   ret <4 x float> %vecins.3
 900 }
 901 declare float @sinhf(float) readonly nounwind willreturn
     ; sinh_4x: calls the libm function @sinhf on each of the four lanes of the
     ; <4 x float> loaded from %a and rebuilds the result with insertelement.
     ; With -vector-library=Accelerate (prefix CHECK) the expected output is a
     ; single <4 x float> call to @vsinhf. Without a vector library (prefix
     ; NOACCELERATE) lanes 0 and 1 are expected to stay scalar @sinhf calls while
     ; lanes 2-3 become one <2 x float> @llvm.sinh.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 902 define <4 x float> @sinh_4x(ptr %a) {
 903 ; CHECK-LABEL: @sinh_4x(
 904 ; CHECK-NEXT:  entry:
 905 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 906 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
 907 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 908 ;
 909 ; NOACCELERATE-LABEL: @sinh_4x(
 910 ; NOACCELERATE-NEXT:  entry:
 911 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 912 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 913 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @sinhf(float [[VECEXT]])
 914 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 915 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 916 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @sinhf(float [[VECEXT_1]])
 917 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 918 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 919 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
 920 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 921 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 922 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 923 ;
 924 entry:
 925   %0 = load <4 x float>, ptr %a, align 16
 926   %vecext = extractelement <4 x float> %0, i32 0
 927   %1 = tail call fast float @sinhf(float %vecext)
 928   %vecins = insertelement <4 x float> undef, float %1, i32 0
 929   %vecext.1 = extractelement <4 x float> %0, i32 1
 930   %2 = tail call fast float @sinhf(float %vecext.1)
 931   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 932   %vecext.2 = extractelement <4 x float> %0, i32 2
 933   %3 = tail call fast float @sinhf(float %vecext.2)
 934   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 935   %vecext.3 = extractelement <4 x float> %0, i32 3
 936   %4 = tail call fast float @sinhf(float %vecext.3)
 937   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 938   ret <4 x float> %vecins.3
 939 }
 940 define <4 x float> @int_sinh_4x(ptr %a) {
     ; int_sinh_4x: calls the intrinsic @llvm.sinh.f32 on each of the four lanes
     ; of the <4 x float> loaded from %a and rebuilds the result with
     ; insertelement. With -vector-library=Accelerate (prefix CHECK) the expected
     ; output is a single <4 x float> call to @vsinhf. Without a vector library
     ; (prefix NOACCELERATE) lanes 0 and 1 are expected to stay scalar
     ; @llvm.sinh.f32 calls while lanes 2-3 become one @llvm.sinh.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 941 ; CHECK-LABEL: @int_sinh_4x(
 942 ; CHECK-NEXT:  entry:
 943 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 944 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinhf(<4 x float> [[TMP0]])
 945 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 946 ;
 947 ; NOACCELERATE-LABEL: @int_sinh_4x(
 948 ; NOACCELERATE-NEXT:  entry:
 949 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 950 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 951 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT]])
 952 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 953 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 954 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sinh.f32(float [[VECEXT_1]])
 955 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 956 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 957 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sinh.v2f32(<2 x float> [[TMP3]])
 958 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 959 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 960 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
 961 ;
 962 entry:
 963   %0 = load <4 x float>, ptr %a, align 16
 964   %vecext = extractelement <4 x float> %0, i32 0
 965   %1 = tail call fast float @llvm.sinh.f32(float %vecext)
 966   %vecins = insertelement <4 x float> undef, float %1, i32 0
 967   %vecext.1 = extractelement <4 x float> %0, i32 1
 968   %2 = tail call fast float @llvm.sinh.f32(float %vecext.1)
 969   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
 970   %vecext.2 = extractelement <4 x float> %0, i32 2
 971   %3 = tail call fast float @llvm.sinh.f32(float %vecext.2)
 972   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
 973   %vecext.3 = extractelement <4 x float> %0, i32 3
 974   %4 = tail call fast float @llvm.sinh.f32(float %vecext.3)
 975   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
 976   ret <4 x float> %vecins.3
 977 }
 978 declare float @coshf(float) readonly nounwind willreturn
     ; cosh_4x: calls the libm function @coshf on each of the four lanes of the
     ; <4 x float> loaded from %a and rebuilds the result with insertelement.
     ; With -vector-library=Accelerate (prefix CHECK) the expected output is a
     ; single <4 x float> call to @vcoshf. Without a vector library (prefix
     ; NOACCELERATE) lanes 0 and 1 are expected to stay scalar @coshf calls while
     ; lanes 2-3 become one <2 x float> @llvm.cosh.v2f32 call.
     ; The assertions below are autogenerated by utils/update_test_checks.py;
     ; regenerate them with that script instead of editing them by hand.
 979 define <4 x float> @cosh_4x(ptr %a) {
 980 ; CHECK-LABEL: @cosh_4x(
 981 ; CHECK-NEXT:  entry:
 982 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 983 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
 984 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 985 ;
 986 ; NOACCELERATE-LABEL: @cosh_4x(
 987 ; NOACCELERATE-NEXT:  entry:
 988 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
 989 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
 990 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @coshf(float [[VECEXT]])
 991 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
 992 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 993 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @coshf(float [[VECEXT_1]])
 994 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
 995 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 996 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
 997 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 998 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 999 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1000 ;
1001 entry:
1002   %0 = load <4 x float>, ptr %a, align 16
1003   %vecext = extractelement <4 x float> %0, i32 0
1004   %1 = tail call fast float @coshf(float %vecext)
1005   %vecins = insertelement <4 x float> undef, float %1, i32 0
1006   %vecext.1 = extractelement <4 x float> %0, i32 1
1007   %2 = tail call fast float @coshf(float %vecext.1)
1008   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1009   %vecext.2 = extractelement <4 x float> %0, i32 2
1010   %3 = tail call fast float @coshf(float %vecext.2)
1011   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1012   %vecext.3 = extractelement <4 x float> %0, i32 3
1013   %4 = tail call fast float @coshf(float %vecext.3)
1014   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1015   ret <4 x float> %vecins.3
1016 }
1017 define <4 x float> @int_cosh_4x(ptr %a) { ; llvm.cosh.f32 intrinsic on each lane: with Accelerate expect one @vcoshf call; without it lanes 0-1 stay scalar and lanes 2-3 become @llvm.cosh.v2f32.
1018 ; CHECK-LABEL: @int_cosh_4x(
1019 ; CHECK-NEXT:  entry:
1020 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1021 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcoshf(<4 x float> [[TMP0]])
1022 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1023 ;
1024 ; NOACCELERATE-LABEL: @int_cosh_4x(
1025 ; NOACCELERATE-NEXT:  entry:
1026 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1027 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1028 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT]])
1029 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1030 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1031 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cosh.f32(float [[VECEXT_1]])
1032 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1033 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1034 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cosh.v2f32(<2 x float> [[TMP3]])
1035 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1036 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1037 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1038 ;
1039 entry:
1040   %0 = load <4 x float>, ptr %a, align 16
1041   %vecext = extractelement <4 x float> %0, i32 0
1042   %1 = tail call fast float @llvm.cosh.f32(float %vecext)
1043   %vecins = insertelement <4 x float> undef, float %1, i32 0
1044   %vecext.1 = extractelement <4 x float> %0, i32 1
1045   %2 = tail call fast float @llvm.cosh.f32(float %vecext.1)
1046   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1047   %vecext.2 = extractelement <4 x float> %0, i32 2
1048   %3 = tail call fast float @llvm.cosh.f32(float %vecext.2)
1049   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1050   %vecext.3 = extractelement <4 x float> %0, i32 3
1051   %4 = tail call fast float @llvm.cosh.f32(float %vecext.3)
1052   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1053   ret <4 x float> %vecins.3
1054 }
1055 declare float @tanhf(float) readonly nounwind willreturn
1056 define <4 x float> @tanh_4x(ptr %a) { ; tanhf libcall on each lane: with Accelerate expect one @vtanhf call; without it lanes 0-1 stay scalar and lanes 2-3 become @llvm.tanh.v2f32.
1057 ; CHECK-LABEL: @tanh_4x(
1058 ; CHECK-NEXT:  entry:
1059 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1060 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
1061 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1062 ;
1063 ; NOACCELERATE-LABEL: @tanh_4x(
1064 ; NOACCELERATE-NEXT:  entry:
1065 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1066 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1067 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @tanhf(float [[VECEXT]])
1068 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1069 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1070 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @tanhf(float [[VECEXT_1]])
1071 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1072 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1073 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
1074 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1075 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1076 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1077 ;
1078 entry:
1079   %0 = load <4 x float>, ptr %a, align 16
1080   %vecext = extractelement <4 x float> %0, i32 0
1081   %1 = tail call fast float @tanhf(float %vecext)
1082   %vecins = insertelement <4 x float> undef, float %1, i32 0
1083   %vecext.1 = extractelement <4 x float> %0, i32 1
1084   %2 = tail call fast float @tanhf(float %vecext.1)
1085   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1086   %vecext.2 = extractelement <4 x float> %0, i32 2
1087   %3 = tail call fast float @tanhf(float %vecext.2)
1088   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1089   %vecext.3 = extractelement <4 x float> %0, i32 3
1090   %4 = tail call fast float @tanhf(float %vecext.3)
1091   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1092   ret <4 x float> %vecins.3
1093 }
1094 define <4 x float> @int_tanh_4x(ptr %a) { ; llvm.tanh.f32 intrinsic on each lane: with Accelerate expect one @vtanhf call; without it lanes 0-1 stay scalar and lanes 2-3 become @llvm.tanh.v2f32.
1095 ; CHECK-LABEL: @int_tanh_4x(
1096 ; CHECK-NEXT:  entry:
1097 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1098 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vtanhf(<4 x float> [[TMP0]])
1099 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1100 ;
1101 ; NOACCELERATE-LABEL: @int_tanh_4x(
1102 ; NOACCELERATE-NEXT:  entry:
1103 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1104 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1105 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT]])
1106 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1107 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1108 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.tanh.f32(float [[VECEXT_1]])
1109 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1110 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1111 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.tanh.v2f32(<2 x float> [[TMP3]])
1112 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1113 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1114 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1115 ;
1116 entry:
1117   %0 = load <4 x float>, ptr %a, align 16
1118   %vecext = extractelement <4 x float> %0, i32 0
1119   %1 = tail call fast float @llvm.tanh.f32(float %vecext)
1120   %vecins = insertelement <4 x float> undef, float %1, i32 0
1121   %vecext.1 = extractelement <4 x float> %0, i32 1
1122   %2 = tail call fast float @llvm.tanh.f32(float %vecext.1)
1123   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1124   %vecext.2 = extractelement <4 x float> %0, i32 2
1125   %3 = tail call fast float @llvm.tanh.f32(float %vecext.2)
1126   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1127   %vecext.3 = extractelement <4 x float> %0, i32 3
1128   %4 = tail call fast float @llvm.tanh.f32(float %vecext.3)
1129   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1130   ret <4 x float> %vecins.3
1131 }
1132 declare float @asinhf(float) readonly nounwind willreturn
1133 define <4 x float> @asinh_4x(ptr %a) { ; asinhf libcall on each lane: with Accelerate expect one @vasinhf call; without it all four scalar @asinhf calls are expected to remain.
1134 ; CHECK-LABEL: @asinh_4x(
1135 ; CHECK-NEXT:  entry:
1136 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1137 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vasinhf(<4 x float> [[TMP0]])
1138 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1139 ;
1140 ; NOACCELERATE-LABEL: @asinh_4x(
1141 ; NOACCELERATE-NEXT:  entry:
1142 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1143 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1144 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @asinhf(float [[VECEXT]])
1145 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1146 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1147 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @asinhf(float [[VECEXT_1]])
1148 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1149 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1150 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @asinhf(float [[VECEXT_2]])
1151 ; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1152 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1153 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @asinhf(float [[VECEXT_3]])
1154 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1155 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
1156 ;
1157 entry:
1158   %0 = load <4 x float>, ptr %a, align 16
1159   %vecext = extractelement <4 x float> %0, i32 0
1160   %1 = tail call fast float @asinhf(float %vecext)
1161   %vecins = insertelement <4 x float> undef, float %1, i32 0
1162   %vecext.1 = extractelement <4 x float> %0, i32 1
1163   %2 = tail call fast float @asinhf(float %vecext.1)
1164   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1165   %vecext.2 = extractelement <4 x float> %0, i32 2
1166   %3 = tail call fast float @asinhf(float %vecext.2)
1167   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1168   %vecext.3 = extractelement <4 x float> %0, i32 3
1169   %4 = tail call fast float @asinhf(float %vecext.3)
1170   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1171   ret <4 x float> %vecins.3
1172 }
1173 declare float @acoshf(float) readonly nounwind willreturn
1174 define <4 x float> @acosh_4x(ptr %a) { ; acoshf libcall on each lane: with Accelerate expect one @vacoshf call; without it all four scalar @acoshf calls are expected to remain.
1175 ; CHECK-LABEL: @acosh_4x(
1176 ; CHECK-NEXT:  entry:
1177 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1178 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vacoshf(<4 x float> [[TMP0]])
1179 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1180 ;
1181 ; NOACCELERATE-LABEL: @acosh_4x(
1182 ; NOACCELERATE-NEXT:  entry:
1183 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1184 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1185 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @acoshf(float [[VECEXT]])
1186 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1187 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1188 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @acoshf(float [[VECEXT_1]])
1189 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1190 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1191 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @acoshf(float [[VECEXT_2]])
1192 ; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1193 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1194 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @acoshf(float [[VECEXT_3]])
1195 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1196 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
1197 ;
1198 entry:
1199   %0 = load <4 x float>, ptr %a, align 16
1200   %vecext = extractelement <4 x float> %0, i32 0
1201   %1 = tail call fast float @acoshf(float %vecext)
1202   %vecins = insertelement <4 x float> undef, float %1, i32 0
1203   %vecext.1 = extractelement <4 x float> %0, i32 1
1204   %2 = tail call fast float @acoshf(float %vecext.1)
1205   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1206   %vecext.2 = extractelement <4 x float> %0, i32 2
1207   %3 = tail call fast float @acoshf(float %vecext.2)
1208   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1209   %vecext.3 = extractelement <4 x float> %0, i32 3
1210   %4 = tail call fast float @acoshf(float %vecext.3)
1211   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1212   ret <4 x float> %vecins.3
1213 }
1214 declare float @atanhf(float) readonly nounwind willreturn
1215 define <4 x float> @atanh_4x(ptr %a) { ; atanhf libcall on each lane: with Accelerate expect one @vatanhf call; without it all four scalar @atanhf calls are expected to remain.
1216 ; CHECK-LABEL: @atanh_4x(
1217 ; CHECK-NEXT:  entry:
1218 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1219 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vatanhf(<4 x float> [[TMP0]])
1220 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1221 ;
1222 ; NOACCELERATE-LABEL: @atanh_4x(
1223 ; NOACCELERATE-NEXT:  entry:
1224 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1225 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1226 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @atanhf(float [[VECEXT]])
1227 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1228 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1229 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @atanhf(float [[VECEXT_1]])
1230 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1231 ; NOACCELERATE-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
1232 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = tail call fast float @atanhf(float [[VECEXT_2]])
1233 ; NOACCELERATE-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
1234 ; NOACCELERATE-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
1235 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = tail call fast float @atanhf(float [[VECEXT_3]])
1236 ; NOACCELERATE-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
1237 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_3]]
1238 ;
1239 entry:
1240   %0 = load <4 x float>, ptr %a, align 16
1241   %vecext = extractelement <4 x float> %0, i32 0
1242   %1 = tail call fast float @atanhf(float %vecext)
1243   %vecins = insertelement <4 x float> undef, float %1, i32 0
1244   %vecext.1 = extractelement <4 x float> %0, i32 1
1245   %2 = tail call fast float @atanhf(float %vecext.1)
1246   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1247   %vecext.2 = extractelement <4 x float> %0, i32 2
1248   %3 = tail call fast float @atanhf(float %vecext.2)
1249   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1250   %vecext.3 = extractelement <4 x float> %0, i32 3
1251   %4 = tail call fast float @atanhf(float %vecext.3)
1252   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1253   ret <4 x float> %vecins.3
1254 }
1255
1256 ; Accelerate *does not* provide sin() for <2 x float>, so both runs expect the two scalar @llvm.sin.f32 calls to be left in place.
1257 define <2 x float> @sin_2x(ptr %a) {
1258 ; CHECK-LABEL: @sin_2x(
1259 ; CHECK-NEXT:  entry:
1260 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1261 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1262 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #[[ATTR2:[0-9]+]]
1263 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1264 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1265 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #[[ATTR2]]
1266 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1267 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
1268 ;
1269 ; NOACCELERATE-LABEL: @sin_2x(
1270 ; NOACCELERATE-NEXT:  entry:
1271 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1272 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1273 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
1274 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1275 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1276 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
1277 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1278 ; NOACCELERATE-NEXT:    ret <2 x float> [[VECINS_1]]
1279 ;
1280 entry:
1281   %0 = load <2 x float>, ptr %a, align 16
1282   %vecext = extractelement <2 x float> %0, i32 0
1283   %1 = tail call fast float @llvm.sin.f32(float %vecext)
1284   %vecins = insertelement <2 x float> undef, float %1, i32 0
1285   %vecext.1 = extractelement <2 x float> %0, i32 1
1286   %2 = tail call fast float @llvm.sin.f32(float %vecext.1)
1287   %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
1288   ret <2 x float> %vecins.1
1289 }
1290
1291
1292 declare float @llvm.cos.f32(float)
1293
1294 ; Accelerate provides cos() for <4 x float>: expect one @vcosf call. Without Accelerate, lanes 0-1 stay scalar and lanes 2-3 become @llvm.cos.v2f32.
1295 define <4 x float> @int_cos_4x(ptr %a) {
1296 ; CHECK-LABEL: @int_cos_4x(
1297 ; CHECK-NEXT:  entry:
1298 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1299 ; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
1300 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
1301 ;
1302 ; NOACCELERATE-LABEL: @int_cos_4x(
1303 ; NOACCELERATE-NEXT:  entry:
1304 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
1305 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
1306 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1307 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
1308 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
1309 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1310 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
1311 ; NOACCELERATE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
1312 ; NOACCELERATE-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]])
1313 ; NOACCELERATE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
1314 ; NOACCELERATE-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1315 ; NOACCELERATE-NEXT:    ret <4 x float> [[VECINS_31]]
1316 ;
1317 entry:
1318   %0 = load <4 x float>, ptr %a, align 16
1319   %vecext = extractelement <4 x float> %0, i32 0
1320   %1 = tail call fast float @llvm.cos.f32(float %vecext)
1321   %vecins = insertelement <4 x float> undef, float %1, i32 0
1322   %vecext.1 = extractelement <4 x float> %0, i32 1
1323   %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1324   %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
1325   %vecext.2 = extractelement <4 x float> %0, i32 2
1326   %3 = tail call fast float @llvm.cos.f32(float %vecext.2)
1327   %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
1328   %vecext.3 = extractelement <4 x float> %0, i32 3
1329   %4 = tail call fast float @llvm.cos.f32(float %vecext.3)
1330   %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
1331   ret <4 x float> %vecins.3
1332 }
1333
1334 ; Accelerate *does not* provide cos() for <2 x float>, so both runs expect the two scalar @llvm.cos.f32 calls to be left in place.
1335 define <2 x float> @cos_2x(ptr %a) {
1336 ; CHECK-LABEL: @cos_2x(
1337 ; CHECK-NEXT:  entry:
1338 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1339 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1340 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #[[ATTR3:[0-9]+]]
1341 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1342 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1343 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #[[ATTR3]]
1344 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1345 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
1346 ;
1347 ; NOACCELERATE-LABEL: @cos_2x(
1348 ; NOACCELERATE-NEXT:  entry:
1349 ; NOACCELERATE-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 16
1350 ; NOACCELERATE-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
1351 ; NOACCELERATE-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
1352 ; NOACCELERATE-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
1353 ; NOACCELERATE-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
1354 ; NOACCELERATE-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
1355 ; NOACCELERATE-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
1356 ; NOACCELERATE-NEXT:    ret <2 x float> [[VECINS_1]]
1357 ;
1358 entry:
1359   %0 = load <2 x float>, ptr %a, align 16
1360   %vecext = extractelement <2 x float> %0, i32 0
1361   %1 = tail call fast float @llvm.cos.f32(float %vecext)
1362   %vecins = insertelement <2 x float> undef, float %1, i32 0
1363   %vecext.1 = extractelement <2 x float> %0, i32 1
1364   %2 = tail call fast float @llvm.cos.f32(float %vecext.1)
1365   %vecins.1 = insertelement <2 x float> %vecins, float %2, i32 1
1366   ret <2 x float> %vecins.1
1367 }