llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -passes='default<O3>' -enable-matrix -S %s | FileCheck %s
   3
   4 target triple = "arm64-apple-ios"
   5
   6 define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, ptr nonnull align 8 dereferenceable(1800) %A, ptr nonnull align 8 dereferenceable(1800) %B) #0 {
     ; Test input: with idx = %j * 15 + %k, performs
     ;   B[idx] = B[idx] - A[idx] * B[%j * 15 + %i]
     ; on two <225 x double> vectors, using whole-vector loads, extractelement /
     ; insertelement and a whole-vector store. The arguments are first spilled to
     ; allocas and reloaded before each use.
     ; The autogenerated assertions below expect the O3 pipeline to reduce this to
     ; three scalar `load double` instructions and one scalar `store double`, each
     ; addressed through a getelementptr, with the shared `%j * 15` product
     ; computed once and only two llvm.assume calls remaining.
     ; NOTE(review): 225 = 15 * 15 and the RUN line passes -enable-matrix, so this
     ; is presumably a column-major 15x15 matrix access -- confirm against the
     ; original source the IR was generated from.
   7 ; CHECK-LABEL: @matrix_extract_insert_scalar(
   8 ; CHECK-NEXT:  entry:
   9 ; CHECK-NEXT:    [[CONV:%.*]] = zext i32 [[K:%.*]] to i64
  10 ; CHECK-NEXT:    [[CONV1:%.*]] = zext i32 [[J:%.*]] to i64
  11 ; CHECK-NEXT:    [[TMP0:%.*]] = mul nuw nsw i64 [[CONV1]], 15
  12 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV]]
  13 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 225
  14 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
  15 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[TMP1]]
  16 ; CHECK-NEXT:    [[MATRIXEXT:%.*]] = load double, ptr [[TMP3]], align 8
  17 ; CHECK-NEXT:    [[CONV2:%.*]] = zext i32 [[I:%.*]] to i64
  18 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV2]]
  19 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 225
  20 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP5]])
  21 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <225 x double>, ptr [[B:%.*]], i64 0, i64 [[TMP4]]
  22 ; CHECK-NEXT:    [[MATRIXEXT4:%.*]] = load double, ptr [[TMP6]], align 8
  23 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]]
  24 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP1]]
  25 ; CHECK-NEXT:    [[MATRIXEXT7:%.*]] = load double, ptr [[TMP7]], align 8
  26 ; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
  27 ; CHECK-NEXT:    store double [[SUB]], ptr [[TMP7]], align 8
  28 ; CHECK-NEXT:    ret void
  29 ;
  30 entry:
  31   %i.addr = alloca i32, align 4
  32   %k.addr = alloca i32, align 4
  33   %j.addr = alloca i32, align 4
  34   %A.addr = alloca ptr, align 8
  35   %B.addr = alloca ptr, align 8
  36   store i32 %i, ptr %i.addr, align 4
  37   store i32 %k, ptr %k.addr, align 4
  38   store i32 %j, ptr %j.addr, align 4
  39   store ptr %A, ptr %A.addr, align 8
  40   store ptr %B, ptr %B.addr, align 8
     ; idx = zext(%j) * 15 + zext(%k), asserted to be below 225; then the whole
     ; vector behind %A is loaded and element idx is extracted.
  41   %0 = load i32, ptr %k.addr, align 4
  42   %conv = zext i32 %0 to i64
  43   %1 = load i32, ptr %j.addr, align 4
  44   %conv1 = zext i32 %1 to i64
  45   %2 = mul i64 %conv1, 15
  46   %3 = add i64 %2, %conv
  47   %4 = icmp ult i64 %3, 225
  48   call void @llvm.assume(i1 %4)
  49   %5 = load ptr, ptr %A.addr, align 8
  50   %6 = load <225 x double>, ptr %5, align 8
  51   %matrixext = extractelement <225 x double> %6, i64 %3
     ; Second index: zext(%j) * 15 + zext(%i), asserted to be below 225; used to
     ; extract one element from the vector behind %B.
  52   %7 = load i32, ptr %i.addr, align 4
  53   %conv2 = zext i32 %7 to i64
  54   %8 = load i32, ptr %j.addr, align 4
  55   %conv3 = zext i32 %8 to i64
  56   %9 = mul i64 %conv3, 15
  57   %10 = add i64 %9, %conv2
  58   %11 = icmp ult i64 %10, 225
  59   call void @llvm.assume(i1 %11)
  60   %12 = load ptr, ptr %B.addr, align 8
  61   %13 = load <225 x double>, ptr %12, align 8
  62   %matrixext4 = extractelement <225 x double> %13, i64 %10
  63   %mul = fmul double %matrixext, %matrixext4
     ; idx is recomputed from the allocas (same expression as %3) for the
     ; read-modify-write of the vector behind %B.
  64   %14 = load ptr, ptr %B.addr, align 8
  65   %15 = load i32, ptr %k.addr, align 4
  66   %conv5 = zext i32 %15 to i64
  67   %16 = load i32, ptr %j.addr, align 4
  68   %conv6 = zext i32 %16 to i64
  69   %17 = mul i64 %conv6, 15
  70   %18 = add i64 %17, %conv5
  71   %19 = icmp ult i64 %18, 225
  72   call void @llvm.assume(i1 %19)
  73   %20 = load <225 x double>, ptr %14, align 8
  74   %matrixext7 = extractelement <225 x double> %20, i64 %18
  75   %sub = fsub double %matrixext7, %mul
     ; The bound on %18 is asserted a second time; the vector is then reloaded,
     ; the new element inserted, and the whole vector stored back.
  76   %21 = icmp ult i64 %18, 225
  77   call void @llvm.assume(i1 %21)
  78   %22 = load <225 x double>, ptr %14, align 8
  79   %matins = insertelement <225 x double> %22, double %sub, i64 %18
  80   store <225 x double> %matins, ptr %14, align 8
  81   ret void
  82 }
  83 define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferenceable(1800) %A, ptr nonnull align 8 dereferenceable(1800) %B) {
     ; Test input: a two-level loop nest, j in [0, 4) and k in [0, %i), whose body
     ; performs, with idx = j * 15 + k,
     ;   B[idx] = B[idx] - A[idx] * B[j * 15 + %i]
     ; through whole-vector <225 x double> loads, extractelement / insertelement
     ; and a whole-vector store. All variables (including j and k) live in allocas.
     ; The autogenerated assertions below expect O3 to skip the nest when %i is 0,
     ; to unroll the outer loop into four inner loops (element offsets 0, 15, 30
     ; and 45), and to leave only scalar double loads and stores in each of them,
     ; with the address of B[j * 15 + %i] computed before each inner loop.
  84 ; CHECK-LABEL: @matrix_extract_insert_loop(
  85 ; CHECK-NEXT:  entry:
  86 ; CHECK-NEXT:    [[CMP210_NOT:%.*]] = icmp eq i32 [[I:%.*]], 0
  87 ; CHECK-NEXT:    [[CONV6:%.*]] = zext i32 [[I]] to i64
  88 ; CHECK-NEXT:    br i1 [[CMP210_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]]
  89 ; CHECK:       for.cond1.preheader.us:
  90 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[I]], 225
  91 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP0]])
  92 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <225 x double>, ptr [[B:%.*]], i64 0, i64 [[CONV6]]
  93 ; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
  94 ; CHECK:       for.body4.us:
  95 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ]
  96 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 225
  97 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
  98 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[INDVARS_IV]]
  99 ; CHECK-NEXT:    [[MATRIXEXT_US:%.*]] = load double, ptr [[TMP3]], align 8
 100 ; CHECK-NEXT:    [[MATRIXEXT8_US:%.*]] = load double, ptr [[TMP1]], align 8
 101 ; CHECK-NEXT:    [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]]
 102 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[INDVARS_IV]]
 103 ; CHECK-NEXT:    [[MATRIXEXT11_US:%.*]] = load double, ptr [[TMP4]], align 8
 104 ; CHECK-NEXT:    [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]]
 105 ; CHECK-NEXT:    store double [[SUB_US]], ptr [[TMP4]], align 8
 106 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 107 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[CONV6]]
 108 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]], label [[FOR_BODY4_US]]
 109 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us:
 110 ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[CONV6]], 15
 111 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[I]], 210
 112 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP6]])
 113 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP5]]
 114 ; CHECK-NEXT:    br label [[FOR_BODY4_US_1:%.*]]
 115 ; CHECK:       for.body4.us.1:
 116 ; CHECK-NEXT:    [[INDVARS_IV_1:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY4_US_1]] ]
 117 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV_1]], 15
 118 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i64 [[INDVARS_IV_1]], 210
 119 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP9]])
 120 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP8]]
 121 ; CHECK-NEXT:    [[MATRIXEXT_US_1:%.*]] = load double, ptr [[TMP10]], align 8
 122 ; CHECK-NEXT:    [[MATRIXEXT8_US_1:%.*]] = load double, ptr [[TMP7]], align 8
 123 ; CHECK-NEXT:    [[MUL_US_1:%.*]] = fmul double [[MATRIXEXT_US_1]], [[MATRIXEXT8_US_1]]
 124 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP8]]
 125 ; CHECK-NEXT:    [[MATRIXEXT11_US_1:%.*]] = load double, ptr [[TMP11]], align 8
 126 ; CHECK-NEXT:    [[SUB_US_1:%.*]] = fsub double [[MATRIXEXT11_US_1]], [[MUL_US_1]]
 127 ; CHECK-NEXT:    store double [[SUB_US_1]], ptr [[TMP11]], align 8
 128 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV_1]], 1
 129 ; CHECK-NEXT:    [[EXITCOND_NOT_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], [[CONV6]]
 130 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]], label [[FOR_BODY4_US_1]]
 131 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.1:
 132 ; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[CONV6]], 30
 133 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ult i32 [[I]], 195
 134 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP13]])
 135 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP12]]
 136 ; CHECK-NEXT:    br label [[FOR_BODY4_US_2:%.*]]
 137 ; CHECK:       for.body4.us.2:
 138 ; CHECK-NEXT:    [[INDVARS_IV_2:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INDVARS_IV_NEXT_2:%.*]], [[FOR_BODY4_US_2]] ]
 139 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV_2]], 30
 140 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp ult i64 [[INDVARS_IV_2]], 195
 141 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP16]])
 142 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP15]]
 143 ; CHECK-NEXT:    [[MATRIXEXT_US_2:%.*]] = load double, ptr [[TMP17]], align 8
 144 ; CHECK-NEXT:    [[MATRIXEXT8_US_2:%.*]] = load double, ptr [[TMP14]], align 8
 145 ; CHECK-NEXT:    [[MUL_US_2:%.*]] = fmul double [[MATRIXEXT_US_2]], [[MATRIXEXT8_US_2]]
 146 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP15]]
 147 ; CHECK-NEXT:    [[MATRIXEXT11_US_2:%.*]] = load double, ptr [[TMP18]], align 8
 148 ; CHECK-NEXT:    [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]]
 149 ; CHECK-NEXT:    store double [[SUB_US_2]], ptr [[TMP18]], align 8
 150 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_2]] = add nuw nsw i64 [[INDVARS_IV_2]], 1
 151 ; CHECK-NEXT:    [[EXITCOND_NOT_2:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], [[CONV6]]
 152 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]], label [[FOR_BODY4_US_2]]
 153 ; CHECK:       for.cond1.for.cond.cleanup3_crit_edge.us.2:
 154 ; CHECK-NEXT:    [[TMP19:%.*]] = add nuw nsw i64 [[CONV6]], 45
 155 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp ult i32 [[I]], 180
 156 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP20]])
 157 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP19]]
 158 ; CHECK-NEXT:    br label [[FOR_BODY4_US_3:%.*]]
 159 ; CHECK:       for.body4.us.3:
 160 ; CHECK-NEXT:    [[INDVARS_IV_3:%.*]] = phi i64 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY4_US_3]] ]
 161 ; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV_3]], 45
 162 ; CHECK-NEXT:    [[TMP23:%.*]] = icmp ult i64 [[INDVARS_IV_3]], 180
 163 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP23]])
 164 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP22]]
 165 ; CHECK-NEXT:    [[MATRIXEXT_US_3:%.*]] = load double, ptr [[TMP24]], align 8
 166 ; CHECK-NEXT:    [[MATRIXEXT8_US_3:%.*]] = load double, ptr [[TMP21]], align 8
 167 ; CHECK-NEXT:    [[MUL_US_3:%.*]] = fmul double [[MATRIXEXT_US_3]], [[MATRIXEXT8_US_3]]
 168 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP22]]
 169 ; CHECK-NEXT:    [[MATRIXEXT11_US_3:%.*]] = load double, ptr [[TMP25]], align 8
 170 ; CHECK-NEXT:    [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]]
 171 ; CHECK-NEXT:    store double [[SUB_US_3]], ptr [[TMP25]], align 8
 172 ; CHECK-NEXT:    [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV_3]], 1
 173 ; CHECK-NEXT:    [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], [[CONV6]]
 174 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_3]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY4_US_3]]
 175 ; CHECK:       for.cond.cleanup:
 176 ; CHECK-NEXT:    ret void
 177 ;
 178 entry:
 179   %i.addr = alloca i32, align 4
 180   %A.addr = alloca ptr, align 8
 181   %B.addr = alloca ptr, align 8
 182   %j = alloca i32, align 4
 183   %cleanup.dest.slot = alloca i32, align 4
 184   %k = alloca i32, align 4
 185   store i32 %i, ptr %i.addr, align 4
 186   store ptr %A, ptr %A.addr, align 8
 187   store ptr %B, ptr %B.addr, align 8
 188   call void @llvm.lifetime.start.p0(i64 4, ptr %j) #3
 189   store i32 0, ptr %j, align 4
 190   br label %for.cond
 191
     ; Outer loop header: continue while j < 4 (unsigned compare).
 192 for.cond:                                         ; preds = %for.inc12, %entry
 193   %0 = load i32, ptr %j, align 4
 194   %cmp = icmp ult i32 %0, 4
 195   br i1 %cmp, label %for.body, label %for.cond.cleanup
 196
 197 for.cond.cleanup:                                 ; preds = %for.cond
 198   store i32 2, ptr %cleanup.dest.slot, align 4
 199   call void @llvm.lifetime.end.p0(i64 4, ptr %j) #3
 200   br label %for.end14
 201
 202 for.body:                                         ; preds = %for.cond
 203   call void @llvm.lifetime.start.p0(i64 4, ptr %k) #3
 204   store i32 0, ptr %k, align 4
 205   br label %for.cond1
 206
     ; Inner loop header: continue while k < %i (unsigned compare; %i is reloaded
     ; from its alloca on every iteration).
 207 for.cond1:                                        ; preds = %for.inc, %for.body
 208   %1 = load i32, ptr %k, align 4
 209   %2 = load i32, ptr %i.addr, align 4
 210   %cmp2 = icmp ult i32 %1, %2
 211   br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
 212
 213 for.cond.cleanup3:                                ; preds = %for.cond1
 214   store i32 5, ptr %cleanup.dest.slot, align 4
 215   call void @llvm.lifetime.end.p0(i64 4, ptr %k) #3
 216   br label %for.end
 217
 218 for.body4:                                        ; preds = %for.cond1
     ; idx = zext(j) * 15 + zext(k), asserted to be below 225; the whole vector
     ; behind %A is loaded and element idx is extracted.
 219   %3 = load i32, ptr %k, align 4
 220   %conv = zext i32 %3 to i64
 221   %4 = load i32, ptr %j, align 4
 222   %conv5 = zext i32 %4 to i64
 223   %5 = mul i64 %conv5, 15
 224   %6 = add i64 %5, %conv
 225   %7 = icmp ult i64 %6, 225
 226   call void @llvm.assume(i1 %7)
 227   %8 = load ptr, ptr %A.addr, align 8
 228   %9 = load <225 x double>, ptr %8, align 8
 229   %matrixext = extractelement <225 x double> %9, i64 %6
     ; Second index: zext(j) * 15 + zext(%i), asserted to be below 225; used to
     ; extract one element from the vector behind %B.
 230   %10 = load i32, ptr %i.addr, align 4
 231   %conv6 = zext i32 %10 to i64
 232   %11 = load i32, ptr %j, align 4
 233   %conv7 = zext i32 %11 to i64
 234   %12 = mul i64 %conv7, 15
 235   %13 = add i64 %12, %conv6
 236   %14 = icmp ult i64 %13, 225
 237   call void @llvm.assume(i1 %14)
 238   %15 = load ptr, ptr %B.addr, align 8
 239   %16 = load <225 x double>, ptr %15, align 8
 240   %matrixext8 = extractelement <225 x double> %16, i64 %13
 241   %mul = fmul double %matrixext, %matrixext8
     ; idx is recomputed (same expression as %6) for the read-modify-write of
     ; the vector behind %B: extract, subtract the product, assert the bound
     ; again, reload, insert, and store the whole vector back.
 242   %17 = load ptr, ptr %B.addr, align 8
 243   %18 = load i32, ptr %k, align 4
 244   %conv9 = zext i32 %18 to i64
 245   %19 = load i32, ptr %j, align 4
 246   %conv10 = zext i32 %19 to i64
 247   %20 = mul i64 %conv10, 15
 248   %21 = add i64 %20, %conv9
 249   %22 = icmp ult i64 %21, 225
 250   call void @llvm.assume(i1 %22)
 251   %23 = load <225 x double>, ptr %17, align 8
 252   %matrixext11 = extractelement <225 x double> %23, i64 %21
 253   %sub = fsub double %matrixext11, %mul
 254   %24 = icmp ult i64 %21, 225
 255   call void @llvm.assume(i1 %24)
 256   %25 = load <225 x double>, ptr %17, align 8
 257   %matins = insertelement <225 x double> %25, double %sub, i64 %21
 258   store <225 x double> %matins, ptr %17, align 8
 259   br label %for.inc
 260
     ; Inner loop latch: k = k + 1.
 261 for.inc:                                          ; preds = %for.body4
 262   %26 = load i32, ptr %k, align 4
 263   %inc = add i32 %26, 1
 264   store i32 %inc, ptr %k, align 4
 265   br label %for.cond1
 266
 267 for.end:                                          ; preds = %for.cond.cleanup3
 268   br label %for.inc12
 269
     ; Outer loop latch: j = j + 1.
 270 for.inc12:                                        ; preds = %for.end
 271   %27 = load i32, ptr %j, align 4
 272   %inc13 = add i32 %27, 1
 273   store i32 %inc13, ptr %j, align 4
 274   br label %for.cond
 275
 276 for.end14:                                        ; preds = %for.cond.cleanup
 277   ret void
 278 }
 279
 280 ; Function Attrs: argmemonly nofree nosync nounwind willreturn
 281 declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
     ; Called in @matrix_extract_insert_loop on the %j and %k allocas before they
     ; are initialised.
 282
 283 ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn
 284 declare void @llvm.assume(i1 noundef) #2
     ; Called in both matrix functions with the result of an `icmp ult ..., 225`
     ; on the computed element index.
 285
 286 ; Function Attrs: argmemonly nofree nosync nounwind willreturn
 287 declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
     ; Called in @matrix_extract_insert_loop on the %j and %k allocas in the loop
     ; cleanup blocks.
     ; NOTE(review): attribute groups #1 and #2 used here (and #0 / #3 used by the
     ; functions above) have no `attributes #N = { ... }` definition in the
     ; visible part of this file -- confirm whether they were dropped on purpose.
 288
 289 ; Function Attrs: nounwind ssp uwtable mustprogress
 290
 291 define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) {
     ; Test input: builds <a0+a1, a2+a3, b0+b1, b2+b3> from scalar extractelement /
     ; fadd / insertelement sequences, then reverses the four lanes with a
     ; shufflevector (mask <3, 2, 1, 0>, which selects lanes of the first operand
     ; only), giving <b2+b3, b0+b1, a2+a3, a0+a1>.
     ; The autogenerated assertions below expect two shufflevectors of %b and %a
     ; (masks <2, 0, 6, 4> and <3, 1, 7, 5>, i.e. the first and second addend of
     ; each pair, already in reversed order) feeding a single <4 x float> fadd.
 292 ; CHECK-LABEL: @reverse_hadd_v4f32(
 293 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[A:%.*]], <4 x i32> <i32 2, i32 0, i32 6, i32 4>
 294 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[A]], <4 x i32> <i32 3, i32 1, i32 7, i32 5>
 295 ; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
 296 ; CHECK-NEXT:    ret <4 x float> [[TMP3]]
 297 ;
     ; Lane 0: a0 + a1.
 298   %vecext = extractelement <4 x float> %a, i32 0
 299   %vecext1 = extractelement <4 x float> %a, i32 1
 300   %add = fadd float %vecext, %vecext1
 301   %vecinit = insertelement <4 x float> undef, float %add, i32 0
     ; Lane 1: a2 + a3.
 302   %vecext2 = extractelement <4 x float> %a, i32 2
 303   %vecext3 = extractelement <4 x float> %a, i32 3
 304   %add4 = fadd float %vecext2, %vecext3
 305   %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
     ; Lane 2: b0 + b1.
 306   %vecext6 = extractelement <4 x float> %b, i32 0
 307   %vecext7 = extractelement <4 x float> %b, i32 1
 308   %add8 = fadd float %vecext6, %vecext7
 309   %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
     ; Lane 3: b2 + b3.
 310   %vecext10 = extractelement <4 x float> %b, i32 2
 311   %vecext11 = extractelement <4 x float> %b, i32 3
 312   %add12 = fadd float %vecext10, %vecext11
 313   %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
     ; Reverse the lane order; the second operand (%a) is never selected.
 314   %shuffle = shufflevector <4 x float> %vecinit13, <4 x float> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 315   ret <4 x float> %shuffle
 316 }