llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
   2 ; RUN: opt < %s -S -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
   3
   4 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
   5
   6 define i32 @masked_gather() {
   7 ; CHECK-LABEL: 'masked_gather'
   8 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x double> undef)
   9 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x double> undef)
  10 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x float> undef)
  11 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x float> undef)
  12 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef)
  13 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef)
  14 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef)
  15 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef)
  16 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x half> undef)
  17 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x half> undef)
  18 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef)
  19 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
  20 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x i32> undef)
  21 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i32> undef)
  22 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i32> undef)
  23 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i32> undef)
  24 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x i16> undef)
  25 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x i16> undef)
  26 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x i16> undef)
  27 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x i16> undef)
  28 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
  29 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
  30 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
  31 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef)
  32 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef)
  33 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I32p = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x ptr> undef)
  34 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
  35 ;
  36   %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x double> undef)
  37   %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x double> undef)
  38
  39   %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x float> undef)
  40   %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x float> undef)
  41   %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef)
  42   %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef)
  43
  44   %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef)
  45   %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef)
  46   %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x half> undef)
  47   %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x half> undef)
  48
  49   %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i64> undef)
  50   %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i64> undef)
  51
  52   %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> undef, i32 4, <16 x i1> undef, <16 x i32> undef)
  53   %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> undef, i32 4, <8 x i1> undef, <8 x i32> undef)
  54   %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x i32> undef)
  55   %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x i32> undef)
  56
  57   %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x i16> undef)
  58   %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x i16> undef)
  59   %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x i16> undef)
  60   %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x i16> undef)
  61
  62   %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
  63   %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
  64   %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
  65   %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x i8> undef)
  66   %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x i8> undef)
  67
  68   %V4I32p = call <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x ptr> undef)
  69
  70   ret i32 0
  71 }
  72
  73 define i32 @masked_scatter() {
  74 ; CHECK-LABEL: 'masked_scatter'
  75 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
  76 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
  77 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
  78 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
  79 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
  80 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
  81 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
  82 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
  83 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
  84 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
  85 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
  86 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
  87 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
  88 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
  89 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
  90 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
  91 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
  92 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
  93 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
  94 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
  95 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
  96 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
  97 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
  98 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
  99 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
 100 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 101 ;
 102   call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
 103   call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
 104
 105   call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
 106   call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
 107   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
 108   call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
 109
 110   call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
 111   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
 112   call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
 113   call void @llvm.masked.scatter.v2f16.v2p0(<2 x half> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
 114
 115   call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
 116   call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
 117
 118   call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> undef, <16 x ptr> undef, i32 4, <16 x i1> undef)
 119   call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> undef, <8 x ptr> undef, i32 4, <8 x i1> undef)
 120   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
 121   call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
 122
 123   call void @llvm.masked.scatter.v16i16.v16p0(<16 x i16> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
 124   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
 125   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
 126   call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
 127
 128   call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
 129   call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
 130   call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
 131   call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
 132   call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
 133
 134   ret i32 0
 135 }
 136
 137 define void @gep_v4i32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask)  {
 138 ; CHECK-LABEL: 'gep_v4i32'
 139 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i32, ptr %base, <4 x i32> %ind32
 140 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef)
 141 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
 142 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
 143 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i32, ptr %base, <4 x i32> %indzext
 144 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef)
 145 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
 146 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
 147 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i32, ptr %base, <4 x i32> %indsext
 148 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef)
 149 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
 150 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepu = getelementptr i32, ptr %base, <4 x i32> %ind32
 151 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef)
 152 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
 153 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
 154 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
 155 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef)
 156 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
 157 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
 158 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
 159 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef)
 160 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
 161 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 162 ;
 163   %gep1 = getelementptr i32, ptr %base, <4 x i32> %ind32
 164   %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef)
 165   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
 166
 167   %indzext = zext <4 x i16> %ind16 to <4 x i32>
 168   %gep2 = getelementptr i32, ptr %base, <4 x i32> %indzext
 169   %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef)
 170   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
 171
 172   %indsext = sext <4 x i16> %ind16 to <4 x i32>
 173   %gep3 = getelementptr i32, ptr %base, <4 x i32> %indsext
 174   %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef)
 175   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
 176
 177   ; unaligned
 178   %gepu = getelementptr i32, ptr %base, <4 x i32> %ind32
 179   %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef)
 180   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
 181
 182   ; 1 scale
 183   %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
 184   %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
 185   %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef)
 186   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
 187
 188   ; bad scale (but doesn't really matter because i32)
 189   %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
 190   %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
 191   %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef)
 192   call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
 193   ret void
 194 }
 195
 196 define void @gep_v4f32(ptr %base, ptr %base16, ptr %base8, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask)  {
 197 ; CHECK-LABEL: 'gep_v4f32'
 198 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr float, ptr %base, <4 x i32> %ind32
 199 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef)
 200 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
 201 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
 202 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr float, ptr %base, <4 x i32> %indzext
 203 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef)
 204 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
 205 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
 206 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr float, ptr %base, <4 x i32> %indsext
 207 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef)
 208 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
 209 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, ptr %base, <4 x i32> %ind32
 210 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
 211 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
 212 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
 213 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
 214 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef)
 215 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
 216 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
 217 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
 218 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef)
 219 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
 220 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 221 ;
 222   %gep1 = getelementptr float, ptr %base, <4 x i32> %ind32
 223   %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef)
 224   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res1, <4 x ptr> %gep1, i32 4, <4 x i1> %mask)
 225
 226   %indzext = zext <4 x i16> %ind16 to <4 x i32>
 227   %gep2 = getelementptr float, ptr %base, <4 x i32> %indzext
 228   %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef)
 229   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res2, <4 x ptr> %gep2, i32 4, <4 x i1> %mask)
 230
 231   %indsext = sext <4 x i16> %ind16 to <4 x i32>
 232   %gep3 = getelementptr float, ptr %base, <4 x i32> %indsext
 233   %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef)
 234   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %res3, <4 x ptr> %gep3, i32 4, <4 x i1> %mask)
 235
 236   ; unaligned
 237   %gepu = getelementptr float, ptr %base, <4 x i32> %ind32
 238   %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
 239   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resu, <4 x ptr> %gepu, i32 1, <4 x i1> %mask)
 240
 241   ; 1 scale
 242   %gepos = getelementptr i8, ptr %base8, <4 x i32> %indzext
 243   %geposb = bitcast <4 x ptr> %gepos to <4 x ptr>
 244   %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef)
 245   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resos, <4 x ptr> %geposb, i32 4, <4 x i1> %mask)
 246
 247   ; bad scale (but doesn't really matter because i32)
 248   %gepbs = getelementptr i16, ptr %base16, <4 x i32> %indzext
 249   %gepbsb = bitcast <4 x ptr> %gepbs to <4 x ptr>
 250   %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef)
 251   call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %resbs, <4 x ptr> %gepbsb, i32 4, <4 x i1> %mask)
 252   ret void
 253 }
 254
 255 define void @gep_v4i16(ptr %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1> %mask)  {
 256 ; CHECK-LABEL: 'gep_v4i16'
 257 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, ptr %base, <4 x i32> %ind32
 258 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef)
 259 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res1, <4 x ptr> %gep1, i32 2, <4 x i1> %mask)
 260 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
 261 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, ptr %base, <4 x i32> %indzext
 262 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef)
 263 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res2, <4 x ptr> %gep2, i32 2, <4 x i1> %mask)
 264 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
 265 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, ptr %base, <4 x i32> %indsext
 266 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
 267 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res3, <4 x ptr> %gep3, i32 2, <4 x i1> %mask)
 268 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i16, ptr %base, <4 x i16> %ind16
 269 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
 270 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <4 x i16> %res5 to <4 x i32>
 271 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
 272 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 273 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
 274 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <4 x i16> %res6 to <4 x i32>
 275 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
 276 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 277 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 278 ;
 279   %gep1 = getelementptr i16, ptr %base, <4 x i32> %ind32
 280   %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef)
 281   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res1, <4 x ptr> %gep1, i32 2, <4 x i1> %mask)
 282
 283   %indzext = zext <4 x i16> %ind16 to <4 x i32>
 284   %gep2 = getelementptr i16, ptr %base, <4 x i32> %indzext
 285   %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef)
 286   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res2, <4 x ptr> %gep2, i32 2, <4 x i1> %mask)
 287
 288   %indsext = sext <4 x i16> %ind16 to <4 x i32>
 289   %gep3 = getelementptr i16, ptr %base, <4 x i32> %indsext
 290   %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
 291   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res3, <4 x ptr> %gep3, i32 2, <4 x i1> %mask)
 292
 293   ; result zext
 294   %gep5 = getelementptr i16, ptr %base, <4 x i16> %ind16
 295   %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
 296   %res5zext = zext <4 x i16> %res5 to <4 x i32>
 297   %res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
 298   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 299
 300   ; result sext
 301   %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
 302   %res6sext = sext <4 x i16> %res6 to <4 x i32>
 303   %res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
 304   call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 305
 306   ret void
 307 }
 308
 309 define void @gep_v4i8(ptr %base, <4 x i8> %ind8, <4 x i1> %mask)  {
 310 ; CHECK-LABEL: 'gep_v4i8'
 311 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, ptr %base, <4 x i8> %ind8
 312 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
 313 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res5zext = zext <4 x i8> %res5 to <4 x i32>
 314 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
 315 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 316 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
 317 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %res6sext = sext <4 x i8> %res6 to <4 x i32>
 318 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
 319 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 320 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 321 ;
 322   ; result zext
 323   %gep5 = getelementptr i8, ptr %base, <4 x i8> %ind8
 324   %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
 325   %res5zext = zext <4 x i8> %res5 to <4 x i32>
 326   %res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
 327   call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res5trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 328
 329   ; result sext
 330   %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
 331   %res6sext = sext <4 x i8> %res6 to <4 x i32>
 332   %res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
 333   call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %res6trunc, <4 x ptr> %gep5, i32 4, <4 x i1> %mask)
 334
 335   ret void
 336 }
 337
 338 define void @gep_v8i16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i8> %ind8, <8 x i1> %mask)  {
 339 ; CHECK-LABEL: 'gep_v8i16'
 340 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, ptr %base, <8 x i32> %ind32
 341 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
 342 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
 343 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
 344 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, ptr %base, <8 x i32> %indzext
 345 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
 346 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
 347 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
 348 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, ptr %base, <8 x i32> %indsext
 349 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
 350 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
 351 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
 352 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
 353 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
 354 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
 355 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef)
 356 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
 357 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
 358 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
 359 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
 360 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
 361 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
 362 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i16, ptr %base, <8 x i32> %indzext4
 363 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
 364 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %indtrunc, <8 x ptr> %gep4, i32 2, <8 x i1> %mask)
 365 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
 366 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %ressext = sext <8 x i16> %res to <8 x i32>
 367 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %restrunc = trunc <8 x i32> %ressext to <8 x i16>
 368 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %restrunc, <8 x ptr> %gep4, i32 4, <8 x i1> %mask)
 369 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 370 ;
 371   ; no offset ext
 372   %gep1 = getelementptr i16, ptr %base, <8 x i32> %ind32
 373   %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
 374   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
 375
 376   ; offset zext
 377   %indzext = zext <8 x i16> %ind16 to <8 x i32>
 378   %gep2 = getelementptr i16, ptr %base, <8 x i32> %indzext
 379   %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
 380   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
 381
 382   ; offset sext
 383   %indsext = sext <8 x i16> %ind16 to <8 x i32>
 384   %gep3 = getelementptr i16, ptr %base, <8 x i32> %indsext
 385   %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
 386   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
 387
 388   ; unaligned
 389   %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
 390   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
 391
 392   ; 1 scale
 393   %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
 394   %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
 395   %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef)
 396   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
 397
 398   ; bad scale
 399   %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
 400   %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
 401   %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
 402   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
 403
 404   ; trunc scatter
 405   %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
 406   %gep4 = getelementptr i16, ptr %base, <8 x i32> %indzext4
 407   %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
 408   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %indtrunc, <8 x ptr> %gep4, i32 2, <8 x i1> %mask)
 409
 410   ; ext result to <8 x i32>
 411   %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
 412   %ressext = sext <8 x i16> %res to <8 x i32>
 413   %restrunc = trunc <8 x i32> %ressext to <8 x i16>
 414   call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %restrunc, <8 x ptr> %gep4, i32 4, <8 x i1> %mask)
 415
 416   ret void
 417 }
 418
 419 define void @gep_v8f16(ptr %base, ptr %base8, ptr %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask)  {
 420 ; CHECK-LABEL: 'gep_v8f16'
 421 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr half, ptr %base, <8 x i32> %ind32
 422 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
 423 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
 424 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
 425 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, ptr %base, <8 x i32> %indzext
 426 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
 427 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
 428 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
 429 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, ptr %base, <8 x i32> %indsext
 430 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
 431 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
 432 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
 433 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
 434 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
 435 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
 436 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef)
 437 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
 438 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
 439 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
 440 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
 441 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
 442 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 443 ;
 444   ; no offset ext
 445   %gep1 = getelementptr half, ptr %base, <8 x i32> %ind32
 446   %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
 447   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res1, <8 x ptr> %gep1, i32 2, <8 x i1> %mask)
 448
 449   ; offset zext
 450   %indzext = zext <8 x i16> %ind16 to <8 x i32>
 451   %gep2 = getelementptr half, ptr %base, <8 x i32> %indzext
 452   %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
 453   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res2, <8 x ptr> %gep2, i32 2, <8 x i1> %mask)
 454
 455   ; offset sext
 456   %indsext = sext <8 x i16> %ind16 to <8 x i32>
 457   %gep3 = getelementptr half, ptr %base, <8 x i32> %indsext
 458   %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
 459   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %res3, <8 x ptr> %gep3, i32 2, <8 x i1> %mask)
 460
 461   ; unaligned
 462   %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
 463   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resu, <8 x ptr> %gep2, i32 1, <8 x i1> %mask)
 464
 465   ; 1 scale
 466   %gepos = getelementptr i8, ptr %base8, <8 x i32> %indzext
 467   %geposb = bitcast <8 x ptr> %gepos to <8 x ptr>
 468   %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef)
 469   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resos, <8 x ptr> %geposb, i32 2, <8 x i1> %mask)
 470
 471   ; bad scale
 472   %gepbs = getelementptr i32, ptr %base32, <8 x i32> %indzext
 473   %gepbsb = bitcast <8 x ptr> %gepbs to <8 x ptr>
 474   %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
 475   call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %resbs, <8 x ptr> %gepbsb, i32 2, <8 x i1> %mask)
 476
 477   ret void
 478 }
 479
 480 define void @gep_v8i8(ptr %base, <8 x i8> %ind8, <8 x i1> %mask)  {
 481 ; CHECK-LABEL: 'gep_v8i8'
 482 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %indzext = zext <8 x i8> %ind8 to <8 x i32>
 483 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, ptr %base, <8 x i32> %indzext
 484 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
 485 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <8 x i8> %res5 to <8 x i16>
 486 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
 487 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res5trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
 488 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
 489 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <8 x i8> %res6 to <8 x i16>
 490 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
 491 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res6trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
 492 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 493 ;
 494   ; result zext
 495   %indzext = zext <8 x i8> %ind8 to <8 x i32>
 496   %gep5 = getelementptr i8, ptr %base, <8 x i32> %indzext
 497   %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
 498   %res5zext = zext <8 x i8> %res5 to <8 x i16>
 499   %res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
 500   call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res5trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
 501
 502   ; result sext
 503   %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
 504   %res6sext = sext <8 x i8> %res6 to <8 x i16>
 505   %res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
 506   call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %res6trunc, <8 x ptr> %gep5, i32 4, <8 x i1> %mask)
 507
 508   ret void
 509 }
 510
 511 define void @gep_v16i8(ptr %base, ptr %base16, <16 x i8> %ind8, <16 x i32> %ind32, <16 x i1> %mask)  {
 512 ; CHECK-LABEL: 'gep_v16i8'
 513 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i8, ptr %base, <16 x i32> %ind32
 514 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
 515 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res1, <16 x ptr> %gep1, i32 2, <16 x i1> %mask)
 516 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32>
 517 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, ptr %base, <16 x i32> %indzext
 518 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
 519 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res2, <16 x ptr> %gep2, i32 2, <16 x i1> %mask)
 520 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32>
 521 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, ptr %base, <16 x i32> %indsext
 522 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
 523 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res3, <16 x ptr> %gep3, i32 2, <16 x i1> %mask)
 524 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, ptr %base16, <16 x i32> %indzext
 525 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <16 x ptr> %gepbs to <16 x ptr>
 526 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
 527 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbsb, i32 2, <16 x i1> %mask)
 528 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
 529 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %gep4 = getelementptr i8, ptr %base, <16 x i32> %indzext
 530 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
 531 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %indtrunc, <16 x ptr> %gep4, i32 2, <16 x i1> %mask)
 532 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 533 ;
 534   ; no offset ext
 535   %gep1 = getelementptr i8, ptr %base, <16 x i32> %ind32
 536   %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
 537   call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res1, <16 x ptr> %gep1, i32 2, <16 x i1> %mask)
 538
 539   ; offset zext
 540   %indzext = zext <16 x i8> %ind8 to <16 x i32>
 541   %gep2 = getelementptr i8, ptr %base, <16 x i32> %indzext
 542   %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
 543   call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res2, <16 x ptr> %gep2, i32 2, <16 x i1> %mask)
 544
 545   ; offset sext
 546   %indsext = sext <16 x i8> %ind8 to <16 x i32>
 547   %gep3 = getelementptr i8, ptr %base, <16 x i32> %indsext
 548   %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
 549   call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %res3, <16 x ptr> %gep3, i32 2, <16 x i1> %mask)
 550
 551   ; Bad scale
 552   %gepbs = getelementptr i16, ptr %base16, <16 x i32> %indzext
 553   %gepbsb = bitcast <16 x ptr> %gepbs to <16 x ptr>
 554   %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
 555   call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbsb, i32 2, <16 x i1> %mask)
 556
 557   ; trunc scatter
 558   %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
 559   %gep4 = getelementptr i8, ptr %base, <16 x i32> %indzext
 560   %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
 561   call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %indtrunc, <16 x ptr> %gep4, i32 2, <16 x i1> %mask)
 562
 563   ret void
 564 }
 565
 566 define void @gep_v16i8p(<16 x ptr> %base, i32 %off, <16 x i1> %mask)  {
 567 ; CHECK-LABEL: 'gep_v16i8p'
 568 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i8, <16 x ptr> %base, i32 %off
 569 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef)
 570 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbs, i32 2, <16 x i1> %mask)
 571 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 572 ;
 573   %gepbs = getelementptr i8, <16 x ptr> %base, i32 %off
 574   %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef)
 575   call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %resbs, <16 x ptr> %gepbs, i32 2, <16 x i1> %mask)
 576   ret void
 577 }
 578
 579 declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>)
 580 declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
 581
 582 declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
 583 declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
 584 declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
 585 declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
 586
 587 declare <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x half>)
 588 declare <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x half>)
 589 declare <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x half>)
 590 declare <2 x half> @llvm.masked.gather.v2f16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x half>)
 591
 592 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>)
 593 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>)
 594
 595 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
 596 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
 597 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
 598 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
 599 declare <4 x ptr> @llvm.masked.gather.v4p0.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x ptr>)
 600
 601 declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
 602 declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
 603 declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
 604 declare <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i16>)
 605
 606 declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
 607 declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
 608 declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
 609 declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i8>)
 610 declare <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i8>)
 611
 612 declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32, <4 x i1>)
 613 declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
 614
 615 declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float>, <16 x ptr>, i32, <16 x i1>)
 616 declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
 617 declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
 618 declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
 619
 620 declare void @llvm.masked.scatter.v16f16.v16p0(<16 x half>, <16 x ptr>, i32, <16 x i1>)
 621 declare void @llvm.masked.scatter.v8f16.v8p0(<8 x half>, <8 x ptr>, i32, <8 x i1>)
 622 declare void @llvm.masked.scatter.v4f16.v4p0(<4 x half>, <4 x ptr>, i32, <4 x i1>)
 623 declare void @llvm.masked.scatter.v2f16.v2p0(<2 x half>, <2 x ptr>, i32, <2 x i1>)
 624
 625 declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
 626 declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
 627
 628 declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
 629 declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
 630 declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
 631 declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
 632
 633 declare void @llvm.masked.scatter.v16i16.v16p0(<16 x i16>, <16 x ptr>, i32, <16 x i1>)
 634 declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
 635 declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
 636 declare void @llvm.masked.scatter.v2i16.v2p0(<2 x i16>, <2 x ptr>, i32, <2 x i1>)
 637
 638 declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32, <32 x i1>)
 639 declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
 640 declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
 641 declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
 642 declare void @llvm.masked.scatter.v2i8.v2p0(<2 x i8>, <2 x ptr>, i32, <2 x i1>)