llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
   3 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
   4
   5 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
     ; Data layout fields, read per the LLVM LangRef grammar: little-endian ("e"),
     ; ELF name mangling ("m:e"), i64 aligned to 64 bits, f80 aligned to 128 bits,
     ; native integer widths 8/16/32/64, and 128-bit natural stack alignment ("S128").
   6
   7 define float @matching_fp_scalar(float* align 16 dereferenceable(16) %p) {
     ; Plain scalar float load (align 16) that is returned directly, with no
     ; insertelement user. The CHECK lines expect the IR to be left unchanged.
   8 ; CHECK-LABEL: @matching_fp_scalar(
   9 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
  10 ; CHECK-NEXT:    ret float [[R]]
  11 ;
  12   %r = load float, float* %p, align 16
  13   ret float %r
  14 }
  15
  16 define float @matching_fp_scalar_volatile(float* align 16 dereferenceable(16) %p) {
     ; Volatile scalar float load that is returned directly.
     ; The CHECK lines expect the volatile load to be left unchanged.
  17 ; CHECK-LABEL: @matching_fp_scalar_volatile(
  18 ; CHECK-NEXT:    [[R:%.*]] = load volatile float, float* [[P:%.*]], align 16
  19 ; CHECK-NEXT:    ret float [[R]]
  20 ;
  21   %r = load volatile float, float* %p, align 16
  22   ret float %r
  23 }
  24
  25 define double @larger_fp_scalar(float* align 16 dereferenceable(16) %p) {
     ; Loads a double through a bitcast of a float pointer (the loaded scalar is
     ; wider than the pointee type). The CHECK lines expect the IR unchanged.
  26 ; CHECK-LABEL: @larger_fp_scalar(
  27 ; CHECK-NEXT:    [[BC:%.*]] = bitcast float* [[P:%.*]] to double*
  28 ; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 16
  29 ; CHECK-NEXT:    ret double [[R]]
  30 ;
  31   %bc = bitcast float* %p to double*
  32   %r = load double, double* %bc, align 16
  33   ret double %r
  34 }
  35
  36 define float @smaller_fp_scalar(double* align 16 dereferenceable(16) %p) {
     ; Loads a float through a bitcast of a double pointer (the loaded scalar is
     ; narrower than the pointee type). The CHECK lines expect the IR unchanged.
  37 ; CHECK-LABEL: @smaller_fp_scalar(
  38 ; CHECK-NEXT:    [[BC:%.*]] = bitcast double* [[P:%.*]] to float*
  39 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
  40 ; CHECK-NEXT:    ret float [[R]]
  41 ;
  42   %bc = bitcast double* %p to float*
  43   %r = load float, float* %bc, align 16
  44   ret float %r
  45 }
  46
  47 define float @matching_fp_vector(<4 x float>* align 16 dereferenceable(16) %p) {
     ; Loads a float through a bitcast of a <4 x float> pointer and returns the
     ; scalar. The CHECK lines expect the IR unchanged.
  48 ; CHECK-LABEL: @matching_fp_vector(
  49 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
  50 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
  51 ; CHECK-NEXT:    ret float [[R]]
  52 ;
  53   %bc = bitcast <4 x float>* %p to float*
  54   %r = load float, float* %bc, align 16
  55   ret float %r
  56 }
  57
  58 define float @matching_fp_vector_gep00(<4 x float>* align 16 dereferenceable(16) %p) {
     ; Loads element 0 of a <4 x float> via an inbounds GEP (0, 0) and returns
     ; the scalar. The CHECK lines expect the IR unchanged.
  59 ; CHECK-LABEL: @matching_fp_vector_gep00(
  60 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
  61 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
  62 ; CHECK-NEXT:    ret float [[R]]
  63 ;
  64   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
  65   %r = load float, float* %gep, align 16
  66   ret float %r
  67 }
  68
  69 define float @matching_fp_vector_gep01(<4 x float>* align 16 dereferenceable(20) %p) {
     ; Loads element 1 of a <4 x float> via GEP (0, 1) with 20 dereferenceable
     ; bytes on the base pointer. The CHECK lines expect the IR unchanged.
  70 ; CHECK-LABEL: @matching_fp_vector_gep01(
  71 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
  72 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
  73 ; CHECK-NEXT:    ret float [[R]]
  74 ;
  75   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  76   %r = load float, float* %gep, align 4
  77   ret float %r
  78 }
  79
  80 define float @matching_fp_vector_gep01_deref(<4 x float>* align 16 dereferenceable(19) %p) {
     ; Loads element 1 of a <4 x float> via GEP (0, 1) with only 19
     ; dereferenceable bytes on the base pointer. The CHECK lines expect the IR
     ; unchanged.
  81 ; CHECK-LABEL: @matching_fp_vector_gep01_deref(
  82 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
  83 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
  84 ; CHECK-NEXT:    ret float [[R]]
  85 ;
  86   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  87   %r = load float, float* %gep, align 4
  88   ret float %r
  89 }
  90
  91 define float @matching_fp_vector_gep10(<4 x float>* align 16 dereferenceable(32) %p) {
     ; Loads element 0 of the second <4 x float> (GEP 1, 0) with 32
     ; dereferenceable bytes on the base pointer. The CHECK lines expect the IR
     ; unchanged.
  92 ; CHECK-LABEL: @matching_fp_vector_gep10(
  93 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
  94 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
  95 ; CHECK-NEXT:    ret float [[R]]
  96 ;
  97   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
  98   %r = load float, float* %gep, align 16
  99   ret float %r
 100 }
 101
 102 define float @matching_fp_vector_gep10_deref(<4 x float>* align 16 dereferenceable(31) %p) {
     ; Loads element 0 of the second <4 x float> (GEP 1, 0) with only 31
     ; dereferenceable bytes on the base pointer. The CHECK lines expect the IR
     ; unchanged.
 103 ; CHECK-LABEL: @matching_fp_vector_gep10_deref(
 104 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
 105 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
 106 ; CHECK-NEXT:    ret float [[R]]
 107 ;
 108   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
 109   %r = load float, float* %gep, align 16
 110   ret float %r
 111 }
 112
 113 define float @nonmatching_int_vector(<2 x i64>* align 16 dereferenceable(16) %p) {
     ; Loads a float through a bitcast of a <2 x i64> pointer, so the vector
     ; element type differs from the loaded scalar type. The CHECK lines expect
     ; the IR unchanged.
 114 ; CHECK-LABEL: @nonmatching_int_vector(
 115 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64>* [[P:%.*]] to float*
 116 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
 117 ; CHECK-NEXT:    ret float [[R]]
 118 ;
 119   %bc = bitcast <2 x i64>* %p to float*
 120   %r = load float, float* %bc, align 16
 121   ret float %r
 122 }
 123
 124 define double @less_aligned(double* align 4 dereferenceable(16) %p) {
     ; Scalar double load with align 4 from a pointer that is itself only
     ; align 4. The CHECK lines expect the IR unchanged.
 125 ; CHECK-LABEL: @less_aligned(
 126 ; CHECK-NEXT:    [[R:%.*]] = load double, double* [[P:%.*]], align 4
 127 ; CHECK-NEXT:    ret double [[R]]
 128 ;
 129   %r = load double, double* %p, align 4
 130   ret double %r
 131 }
 132
 133 define float @matching_fp_scalar_small_deref(float* align 16 dereferenceable(15) %p) {
     ; Scalar float load from a pointer with only 15 dereferenceable bytes.
     ; The CHECK lines expect the IR unchanged.
 134 ; CHECK-LABEL: @matching_fp_scalar_small_deref(
 135 ; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
 136 ; CHECK-NEXT:    ret float [[R]]
 137 ;
 138   %r = load float, float* %p, align 16
 139   ret float %r
 140 }
 141
 142 define i64 @larger_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
     ; Loads an i64 through a bitcast of a <4 x float> pointer (the scalar is
     ; wider than the vector element). The CHECK lines expect the IR unchanged.
 143 ; CHECK-LABEL: @larger_int_scalar(
 144 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i64*
 145 ; CHECK-NEXT:    [[R:%.*]] = load i64, i64* [[BC]], align 16
 146 ; CHECK-NEXT:    ret i64 [[R]]
 147 ;
 148   %bc = bitcast <4 x float>* %p to i64*
 149   %r = load i64, i64* %bc, align 16
 150   ret i64 %r
 151 }
 152
 153 define i8 @smaller_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
     ; Loads an i8 through a bitcast of a <4 x float> pointer (the scalar is
     ; narrower than the vector element). The CHECK lines expect the IR unchanged.
 154 ; CHECK-LABEL: @smaller_int_scalar(
 155 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i8*
 156 ; CHECK-NEXT:    [[R:%.*]] = load i8, i8* [[BC]], align 16
 157 ; CHECK-NEXT:    ret i8 [[R]]
 158 ;
 159   %bc = bitcast <4 x float>* %p to i8*
 160   %r = load i8, i8* %bc, align 16
 161   ret i8 %r
 162 }
 163
 164 define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable(32) %p) {
     ; Loads a double through a bitcast of an <8 x float> (256-bit) pointer with
     ; align 32. The CHECK lines expect the IR unchanged.
 165 ; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
 166 ; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x float>* [[P:%.*]] to double*
 167 ; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 32
 168 ; CHECK-NEXT:    ret double [[R]]
 169 ;
 170   %bc = bitcast <8 x float>* %p to double*
 171   %r = load double, double* %bc, align 32
 172   ret double %r
 173 }
 174
 175 define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) nofree nosync {
     ; Scalar float load (align 4) inserted into lane 0 of a poison <4 x float>.
     ; Expected output: bitcast to <4 x float>*, a full vector load with align 16
     ; (the parameter's alignment), and a shufflevector that keeps only lane 0.
 176 ; CHECK-LABEL: @load_f32_insert_v4f32(
 177 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
 178 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
 179 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 180 ; CHECK-NEXT:    ret <4 x float> [[R]]
 181 ;
 182   %s = load float, float* %p, align 4
 183   %r = insertelement <4 x float> poison, float %s, i32 0
 184   ret <4 x float> %r
 185 }
 186
 187 define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) nofree nosync {
     ; The scalar load goes through a bitcast from <4 x float>* to float*.
     ; Expected output: a vector load directly from %p (no bitcast) with align 4,
     ; followed by a shufflevector that keeps only lane 0.
 188 ; CHECK-LABEL: @casted_load_f32_insert_v4f32(
 189 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
 190 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 191 ; CHECK-NEXT:    ret <4 x float> [[R]]
 192 ;
 193   %b = bitcast <4 x float>* %p to float*
 194   %s = load float, float* %b, align 4
 195   %r = insertelement <4 x float> poison, float %s, i32 0
 196   ret <4 x float> %r
 197 }
 198
 199 ; Element type does not change cost.
 200
 201 define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
     ; Integer variant: scalar i32 load (align 4) inserted into lane 0 of a
     ; poison <4 x i32>. Expected output: <4 x i32> load with align 16 plus a
     ; lane-0 shufflevector.
 202 ; CHECK-LABEL: @load_i32_insert_v4i32(
 203 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
 204 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
 205 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 206 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 207 ;
 208   %s = load i32, i32* %p, align 4
 209   %r = insertelement <4 x i32> poison, i32 %s, i32 0
 210   ret <4 x i32> %r
 211 }
 212
 213 ; Pointer type does not change cost.
 214
 215 define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) nofree nosync {
     ; The scalar i32 load goes through a bitcast from <16 x i8>*.
     ; Expected output: %p bitcast to <4 x i32>*, a vector load with align 4,
     ; and a lane-0 shufflevector.
 216 ; CHECK-LABEL: @casted_load_i32_insert_v4i32(
 217 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
 218 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 219 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 220 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 221 ;
 222   %b = bitcast <16 x i8>* %p to i32*
 223   %s = load i32, i32* %b, align 4
 224   %r = insertelement <4 x i32> poison, i32 %s, i32 0
 225   ret <4 x i32> %r
 226 }
 227
 228 ; This is canonical form for vector element access.
 229
 230 define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) nofree nosync {
     ; The scalar is loaded via GEP (0, 0) into the vector and inserted at lane 0.
     ; Expected output: a vector load directly from %p with align 16 plus a
     ; lane-0 shufflevector; the GEP is gone.
 231 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
 232 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
 233 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 234 ; CHECK-NEXT:    ret <4 x float> [[R]]
 235 ;
 236   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
 237   %s = load float, float* %gep, align 16
 238   %r = insertelement <4 x float> poison, float %s, i64 0
 239   ret <4 x float> %r
 240 }
 241
 242 ; Should work with addrspace as well.
 243
 244 define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) nofree nosync {
     ; GEP (0, 0) + scalar load + insert pattern where the pointer lives in
     ; addrspace(44). Expected output: the vector load keeps addrspace(44) and is
     ; followed by a lane-0 shufflevector.
 245 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
 246 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
 247 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 248 ; CHECK-NEXT:    ret <4 x float> [[R]]
 249 ;
 250   %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
 251   %s = load float, float addrspace(44)* %gep, align 16
 252   %r = insertelement <4 x float> poison, float %s, i64 0
 253   ret <4 x float> %r
 254 }
 255
 256 ; Should work with addrspace even when peeking past unsafe loads through geps
 257
 258 define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(i32* align 16 dereferenceable(16) %v3) {
     ; The loaded address is %v3 advanced by one i32, addrspacecast to
     ; addrspace(42), then advanced by one more i32. Expected output: the base
     ; pointer is addrspacecast straight to <4 x i32> addrspace(42)*, loaded as a
     ; whole vector with align 16, and lane 2 is shuffled into lane 0.
 259 ; CHECK-LABEL: @unsafe_load_i32_insert_v4i32_addrspace(
 260 ; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast i32* [[V3:%.*]] to <4 x i32> addrspace(42)*
 261 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(42)* [[TMP1]], align 16
 262 ; CHECK-NEXT:    [[INSELT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
 263 ; CHECK-NEXT:    ret <4 x i32> [[INSELT]]
 264 ;
 265   %t0 = getelementptr inbounds i32, i32* %v3, i32 1
 266   %t1 = addrspacecast i32* %t0 to i32 addrspace(42)*
 267   %t2 = getelementptr inbounds i32, i32 addrspace(42)* %t1, i64 1
 268   %val = load i32, i32 addrspace(42)* %t2, align 4
 269   %inselt = insertelement <4 x i32> poison, i32 %val, i32 0
 270   ret <4 x i32> %inselt
 271 }
 272
 273 ; If there are enough dereferenceable bytes, we can offset the vector load.
 274
 275 define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {
     ; Element 1 (byte offset 2) is loaded with 18 dereferenceable bytes on %p.
     ; Expected output: the GEP is kept, its result is bitcast to <8 x i16>*, and
     ; a vector load with align 2 from that offset address feeds a lane-0 shuffle.
 276 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 277 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
 278 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
 279 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
 280 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 281 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 282 ;
 283   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
 284   %s = load i16, i16* %gep, align 2
 285   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 286   ret <8 x i16> %r
 287 }
 288
 289 ; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 290
 291 define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) nofree nosync {
     ; Element 1 is loaded with only 17 dereferenceable bytes on %p.
     ; Expected output differs per run line: SSE2 keeps the scalar load and
     ; insertelement; AVX2 loads the whole vector from the base pointer
     ; (align 16) and shuffles lane 1 into lane 0.
 292 ; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
 293 ; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
 294 ; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
 295 ; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 296 ; SSE2-NEXT:    ret <8 x i16> [[R]]
 297 ;
 298 ; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
 299 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
 300 ; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 301 ; AVX2-NEXT:    ret <8 x i16> [[R]]
 302 ;
 303   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
 304   %s = load i16, i16* %gep, align 2
 305   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 306   ret <8 x i16> %r
 307 }
 308
 309 ; Verify that alignment of the new load is not over-specified.
 310
 311 define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) nofree nosync {
     ; The base pointer is only align 2, while the scalar load of element 1
     ; claims align 8. Expected output: SSE2 keeps the scalar form; AVX2 loads the
     ; whole vector from %p with align 2 (not 8) and shuffles lane 1 into lane 0.
 312 ; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
 313 ; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
 314 ; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
 315 ; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 316 ; SSE2-NEXT:    ret <8 x i16> [[R]]
 317 ;
 318 ; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
 319 ; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
 320 ; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 321 ; AVX2-NEXT:    ret <8 x i16> [[R]]
 322 ;
 323   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
 324   %s = load i16, i16* %gep, align 8
 325   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 326   ret <8 x i16> %r
 327 }
 328
 329 ; Negative test - if we are shuffling a load from the base pointer, the address offset
 330 ; must be a multiple of element size.
 331 ; TODO: Could bitcast around this limitation.
 332
 333 define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) nofree nosync {
     ; An i32 is loaded from byte offset 1 of a <16 x i8>, which is not a
     ; multiple of the 4-byte element size. The CHECK lines expect the IR
     ; unchanged.
 334 ; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
 335 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1
 336 ; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
 337 ; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
 338 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
 339 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 340 ;
 341   %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 1
 342   %b = bitcast i8* %gep to i32*
 343   %s = load i32, i32* %b, align 1
 344   %r = insertelement <4 x i32> poison, i32 %s, i64 0
 345   ret <4 x i32> %r
 346 }
 347
 348 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
     ; An i32 is loaded from byte offset 12 of a <16 x i8>, i.e. i32 lane 3.
     ; Expected output: %p bitcast to <4 x i32>*, a vector load with align 1, and
     ; a shufflevector that moves lane 3 into lane 0.
 349 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
 350 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
 351 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
 352 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
 353 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 354 ;
 355   %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
 356   %b = bitcast i8* %gep to i32*
 357   %s = load i32, i32* %b, align 1
 358   %r = insertelement <4 x i32> poison, i32 %s, i64 0
 359   ret <4 x i32> %r
 360 }
 361
 362 ; Negative test - if we are shuffling a load from the base pointer, the address offset
 363 ; must be a multiple of element size and the offset must be low enough to fit in the vector
 364 ; (bitcasting would not help this case).
 365
 366 define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
     ; An i32 is loaded from byte offset 13 of a <16 x i8>: the offset is not a
     ; multiple of 4 and the 4 loaded bytes extend past the 16-byte vector.
     ; The CHECK lines expect the IR unchanged.
 367 ; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
 368 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 13
 369 ; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
 370 ; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
 371 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
 372 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 373 ;
 374   %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 13
 375   %b = bitcast i8* %gep to i32*
 376   %s = load i32, i32* %b, align 1
 377   %r = insertelement <4 x i32> poison, i32 %s, i64 0
 378   ret <4 x i32> %r
 379 }
 380
 381 ; If there are enough dereferenceable bytes, we can offset the vector load.
 382
 383 define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) nofree nosync {
     ; Element 0 of the second <8 x i16> (GEP 1, 0) is loaded with 32
     ; dereferenceable bytes on %p. Expected output: the GEP is kept, bitcast to
     ; <8 x i16>*, and a vector load with align 16 feeds a lane-0 shufflevector.
 384 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 385 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
 386 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
 387 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
 388 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 389 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 390 ;
 391   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
 392   %s = load i16, i16* %gep, align 16
 393   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 394   ret <8 x i16> %r
 395 }
 396
 397 ; Negative test - disable under asan because widened load can cause spurious
 398 ; use-after-poison issues when __asan_poison_memory_region is used.
 399
 400 define <8 x i16> @gep10_load_i16_insert_v8i16_asan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_address {
     ; GEP (1, 0) + scalar load + insert pattern in a function carrying the
     ; sanitize_address attribute. The CHECK lines expect the IR unchanged.
 401 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan(
 402 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
 403 ; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
 404 ; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 405 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 406 ;
 407   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
 408   %s = load i16, i16* %gep, align 16
 409   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 410   ret <8 x i16> %r
 411 }
 412
 413 ; hwasan and memtag should be similarly suppressed.
 414
 415 define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_hwaddress {
     ; GEP (1, 0) + scalar load + insert pattern in a function carrying the
     ; sanitize_hwaddress attribute. The CHECK lines expect the IR unchanged.
 416 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan(
 417 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
 418 ; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
 419 ; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 420 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 421 ;
 422   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
 423   %s = load i16, i16* %gep, align 16
 424   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 425   ret <8 x i16> %r
 426 }
 427
 428 define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_memtag {
     ; GEP (1, 0) + scalar load + insert pattern in a function carrying the
     ; sanitize_memtag attribute. The CHECK lines expect the IR unchanged.
 429 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag(
 430 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
 431 ; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
 432 ; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 433 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 434 ;
 435   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
 436   %s = load i16, i16* %gep, align 16
 437   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 438   ret <8 x i16> %r
 439 }
 440
 441 ; Negative test - disable under tsan because widened load may overlap bytes
 442 ; being concurrently modified. tsan does not know that some bytes are undef.
 443
 444 define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_thread {
     ; GEP (1, 0) + scalar load + insert pattern in a function carrying the
     ; sanitize_thread attribute. The CHECK lines expect the IR unchanged.
 445 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan(
 446 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
 447 ; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
 448 ; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 449 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 450 ;
 451   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
 452   %s = load i16, i16* %gep, align 16
 453   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 454   ret <8 x i16> %r
 455 }
 456
 457 ; Negative test - can't safely load the offset vector, but could load+shuffle.
 458
 459 define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) nofree nosync {
     ; GEP (1, 0) + scalar load + insert pattern with only 31 dereferenceable
     ; bytes on %p, one short of a full vector at byte offset 16.
     ; The CHECK lines expect the IR unchanged.
 460 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
 461 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
 462 ; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
 463 ; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 464 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 465 ;
 466   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
 467   %s = load i16, i16* %gep, align 16
 468   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 469   ret <8 x i16> %r
 470 }
 471
 472 ; Negative test - do not alter volatile.
 473
 474 define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) nofree nosync {
     ; Volatile scalar float load inserted into lane 0 of a poison <4 x float>.
     ; The CHECK lines expect the volatile load and insertelement unchanged.
 475 ; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
 476 ; CHECK-NEXT:    [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
 477 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
 478 ; CHECK-NEXT:    ret <4 x float> [[R]]
 479 ;
 480   %s = load volatile float, float* %p, align 4
 481   %r = insertelement <4 x float> poison, float %s, i32 0
 482   ret <4 x float> %r
 483 }
 484
 485 ; Pointer is not as aligned as load, but that's ok.
 486 ; The new load uses the larger alignment value.
 487
 488 define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) nofree nosync {
     ; The parameter is align 1 while the scalar load claims align 4.
     ; Expected output: a <4 x float> load with align 4 (the load's alignment,
     ; not the parameter's) followed by a lane-0 shufflevector.
 489 ; CHECK-LABEL: @load_f32_insert_v4f32_align(
 490 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
 491 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
 492 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 493 ; CHECK-NEXT:    ret <4 x float> [[R]]
 494 ;
 495   %s = load float, float* %p, align 4
 496   %r = insertelement <4 x float> poison, float %s, i32 0
 497   ret <4 x float> %r
 498 }
 499
 500 ; Negative test - not enough bytes.
 501
 502 define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) nofree nosync {
     ; Scalar load + insert with only 15 dereferenceable bytes, one short of a
     ; 16-byte <4 x float>. The CHECK lines expect the IR unchanged.
 503 ; CHECK-LABEL: @load_f32_insert_v4f32_deref(
 504 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
 505 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0
 506 ; CHECK-NEXT:    ret <4 x float> [[R]]
 507 ;
 508   %s = load float, float* %p, align 4
 509   %r = insertelement <4 x float> poison, float %s, i32 0
 510   ret <4 x float> %r
 511 }
 512
 513 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
     ; Scalar i32 load inserted into lane 0 of a poison <8 x i32>, with 16
     ; dereferenceable bytes. Expected output: a 128-bit <4 x i32> load (align 16)
     ; and a shufflevector that widens the result to 8 lanes, keeping lane 0.
 514 ; CHECK-LABEL: @load_i32_insert_v8i32(
 515 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
 516 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
 517 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 518 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 519 ;
 520   %s = load i32, i32* %p, align 4
 521   %r = insertelement <8 x i32> poison, i32 %s, i32 0
 522   ret <8 x i32> %r
 523 }
 524
 525 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) nofree nosync {
     ; The scalar i32 load goes through a bitcast from <4 x i32>* and is inserted
     ; into a poison <8 x i32>. Expected output: a <4 x i32> load directly from %p
     ; (align 4) and a shufflevector that widens to 8 lanes, keeping lane 0.
 526 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
 527 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
 528 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 529 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 530 ;
 531   %b = bitcast <4 x i32>* %p to i32*
 532   %s = load i32, i32* %b, align 4
 533   %r = insertelement <8 x i32> poison, i32 %s, i32 0
 534   ret <8 x i32> %r
 535 }
 536
 537 define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) nofree nosync {
     ; Scalar float load inserted into lane 0 of a poison <16 x float>.
     ; Expected output: a 128-bit <4 x float> load (align 16) and a shufflevector
     ; that widens the result to 16 lanes, keeping lane 0.
 538 ; CHECK-LABEL: @load_f32_insert_v16f32(
 539 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
 540 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
 541 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 542 ; CHECK-NEXT:    ret <16 x float> [[R]]
 543 ;
 544   %s = load float, float* %p, align 4
 545   %r = insertelement <16 x float> poison, float %s, i32 0
 546   ret <16 x float> %r
 547 }
 548
 549 define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) nofree nosync {
     ; Scalar float load inserted into lane 0 of a poison <2 x float>.
     ; Expected output: a 128-bit <4 x float> load (align 16) and a shufflevector
     ; that narrows the result to 2 lanes, keeping lane 0.
 550 ; CHECK-LABEL: @load_f32_insert_v2f32(
 551 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
 552 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
 553 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 undef>
 554 ; CHECK-NEXT:    ret <2 x float> [[R]]
 555 ;
 556   %s = load float, float* %p, align 4
 557   %r = insertelement <2 x float> poison, float %s, i32 0
 558   ret <2 x float> %r
 559 }
 560
 561 ; Negative test - suppress load widening for asan/hwasan/memtag/tsan.
 562
 563 define <2 x float> @load_f32_insert_v2f32_asan(float* align 16 dereferenceable(16) %p) sanitize_address {
     ; Scalar float load inserted into lane 0 of a poison <2 x float>, in a
     ; function carrying the sanitize_address attribute. The CHECK lines expect
     ; the IR unchanged.
 564 ; CHECK-LABEL: @load_f32_insert_v2f32_asan(
 565 ; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
 566 ; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> poison, float [[S]], i32 0
 567 ; CHECK-NEXT:    ret <2 x float> [[R]]
 568 ;
 569   %s = load float, float* %p, align 4
 570   %r = insertelement <2 x float> poison, float %s, i32 0
 571   ret <2 x float> %r
 572 }
 573
 574 declare float* @getscaleptr()
 575 define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr, <2 x float>* nocapture nonnull readonly %opptr) nofree nosync {
     ; The scalar %scale, loaded from the pointer returned by @getscaleptr, has
     ; two insertelement users (lanes 0 and 1). The splat is multiplied with %op
     ; and the product is rebuilt lane by lane before being stored.
     ; The CHECK lines expect the whole function unchanged.
 576 ; CHECK-LABEL: @PR47558_multiple_use_load(
 577 ; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
 578 ; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, <2 x float>* [[OPPTR:%.*]], align 4
 579 ; CHECK-NEXT:    [[SCALE:%.*]] = load float, float* [[SCALEPTR]], align 16
 580 ; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
 581 ; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
 582 ; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
 583 ; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
 584 ; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
 585 ; CHECK-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
 586 ; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
 587 ; CHECK-NEXT:    store <2 x float> [[RESULT1]], <2 x float>* [[RESULTPTR:%.*]], align 8
 588 ; CHECK-NEXT:    ret void
 589 ;
 590   %scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
 591   %op = load <2 x float>, <2 x float>* %opptr, align 4
 592   %scale = load float, float* %scaleptr, align 16
 593   %t1 = insertelement <2 x float> poison, float %scale, i32 0
 594   %t2 = insertelement <2 x float> %t1, float %scale, i32 1
 595   %t3 = fmul <2 x float> %op, %t2
 596   %t4 = extractelement <2 x float> %t3, i32 0
 597   %result0 = insertelement <2 x float> poison, float %t4, i32 0
 598   %t5 = extractelement <2 x float> %t3, i32 1
 599   %result1 = insertelement <2 x float> %result0, float %t5, i32 1
 600   store <2 x float> %result1, <2 x float>* %resultptr, align 8
 601   ret void
 602 }
 603
 604 define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
     ; A <2 x float> is loaded, lane 0 is extracted and inserted into lane 0 of a
     ; poison <4 x float>. Expected output: %p bitcast to <4 x float>*, a vector
     ; load with align 16, and a lane-0 shufflevector.
 605 ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
 606 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
 607 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
 608 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 609 ; CHECK-NEXT:    ret <4 x float> [[R]]
 610 ;
 611   %l = load <2 x float>, <2 x float>* %p, align 4
 612   %s = extractelement <2 x float> %l, i32 0
 613   %r = insertelement <4 x float> poison, float %s, i32 0
 614   ret <4 x float> %r
 615 }
 616
 617 define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) nofree nosync {
     ; An <8 x float> is loaded, lane 0 is extracted and inserted into lane 0 of
     ; a poison <4 x float>. Expected output: %p bitcast to <4 x float>*, a
     ; narrower vector load with align 16, and a lane-0 shufflevector.
 618 ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
 619 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
 620 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
 621 ; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 622 ; CHECK-NEXT:    ret <4 x float> [[R]]
 623 ;
 624   %l = load <8 x float>, <8 x float>* %p, align 4
 625   %s = extractelement <8 x float> %l, i32 0
 626   %r = insertelement <4 x float> poison, float %s, i32 0
 627   ret <4 x float> %r
 628 }
 629
 630 define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(<1 x i32>* align 16 dereferenceable(16) %p, <1 x i32>* %store_ptr) nofree nosync {
     ; The loaded <1 x i32> has a second use: it is stored to %store_ptr as well
     ; as feeding the extract + insert. The CHECK lines expect the same
     ; instructions; the store is printed with an explicit "align 4".
 631 ; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
 632 ; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, <1 x i32>* [[P:%.*]], align 4
 633 ; CHECK-NEXT:    store <1 x i32> [[L]], <1 x i32>* [[STORE_PTR:%.*]], align 4
 634 ; CHECK-NEXT:    [[S:%.*]] = extractelement <1 x i32> [[L]], i32 0
 635 ; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> poison, i32 [[S]], i32 0
 636 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 637 ;
 638   %l = load <1 x i32>, <1 x i32>* %p, align 4
 639   store <1 x i32> %l, <1 x i32>* %store_ptr
 640   %s = extractelement <1 x i32> %l, i32 0
 641   %r = insertelement <8 x i32> poison, i32 %s, i32 0
 642   ret <8 x i32> %r
 643 }
 644
 645 ; Can't safely load the offset vector, but can load+shuffle if it is profitable.
 646
 647 define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) nofree nosync {
     ; The second <2 x i16> (GEP index 1, byte offset 4) is loaded, lane 0 is
     ; extracted and inserted into a poison <8 x i16>. Expected output: SSE2
     ; narrows to a scalar i16 load through an extra GEP (0, 0), keeping the
     ; insertelement; AVX2 bitcasts %p to <8 x i16>*, loads the whole vector with
     ; align 4, and shuffles lane 2 into lane 0.
 648 ; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 649 ; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
 650 ; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
 651 ; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 8
 652 ; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
 653 ; SSE2-NEXT:    ret <8 x i16> [[R]]
 654 ;
 655 ; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
 656 ; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
 657 ; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
 658 ; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 659 ; AVX2-NEXT:    ret <8 x i16> [[R]]
 660 ;
 661   %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
 662   %l = load <2 x i16>, <2 x i16>* %gep, align 8
 663   %s = extractelement <2 x i16> %l, i32 0
 664   %r = insertelement <8 x i16> poison, i16 %s, i64 0
 665   ret <8 x i16> %r
 666 }
 667
 668 ; PR30986 - split vector loads for scalarized operations
 669 define <2 x i64> @PR30986(<2 x i64>* %0) {
     ; A <2 x i64> is loaded once; each lane is extracted, passed through
     ; @llvm.ctpop.i64, and reinserted. Expected output: the vector load is
     ; replaced by two scalar i64 loads through GEPs (0, 0) and (0, 1), with
     ; align 16 and align 8 respectively; the ctpop calls and inserts remain.
 670 ; CHECK-LABEL: @PR30986(
 671 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0:%.*]], i32 0, i32 0
 672 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 16
 673 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP3]])
 674 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i32 0
 675 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0]], i32 0, i32 1
 676 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
 677 ; CHECK-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP7]])
 678 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1
 679 ; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
 680 ;
 681   %2 = load <2 x i64>, <2 x i64>* %0, align 16
 682   %3 = extractelement <2 x i64> %2, i32 0
 683   %4 = tail call i64 @llvm.ctpop.i64(i64 %3)
 684   %5 = insertelement <2 x i64> poison, i64 %4, i32 0
 685   %6 = extractelement <2 x i64> %2, i32 1
 686   %7 = tail call i64 @llvm.ctpop.i64(i64 %6)
 687   %8 = insertelement <2 x i64> %5, i64 %7, i32 1
 688   ret <2 x i64> %8
 689 }
 690 declare i64 @llvm.ctpop.i64(i64)