; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define float @matching_fp_scalar(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar(
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, float* %p, align 16
  ret float %r
}

define float @matching_fp_scalar_volatile(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_scalar_volatile(
; CHECK-NEXT:    [[R:%.*]] = load volatile float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load volatile float, float* %p, align 16
  ret float %r
}

define double @larger_fp_scalar(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_fp_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast float* [[P:%.*]] to double*
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 16
; CHECK-NEXT:    ret double [[R]]
;
  %bc = bitcast float* %p to double*
  %r = load double, double* %bc, align 16
  ret double %r
}

define float @smaller_fp_scalar(double* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_fp_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast double* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast double* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define float @matching_fp_vector(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast <4 x float>* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define float @matching_fp_vector_gep00(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @matching_fp_vector_gep00(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep01(<4 x float>* align 16 dereferenceable(20) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  %r = load float, float* %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep01_deref(<4 x float>* align 16 dereferenceable(19) %p) {
; CHECK-LABEL: @matching_fp_vector_gep01_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 4
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
  %r = load float, float* %gep, align 4
  ret float %r
}

define float @matching_fp_vector_gep10(<4 x float>* align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @matching_fp_vector_gep10_deref(<4 x float>* align 16 dereferenceable(31) %p) {
; CHECK-LABEL: @matching_fp_vector_gep10_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[GEP]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 1, i64 0
  %r = load float, float* %gep, align 16
  ret float %r
}

define float @nonmatching_int_vector(<2 x i64>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @nonmatching_int_vector(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64>* [[P:%.*]] to float*
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[BC]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %bc = bitcast <2 x i64>* %p to float*
  %r = load float, float* %bc, align 16
  ret float %r
}

define double @less_aligned(double* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @less_aligned(
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[P:%.*]], align 4
; CHECK-NEXT:    ret double [[R]]
;
  %r = load double, double* %p, align 4
  ret double %r
}

define float @matching_fp_scalar_small_deref(float* align 16 dereferenceable(15) %p) {
; CHECK-LABEL: @matching_fp_scalar_small_deref(
; CHECK-NEXT:    [[R:%.*]] = load float, float* [[P:%.*]], align 16
; CHECK-NEXT:    ret float [[R]]
;
  %r = load float, float* %p, align 16
  ret float %r
}

define i64 @larger_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @larger_int_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i64*
; CHECK-NEXT:    [[R:%.*]] = load i64, i64* [[BC]], align 16
; CHECK-NEXT:    ret i64 [[R]]
;
  %bc = bitcast <4 x float>* %p to i64*
  %r = load i64, i64* %bc, align 16
  ret i64 %r
}

define i8 @smaller_int_scalar(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @smaller_int_scalar(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x float>* [[P:%.*]] to i8*
; CHECK-NEXT:    [[R:%.*]] = load i8, i8* [[BC]], align 16
; CHECK-NEXT:    ret i8 [[R]]
;
  %bc = bitcast <4 x float>* %p to i8*
  %r = load i8, i8* %bc, align 16
  ret i8 %r
}

define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable(32) %p) {
; CHECK-LABEL: @larger_fp_scalar_256bit_vec(
; CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x float>* [[P:%.*]] to double*
; CHECK-NEXT:    [[R:%.*]] = load double, double* [[BC]], align 32
; CHECK-NEXT:    ret double [[R]]
;
  %bc = bitcast <8 x float>* %p to double*
  %r = load double, double* %bc, align 32
  ret double %r
}

define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %b = bitcast <4 x float>* %p to float*
  %s = load float, float* %b, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Element type does not change cost.

define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %s = load i32, i32* %p, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; Pointer type does not change cost.

define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %b = bitcast <16 x i8>* %p to i32*
  %s = load i32, i32* %b, align 4
  %r = insertelement <4 x i32> undef, i32 %s, i32 0
  ret <4 x i32> %r
}

; This is canonical form for vector element access.

define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
  %s = load float, float* %gep, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; Should work with addrspace as well.

define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
  %s = load float, float addrspace(44)* %gep, align 16
  %r = insertelement <4 x float> undef, float %s, i64 0
  ret <4 x float> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.
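; (In the test below, the widened <8 x i16> load starts at byte offset 2 and
; reads bytes 2-17, so dereferenceable(18) is exactly enough.)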

define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.
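; (dereferenceable(17) is one byte short of the 2 + 16 = 18 bytes an offset
; load would need, so AVX2 instead loads from the base pointer and shuffles
; element 1 into lane 0.)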

define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) nofree nosync {
; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 2
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 2
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Verify that alignment of the new load is not over-specified.
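; (The new load is from the base pointer, whose known alignment is only 2, so
; the widened AVX2 load is tagged align 2 even though the scalar load claimed
; align 8.)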

define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 8
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
  %s = load i16, i16* %gep, align 8
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size.
; TODO: Could bitcast around this limitation.
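; (Here the 1-byte offset is not a multiple of the 4-byte i32 element size, so
; no lane of a base-pointer <4 x i32> load holds this value.)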

define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) {
; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1
; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 1
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; Negative test - if we are shuffling a load from the base pointer, the address offset
; must be a multiple of element size and the offset must be low enough to fit in the vector
; (bitcasting would not help this case).
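; (Byte offset 13 is not a multiple of 4, and an i32 there would span bytes
; 13-16, crossing the end of the 16-byte base vector.)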

define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep013_bitcast_load_i32_insert_v4i32(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 13
; CHECK-NEXT:    [[B:%.*]] = bitcast i8* [[GEP]] to i32*
; CHECK-NEXT:    [[S:%.*]] = load i32, i32* [[B]], align 1
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT:    ret <4 x i32> [[R]]
;
  %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 13
  %b = bitcast i8* %gep to i32*
  %s = load i32, i32* %b, align 1
  %r = insertelement <4 x i32> undef, i32 %s, i64 0
  ret <4 x i32> %r
}

; If there are enough dereferenceable bytes, we can offset the vector load.
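; (The widened load starts at byte offset 16 and reads bytes 16-31, so
; dereferenceable(32) suffices.)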

define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under asan because widened load can cause spurious
; use-after-poison issues when __asan_poison_memory_region is used.

define <8 x i16> @gep10_load_i16_insert_v8i16_asan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_address nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_asan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; hwasan and memtag should be similarly suppressed.

define <8 x i16> @gep10_load_i16_insert_v8i16_hwasan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_hwaddress nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_hwasan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

define <8 x i16> @gep10_load_i16_insert_v8i16_memtag(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_memtag nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_memtag(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - disable under tsan because widened load may overlap bytes
; being concurrently modified. tsan does not know that some bytes are undef.

define <8 x i16> @gep10_load_i16_insert_v8i16_tsan(<8 x i16>* align 16 dereferenceable(32) %p) sanitize_thread nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_tsan(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - can't safely load the offset vector, but could load+shuffle.
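; (dereferenceable(31) is one byte short of the 32 needed for a 16-byte load
; at offset 16.)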

define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref(
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
; CHECK-NEXT:    [[S:%.*]] = load i16, i16* [[GEP]], align 16
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
  %s = load i16, i16* %gep, align 16
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; Negative test - do not alter volatile.

define <4 x float> @load_f32_insert_v4f32_volatile(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_volatile(
; CHECK-NEXT:    [[S:%.*]] = load volatile float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load volatile float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Pointer is not as aligned as load, but that's ok.
; The new load uses the larger alignment value.
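; (The pointer is only align 1, but the scalar load asserts align 4, so the
; widened load is tagged align 4.)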

define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; Negative test - not enough bytes.
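; (A widened <4 x float> load needs 16 dereferenceable bytes; only 15 are
; known here.)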

define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_deref(
; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %s = load i32, i32* %p, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %b = bitcast <4 x i32>* %p to i32*
  %s = load i32, i32* %b, align 4
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <16 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <16 x float> undef, float %s, i32 0
  ret <16 x float> %r
}

define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

; Negative test - suppress load widening for asan/hwasan/memtag/tsan.

define <2 x float> @load_f32_insert_v2f32_asan(float* align 16 dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_f32_insert_v2f32_asan(
; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

declare float* @getscaleptr()
define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr, <2 x float>* nocapture nonnull readonly %opptr) {
; CHECK-LABEL: @PR47558_multiple_use_load(
; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, <2 x float>* [[OPPTR:%.*]], align 4
; CHECK-NEXT:    [[SCALE:%.*]] = load float, float* [[SCALEPTR]], align 16
; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
; CHECK-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
; CHECK-NEXT:    store <2 x float> [[RESULT1]], <2 x float>* [[RESULTPTR:%.*]], align 8
; CHECK-NEXT:    ret void
;
  %scaleptr = tail call nonnull align 16 dereferenceable(64) float* @getscaleptr()
  %op = load <2 x float>, <2 x float>* %opptr, align 4
  %scale = load float, float* %scaleptr, align 16
  %t1 = insertelement <2 x float> undef, float %scale, i32 0
  %t2 = insertelement <2 x float> %t1, float %scale, i32 1
  %t3 = fmul <2 x float> %op, %t2
  %t4 = extractelement <2 x float> %t3, i32 0
  %result0 = insertelement <2 x float> undef, float %t4, i32 0
  %t5 = extractelement <2 x float> %t3, i32 1
  %result1 = insertelement <2 x float> %result0, float %t5, i32 1
  store <2 x float> %result1, <2 x float>* %resultptr, align 8
  ret void
}

define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <2 x float>, <2 x float>* %p, align 4
  %s = extractelement <2 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16
; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT:    ret <4 x float> [[R]]
;
  %l = load <8 x float>, <8 x float>* %p, align 4
  %s = extractelement <8 x float> %l, i32 0
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(<1 x i32>* align 16 dereferenceable(16) %p, <1 x i32>* %store_ptr) nofree nosync {
; CHECK-LABEL: @load_v1i32_extract_insert_v8i32_extra_use(
; CHECK-NEXT:    [[L:%.*]] = load <1 x i32>, <1 x i32>* [[P:%.*]], align 4
; CHECK-NEXT:    store <1 x i32> [[L]], <1 x i32>* [[STORE_PTR:%.*]], align 4
; CHECK-NEXT:    [[S:%.*]] = extractelement <1 x i32> [[L]], i32 0
; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT:    ret <8 x i32> [[R]]
;
  %l = load <1 x i32>, <1 x i32>* %p, align 4
  store <1 x i32> %l, <1 x i32>* %store_ptr
  %s = extractelement <1 x i32> %l, i32 0
  %r = insertelement <8 x i32> undef, i32 %s, i32 0
  ret <8 x i32> %r
}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.
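; (The scalar element is at byte offset 4, which is element 2 of the widened
; <8 x i16> load that AVX2 uses.)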

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) nofree nosync {
; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1
; SSE2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0
; SSE2-NEXT:    [[S:%.*]] = load i16, i16* [[TMP1]], align 8
; SSE2-NEXT:    [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT:    ret <8 x i16> [[R]]
;
; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; AVX2-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>*
; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4
; AVX2-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT:    ret <8 x i16> [[R]]
;
  %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
  %l = load <2 x i16>, <2 x i16>* %gep, align 8
  %s = extractelement <2 x i16> %l, i32 0
  %r = insertelement <8 x i16> undef, i16 %s, i64 0
  ret <8 x i16> %r
}

; PR30986 - split vector loads for scalarized operations
define <2 x i64> @PR30986(<2 x i64>* %0) {
; CHECK-LABEL: @PR30986(
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0:%.*]], i32 0, i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 16
; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <2 x i64>, <2 x i64>* [[TMP0]], i32 0, i32 1
; CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = tail call i64 @llvm.ctpop.i64(i64 [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %2 = load <2 x i64>, <2 x i64>* %0, align 16
  %3 = extractelement <2 x i64> %2, i32 0
  %4 = tail call i64 @llvm.ctpop.i64(i64 %3)
  %5 = insertelement <2 x i64> undef, i64 %4, i32 0
  %6 = extractelement <2 x i64> %2, i32 1
  %7 = tail call i64 @llvm.ctpop.i64(i64 %6)
  %8 = insertelement <2 x i64> %5, i64 %7, i32 1
  ret <2 x i64> %8
}
declare i64 @llvm.ctpop.i64(i64)