llvm/test/CodeGen/X86/load-slice.ll

   1 ; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
   2 ; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
   3 ;
   4 ; <rdar://problem/14477220>
   5
   6 %class.Complex = type { float, float }
   7
   8
   9 ; Check that independent slices lead to independent loads and that the slices
  10 ; end up in different register files.
  11 ;
  12 ; The layout is:
  13 ; LSB 0 1 2 3 | 4 5 6 7 MSB
  14 ;       Low      High
  15 ; The base address points to 0 and is 8-bytes aligned.
  16 ; Low slice starts at 0 (base) and is 8-bytes aligned.
  17 ; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
  18 ;
  19 ; STRESS-LABEL: _t1:
  20 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
  21 ; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
  22 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
  23 ; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
  24 ; Add low slice: out[out_start].real, this is base + 0.
  25 ; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
  26 ; Add high slice: out[out_start].imm, this is base + 4.
  27 ; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
  28 ; Pack Real (lane 0) and Imm (lane 1) into a single vector.
  29 ; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
  30 ; Put the results back into out[out_start].
  31 ; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
  32 ;
  33 ; Same for REGULAR, we eliminate the register bank copy for each slice.
  34 ; REGULAR-LABEL: _t1:
  35 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
  36 ; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
  37 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
  38 ; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
  39 ; Add low slice: out[out_start].real, this is base + 0.
  40 ; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
  41 ; Add high slice: out[out_start].imm, this is base + 4.
  42 ; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
  43 ; Pack Real (lane 0) and Imm (lane 1) into a single vector.
  44 ; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
  45 ; Put the results back into out[out_start].
  46 ; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
  47 define void @t1(ptr nocapture %out, i64 %out_start) { ; out[out_start] += out[out_start + 8], field by field, on %class.Complex elements
  48 entry:
  49   %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start ; &out[out_start]
  50   %tmp1 = load i64, ptr %arrayidx, align 8 ; both float fields fetched as one 64-bit integer; this is the load to be sliced
  51   %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32 ; low slice: bits 0-31
  52   %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float ; reinterpret the low slice as the first float field
  53   %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32 ; bring bits 32-63 down to bit 0
  54   %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 ; high slice: bits 32-63
  55   %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float ; reinterpret the high slice as the second float field
  56   %add = add i64 %out_start, 8 ; index of the other operand, 8 elements further
  57   %arrayidx2 = getelementptr inbounds %class.Complex, ptr %out, i64 %add ; &out[out_start + 8]
  58   %tmp4 = load float, ptr %arrayidx2, align 4 ; first field of out[out_start + 8]
  59   %add.i = fadd float %tmp4, %tmp2 ; sum of the first fields
  60   %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0 ; lane 0 = first-field sum
  61   %r.i = getelementptr inbounds %class.Complex, ptr %arrayidx2, i64 0, i32 1 ; address of the second field of out[out_start + 8]
  62   %tmp5 = load float, ptr %r.i, align 4 ; second field of out[out_start + 8]
  63   %add5.i = fadd float %tmp5, %tmp3 ; sum of the second fields
  64   %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1 ; lane 1 = second-field sum
  65   store <2 x float> %retval.sroa.0.4.vec.insert.i, ptr %arrayidx, align 4 ; write both sums back over out[out_start]
  66   ret void
  67 }
  68
  69 ; Function Attrs: nounwind
  70 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
  71
  72 ; Function Attrs: nounwind
  73 declare void @llvm.lifetime.start.p0(i64, ptr nocapture)
  74
  75 ; Function Attrs: nounwind
  76 declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
  77
  78 ; Check that we do not read outside of the chunk of bits of the original loads.
  79 ;
  80 ; The 64 bits should have been split into one 32-bit and one 16-bit slice.
  81 ; The 16-bit slice should be zero-extended to match the final type.
  82 ;
  83 ; The memory layout is:
  84 ; LSB 0 1 2 3 | 4 5 | 6 7 MSB
  85 ;      Low            High
  86 ; The base address points to 0 and is 8-bytes aligned.
  87 ; Low slice starts at 0 (base) and is 8-bytes aligned.
  88 ; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
  89 ;
  90 ; STRESS-LABEL: _t2:
  91 ; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
  92 ; STRESS-NEXT: addl ([[BASE]]), %eax
  93 ; STRESS-NEXT: ret
  94 ;
  95 ; With the REGULAR heuristic, it is not profitable to slice things that are not
  96 ; next to each other in memory. Here we have a hole at bytes #4-5.
  97 ; REGULAR-LABEL: _t2:
  98 ; REGULAR: shrq $48
  99 define i32 @t2(ptr nocapture %out, i64 %out_start) { ; returns bits 0-31 plus zero-extended bits 48-63 of the i64 at out[out_start]
 100   %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start ; &out[out_start]
 101   %chunk64 = load i64, ptr %arrayidx, align 8 ; the single 64-bit load both slices derive from
 102   %slice32_low = trunc i64 %chunk64 to i32 ; low slice: bits 0-31 (bytes 0-3)
 103   %shift48 = lshr i64 %chunk64, 48 ; logical shift: bits 48-63 land in the low 16 bits, zeros above
 104   %slice32_high = trunc i64 %shift48 to i32 ; high slice: bytes 6-7, already zero-extended to 32 bits
 105   %res = add i32 %slice32_high, %slice32_low ; bytes 4-5 of the chunk are never used
 106   ret i32 %res
 107 }
 108
 109 ; Check that we do not optimize overlapping slices.
 110 ;
 111 ; The 64 bits should NOT have been split, as the slices overlap.
 112 ; First slice uses bytes numbered 0 to 3.
 113 ; Second slice uses bytes numbered 6 and 7.
 114 ; Third slice uses bytes numbered 4 to 7.
 115 ;
 116 ; STRESS-LABEL: _t3:
 117 ; STRESS: shrq $48
 118 ; STRESS: shrq $32
 119 ;
 120 ; REGULAR-LABEL: _t3:
 121 ; REGULAR: shrq $48
 122 ; REGULAR: shrq $32
 123 define i32 @t3(ptr nocapture %out, i64 %out_start) { ; sums three slices of one i64 load, two of which overlap
 124   %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start ; &out[out_start]
 125   %chunk64 = load i64, ptr %arrayidx, align 8 ; the single 64-bit load all three slices derive from
 126   %slice32_low = trunc i64 %chunk64 to i32 ; first slice: bits 0-31 (bytes 0-3)
 127   %shift48 = lshr i64 %chunk64, 48 ; bring bits 48-63 down to bit 0
 128   %slice32_high = trunc i64 %shift48 to i32 ; second slice: bytes 6-7
 129   %shift32 = lshr i64 %chunk64, 32 ; bring bits 32-63 down to bit 0
 130   %slice32_lowhigh = trunc i64 %shift32 to i32 ; third slice: bytes 4-7, which overlaps the second slice
 131   %tmpres = add i32 %slice32_high, %slice32_low ; second + first
 132   %res = add i32 %slice32_lowhigh, %tmpres ; add the overlapping third slice
 133   ret i32 %res
 134 }