; test/CodeGen/X86/2009-07-16-LoadFoldingBug.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s

; CHECK: _foo:
; CHECK: pavgw LCPI1_4(%rip)

; rdar://7057804

   8 define void @foo(i16* %out8x8, i16* %in8x8, i32 %lastrow) optsize ssp {
   9 entry:
  10         %0 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 6518, i16 6518, i16 6518, i16 6518, i16 6518, i16 6518, i16 6518, i16 6518>, <8 x i16> undef) nounwind readnone               ; <<8 x i16>> [#uses=2]
  11         %1 = call <8 x i16> @llvm.x86.sse2.pcmpeq.w(<8 x i16> %0, <8 x i16> <i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384>) nounwind readnone         ; <<8 x i16>> [#uses=1]
  12         %2 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> zeroinitializer, i32 14) nounwind readnone         ; <<8 x i16>> [#uses=1]
  13         %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %2, <8 x i16> zeroinitializer) nounwind readnone            ; <<8 x i16>> [#uses=1]
  14         %tmp.i.i10 = add <8 x i16> %0, %3               ; <<8 x i16>> [#uses=1]
  15         %4 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> zeroinitializer, <8 x i16> %1) nounwind readnone           ; <<8 x i16>> [#uses=1]
  16         %5 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %tmp.i.i10, <8 x i16> %4) nounwind readnone                ; <<8 x i16>> [#uses=3]
  17         %6 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %5, <8 x i16> undef) nounwind readnone             ; <<8 x i16>> [#uses=1]
  18         %7 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 6518, i16 6518, i16 6518, i16 6518, i16 6518, i16 6518, i16 6518, i16 6518>, <8 x i16> undef) nounwind readnone               ; <<8 x i16>> [#uses=2]
  19         %8 = call <8 x i16> @llvm.x86.sse2.pcmpeq.w(<8 x i16> %7, <8 x i16> <i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384>) nounwind readnone         ; <<8 x i16>> [#uses=1]
  20         %9 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> zeroinitializer, i32 14) nounwind readnone         ; <<8 x i16>> [#uses=1]
  21         %10 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %9, <8 x i16> zeroinitializer) nounwind readnone           ; <<8 x i16>> [#uses=1]
  22         %tmp.i.i8 = add <8 x i16> %7, %10               ; <<8 x i16>> [#uses=1]
  23         %11 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> undef, <8 x i16> %8) nounwind readnone            ; <<8 x i16>> [#uses=1]
  24         %12 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %tmp.i.i8, <8 x i16> %11) nounwind readnone               ; <<8 x i16>> [#uses=1]
  25         %13 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> undef, <8 x i16> undef) nounwind readnone         ; <<8 x i16>> [#uses=1]
  26         %14 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %5, <8 x i16> undef) nounwind readnone            ; <<8 x i16>> [#uses=1]
  27         %15 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %5, <8 x i16> undef) nounwind readnone            ; <<8 x i16>> [#uses=1]
  28         %16 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %6, <8 x i16> undef) nounwind readnone            ; <<8 x i16>> [#uses=1]
  29         %17 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %12, <8 x i16> undef) nounwind readnone           ; <<8 x i16>> [#uses=1]
  30         %18 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %13, <8 x i16> %15) nounwind readnone             ; <<8 x i16>> [#uses=1]
  31         %19 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> undef, <8 x i16> %14) nounwind readnone           ; <<8 x i16>> [#uses=2]
  32         %20 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> undef, <8 x i16> undef) nounwind readnone         ; <<8 x i16>> [#uses=4]
  33         %21 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> undef, <8 x i16> %17) nounwind readnone           ; <<8 x i16>> [#uses=1]
  34         %22 = bitcast <8 x i16> %21 to <2 x i64>                ; <<2 x i64>> [#uses=1]
  35         %23 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170>, <8 x i16> undef) nounwind readnone              ; <<8 x i16>> [#uses=2]
  36         %24 = call <8 x i16> @llvm.x86.sse2.pcmpeq.w(<8 x i16> %23, <8 x i16> <i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384>) nounwind readnone               ; <<8 x i16>> [#uses=1]
  37         %25 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> zeroinitializer, i32 14) nounwind readnone                ; <<8 x i16>> [#uses=1]
  38         %26 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %25, <8 x i16> zeroinitializer) nounwind readnone          ; <<8 x i16>> [#uses=1]
  39         %tmp.i.i6 = add <8 x i16> %23, %26              ; <<8 x i16>> [#uses=1]
  40         %27 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> undef, <8 x i16> %24) nounwind readnone           ; <<8 x i16>> [#uses=1]
  41         %28 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %tmp.i.i6, <8 x i16> %27) nounwind readnone               ; <<8 x i16>> [#uses=1]
  42         %29 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170>, <8 x i16> undef) nounwind readnone              ; <<8 x i16>> [#uses=2]
  43         %30 = call <8 x i16> @llvm.x86.sse2.pcmpeq.w(<8 x i16> %29, <8 x i16> <i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384>) nounwind readnone               ; <<8 x i16>> [#uses=1]
  44         %31 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> zeroinitializer, i32 14) nounwind readnone                ; <<8 x i16>> [#uses=1]
  45         %32 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %31, <8 x i16> zeroinitializer) nounwind readnone          ; <<8 x i16>> [#uses=1]
  46         %tmp.i.i4 = add <8 x i16> %29, %32              ; <<8 x i16>> [#uses=1]
  47         %33 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> undef, <8 x i16> %30) nounwind readnone           ; <<8 x i16>> [#uses=1]
  48         %34 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %tmp.i.i4, <8 x i16> %33) nounwind readnone               ; <<8 x i16>> [#uses=1]
  49         %35 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170>, <8 x i16> %20) nounwind readnone                ; <<8 x i16>> [#uses=2]
  50         %tmp.i2.i1 = mul <8 x i16> %20, <i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170, i16 23170>                ; <<8 x i16>> [#uses=1]
  51         %36 = call <8 x i16> @llvm.x86.sse2.pcmpeq.w(<8 x i16> %35, <8 x i16> <i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384>) nounwind readnone               ; <<8 x i16>> [#uses=1]
  52         %37 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %tmp.i2.i1, i32 14) nounwind readnone             ; <<8 x i16>> [#uses=1]
  53         %38 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %37, <8 x i16> zeroinitializer) nounwind readnone          ; <<8 x i16>> [#uses=1]
  54         %tmp.i.i2 = add <8 x i16> %35, %38              ; <<8 x i16>> [#uses=1]
  55         %39 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %19, <8 x i16> %36) nounwind readnone             ; <<8 x i16>> [#uses=1]
  56         %40 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %tmp.i.i2, <8 x i16> %39) nounwind readnone               ; <<8 x i16>> [#uses=1]
  57         %41 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> <i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170>, <8 x i16> %20) nounwind readnone                ; <<8 x i16>> [#uses=2]
  58         %tmp.i2.i = mul <8 x i16> %20, <i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170>         ; <<8 x i16>> [#uses=1]
  59         %42 = call <8 x i16> @llvm.x86.sse2.pcmpeq.w(<8 x i16> %41, <8 x i16> <i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384, i16 16384>) nounwind readnone               ; <<8 x i16>> [#uses=1]
  60         %43 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %tmp.i2.i, i32 14) nounwind readnone              ; <<8 x i16>> [#uses=1]
  61         %44 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %43, <8 x i16> zeroinitializer) nounwind readnone          ; <<8 x i16>> [#uses=1]
  62         %tmp.i.i = add <8 x i16> %41, %44               ; <<8 x i16>> [#uses=1]
  63         %45 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %19, <8 x i16> %42) nounwind readnone             ; <<8 x i16>> [#uses=1]
  64         %46 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %tmp.i.i, <8 x i16> %45) nounwind readnone                ; <<8 x i16>> [#uses=1]
  65         %47 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %18, <8 x i16> %16) nounwind readnone             ; <<8 x i16>> [#uses=1]
  66         %48 = bitcast <8 x i16> %47 to <2 x i64>                ; <<2 x i64>> [#uses=1]
  67         %49 = bitcast <8 x i16> %28 to <2 x i64>                ; <<2 x i64>> [#uses=1]
  68         %50 = getelementptr i16* %out8x8, i64 8         ; <i16*> [#uses=1]
  69         %51 = bitcast i16* %50 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
  70         store <2 x i64> %49, <2 x i64>* %51, align 16
  71         %52 = bitcast <8 x i16> %40 to <2 x i64>                ; <<2 x i64>> [#uses=1]
  72         %53 = getelementptr i16* %out8x8, i64 16                ; <i16*> [#uses=1]
  73         %54 = bitcast i16* %53 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
  74         store <2 x i64> %52, <2 x i64>* %54, align 16
  75         %55 = getelementptr i16* %out8x8, i64 24                ; <i16*> [#uses=1]
  76         %56 = bitcast i16* %55 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
  77         store <2 x i64> %48, <2 x i64>* %56, align 16
  78         %57 = bitcast <8 x i16> %46 to <2 x i64>                ; <<2 x i64>> [#uses=1]
  79         %58 = getelementptr i16* %out8x8, i64 40                ; <i16*> [#uses=1]
  80         %59 = bitcast i16* %58 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
  81         store <2 x i64> %57, <2 x i64>* %59, align 16
  82         %60 = bitcast <8 x i16> %34 to <2 x i64>                ; <<2 x i64>> [#uses=1]
  83         %61 = getelementptr i16* %out8x8, i64 48                ; <i16*> [#uses=1]
  84         %62 = bitcast i16* %61 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
  85         store <2 x i64> %60, <2 x i64>* %62, align 16
  86         %63 = getelementptr i16* %out8x8, i64 56                ; <i16*> [#uses=1]
  87         %64 = bitcast i16* %63 to <2 x i64>*            ; <<2 x i64>*> [#uses=1]
  88         store <2 x i64> %22, <2 x i64>* %64, align 16
  89         ret void
  90 }
  91
  92 declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
  93
  94 declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
  95
  96 declare <8 x i16> @llvm.x86.sse2.pcmpeq.w(<8 x i16>, <8 x i16>) nounwind readnone
  97
  98 declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
  99
 100 declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
 101
 102 declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone