; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; Verify that the SLP vectorizer is able to figure out that commutativity
; offers the possibility to splat/broadcast %c and thus make it profitable
; to vectorize this case.
@cle = external unnamed_addr global [32 x i8], align 16
@cle32 = external unnamed_addr global [32 x i32], align 16
; Check that we correctly detect a splat/broadcast by leveraging the
; commutativity property of `xor`.
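;
; A minimal sketch of the expected shape (illustrative only; the value names
; below are made up and do not appear in the checks): because `xor` is
; commutative, every scalar xor can be treated as having %c in the same
; operand position, so that operand collapses into a single broadcast:
;   %c.ins   = insertelement <16 x i8> poison, i8 %c, i32 0
;   %c.splat = shufflevector <16 x i8> %c.ins, <16 x i8> poison, <16 x i32> zeroinitializer
;   %res     = xor <16 x i8> %c.splat, %other.lanes   ; one vector xor instead of 16 scalar ones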
define void @splat(i8 %a, i8 %b, i8 %c) {
; SSE-LABEL: @splat(
; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
; SSE-NEXT: store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1
; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1
; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1
; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]]
; SSE-NEXT: store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1
; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
; SSE-NEXT: store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1
; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]]
; SSE-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1
; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]]
; SSE-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1
; SSE-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1
; SSE-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1
; SSE-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1
; SSE-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1
; SSE-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1
; SSE-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1
; SSE-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1
; SSE-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1
; SSE-NEXT: ret void
;
; AVX-LABEL: @splat(
; AVX-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
; AVX-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[B:%.*]], i32 1
; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; AVX-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
; AVX-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
; AVX-NEXT: ret void
;
  %1 = xor i8 %c, %a
  store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
  %2 = xor i8 %a, %c
  store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
  %3 = xor i8 %a, %c
  store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
  %4 = xor i8 %a, %c
  store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
  %5 = xor i8 %c, %a
  store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
  %6 = xor i8 %c, %b
  store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
  %7 = xor i8 %c, %a
  store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
  %8 = xor i8 %c, %b
  store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
  %9 = xor i8 %a, %c
  store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
  %10 = xor i8 %a, %c
  store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
  %11 = xor i8 %a, %c
  store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
  %12 = xor i8 %a, %c
  store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
  %13 = xor i8 %a, %c
  store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
  %14 = xor i8 %a, %c
  store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
  %15 = xor i8 %a, %c
  store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
  %16 = xor i8 %a, %c
  store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
  ret void
}
; Check that we correctly detect that we can have the same opcode on one side by
; leveraging the commutativity property of `xor`.
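;
; A minimal sketch of the expected shape (illustrative only; the value names
; below are made up and do not appear in the checks): since `add` is also
; commutative, all four adds compute the same %c + %a and can be lined up on
; one side of the vector xor, while the other side gathers the remaining scalars:
;   %adds = add <4 x i32> %c.splat, %a.splat           ; four identical adds become one
;   %res  = xor <4 x i32> %adds, %gather.a.b.c.a       ; gather of %a, %b, %c, %a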
define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
; SSE-LABEL: @same_opcode_on_one_side(
; SSE-NEXT: [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]]
; SSE-NEXT: [[ADD2:%.*]] = add i32 [[C]], [[A]]
; SSE-NEXT: [[ADD3:%.*]] = add i32 [[A]], [[C]]
; SSE-NEXT: [[ADD4:%.*]] = add i32 [[C]], [[A]]
; SSE-NEXT: [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]]
; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
; SSE-NEXT: [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]]
; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1), align 4
; SSE-NEXT: [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]]
; SSE-NEXT: store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2), align 4
; SSE-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]]
; SSE-NEXT: store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @same_opcode_on_one_side(
; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; AVX-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 1
; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C]], i32 2
; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 3
; AVX-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], [[TMP6]]
; AVX-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
; AVX-NEXT: ret void
;
  %add1 = add i32 %c, %a
  %add2 = add i32 %c, %a
  %add3 = add i32 %a, %c
  %add4 = add i32 %c, %a
  %1 = xor i32 %add1, %a
  store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
  %2 = xor i32 %b, %add2
  store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
  %3 = xor i32 %c, %add3
  store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
  %4 = xor i32 %a, %add4
  store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
  ret void
}