; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; Verify that the SLP vectorizer is able to figure out that commutativity
; offers the possibility to splat/broadcast %c and thus make it profitable
; to vectorize this case.
@cle = external unnamed_addr global [32 x i8], align 16
@cle32 = external unnamed_addr global [32 x i32], align 16
; Check that we correctly detect a splat/broadcast by leveraging the
; commutativity property of `xor`.
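;
; A minimal sketch of the expected shape (illustrative only; the value names
; below are made up and do not appear in the checks): because `xor` is
; commutative, every scalar xor can be treated as having %c in the same
; operand position, so that operand collapses into a single broadcast:
;   %c.ins   = insertelement <16 x i8> poison, i8 %c, i32 0
;   %c.splat = shufflevector <16 x i8> %c.ins, <16 x i8> poison, <16 x i32> zeroinitializer
;   %res     = xor <16 x i8> %c.splat, %other.lanes   ; one vector xor instead of 16 scalar ones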
define void @splat(i8 %a, i8 %b, i8 %c) {
; SSE-LABEL: @splat(
; SSE-NEXT: [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
; SSE-NEXT: store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
; SSE-NEXT: [[TMP2:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1), align 1
; SSE-NEXT: [[TMP3:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2), align 1
; SSE-NEXT: [[TMP4:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3), align 1
; SSE-NEXT: [[TMP5:%.*]] = xor i8 [[C]], [[A]]
; SSE-NEXT: store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4), align 1
; SSE-NEXT: [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
; SSE-NEXT: store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5), align 1
; SSE-NEXT: [[TMP7:%.*]] = xor i8 [[C]], [[A]]
; SSE-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1
; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]]
; SSE-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1
; SSE-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1
; SSE-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1
; SSE-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1
; SSE-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1
; SSE-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1
; SSE-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1
; SSE-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1
; SSE-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]]
; SSE-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1
; SSE-NEXT: ret void
;
; AVX-LABEL: @splat(
; AVX-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer
; AVX-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[B:%.*]], i32 1
; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; AVX-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[SHUFFLE]], [[SHUFFLE1]]
; AVX-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
; AVX-NEXT: ret void
;
  %1 = xor i8 %c, %a
  store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
  %2 = xor i8 %a, %c
  store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
  %3 = xor i8 %a, %c
  store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
  %4 = xor i8 %a, %c
  store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
  %5 = xor i8 %c, %a
  store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
  %6 = xor i8 %c, %b
  store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
  %7 = xor i8 %c, %a
  store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
  %8 = xor i8 %c, %b
  store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
  %9 = xor i8 %a, %c
  store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
  %10 = xor i8 %a, %c
  store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
  %11 = xor i8 %a, %c
  store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
  %12 = xor i8 %a, %c
  store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
  %13 = xor i8 %a, %c
  store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
  %14 = xor i8 %a, %c
  store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
  %15 = xor i8 %a, %c
  store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
  %16 = xor i8 %a, %c
  store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
  ret void
}
; Check that we correctly detect that we can have the same opcode on one side by
; leveraging the commutativity property of `xor`.
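;
; A minimal sketch of the expected shape (illustrative only; the value names
; below are made up and do not appear in the checks): since `add` is also
; commutative, all four adds compute the same %c + %a and can be lined up on
; one side of the vector xor, while the other side gathers the remaining scalars:
;   %adds = add <4 x i32> %c.splat, %a.splat           ; four identical adds become one
;   %res  = xor <4 x i32> %adds, %gather.a.b.c.a       ; gather of %a, %b, %c, %a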
define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
; SSE-LABEL: @same_opcode_on_one_side(
; SSE-NEXT: [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]]
; SSE-NEXT: [[ADD2:%.*]] = add i32 [[C]], [[A]]
; SSE-NEXT: [[ADD3:%.*]] = add i32 [[A]], [[C]]
; SSE-NEXT: [[ADD4:%.*]] = add i32 [[C]], [[A]]
; SSE-NEXT: [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]]
; SSE-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
; SSE-NEXT: [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]]
; SSE-NEXT: store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1), align 4
; SSE-NEXT: [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]]
; SSE-NEXT: store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2), align 4
; SSE-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]]
; SSE-NEXT: store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3), align 4
; SSE-NEXT: ret void
;
; AVX-LABEL: @same_opcode_on_one_side(
; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
; AVX-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
; AVX-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; AVX-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
; AVX-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 1
; AVX-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[C]], i32 2
; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 3
; AVX-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], [[TMP6]]
; AVX-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
; AVX-NEXT: ret void
;
  %add1 = add i32 %c, %a
  %add2 = add i32 %c, %a
  %add3 = add i32 %a, %c
  %add4 = add i32 %c, %a
  %1 = xor i32 %add1, %a
  store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
  %2 = xor i32 %b, %add2
  store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
  %3 = xor i32 %c, %add3
  store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
  %4 = xor i32 %a, %add4
  store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
  ret void
}