1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -passes=slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefix=SSE
3 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -passes=slp-vectorizer -S -mattr=+avx | FileCheck %s --check-prefix=AVX
4 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -passes=slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=AVX
6 ; Verify that the SLP vectorizer recognizes that commutativity makes it
7 ; possible to splat/broadcast %c, which in turn makes it profitable
8 ; to vectorize this case.
; Destination arrays stored to by the test functions in this file: 32 x i8 and
; 32 x i32, both declared external with 16-byte alignment.
10 @cle = external unnamed_addr global [32 x i8], align 16
11 @cle32 = external unnamed_addr global [32 x i32], align 16
14 ; Check that we correctly detect a splat/broadcast by leveraging the
15 ; commutativity property of `xor`.
; Expected result (identical for the sse2 and the avx/avx2 runs): a single
; <16 x i8> xor whose first operand is a shuffle of %a and %b (lanes 5 and 7
; take %b, every other lane takes %a) and whose second operand is a broadcast
; of %c, followed by one 16-byte-aligned vector store to @cle.
17 define void @splat(i8 %a, i8 %b, i8 %c) {
19 ; SSE-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
20 ; SSE-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
21 ; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
22 ; SSE-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
23 ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
24 ; SSE-NEXT: [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
25 ; SSE-NEXT: store <16 x i8> [[TMP6]], ptr @cle, align 16
29 ; AVX-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i32 0
30 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[B:%.*]], i32 1
31 ; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
32 ; AVX-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> poison, i8 [[C:%.*]], i32 0
33 ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <16 x i8> [[TMP4]], <16 x i8> poison, <16 x i32> zeroinitializer
34 ; AVX-NEXT: [[TMP6:%.*]] = xor <16 x i8> [[TMP3]], [[TMP5]]
35 ; AVX-NEXT: store <16 x i8> [[TMP6]], ptr @cle, align 16
; Scalar input: sixteen i8 stores to consecutive elements 0 through 15 of
; @cle. Only the first store carries an explicit alignment (16).
39 store i8 %1, ptr @cle, align 16
41 store i8 %2, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 1)
43 store i8 %3, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 2)
45 store i8 %4, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 3)
47 store i8 %5, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 4)
49 store i8 %6, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 5)
51 store i8 %7, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 6)
53 store i8 %8, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 7)
55 store i8 %9, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 8)
57 store i8 %10, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 9)
59 store i8 %11, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 10)
61 store i8 %12, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 11)
63 store i8 %13, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 12)
65 store i8 %14, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 13)
67 store i8 %15, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 14)
69 store i8 %16, ptr getelementptr inbounds ([32 x i8], ptr @cle, i64 0, i64 15)
73 ; Check that we correctly detect that the `xor` operands can be reordered so that
74 ; the same opcode (`add`) is on one side, by leveraging the commutativity of `xor`.
; Expected result differs per run configuration:
;  - sse2 run: the code stays scalar (four adds, four xors, four i32 stores).
;  - avx/avx2 runs: one <4 x i32> add of broadcast %c and broadcast %a, xor-ed
;    with the vector <%a, %b, %c, %a>, then a single vector store to @cle32.
76 define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
77 ; SSE-LABEL: @same_opcode_on_one_side(
78 ; SSE-NEXT: [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]]
79 ; SSE-NEXT: [[ADD2:%.*]] = add i32 [[C]], [[A]]
80 ; SSE-NEXT: [[ADD3:%.*]] = add i32 [[A]], [[C]]
81 ; SSE-NEXT: [[ADD4:%.*]] = add i32 [[C]], [[A]]
82 ; SSE-NEXT: [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]]
83 ; SSE-NEXT: store i32 [[TMP1]], ptr @cle32, align 16
84 ; SSE-NEXT: [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]]
85 ; SSE-NEXT: store i32 [[TMP2]], ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 1), align 4
86 ; SSE-NEXT: [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]]
87 ; SSE-NEXT: store i32 [[TMP3]], ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 2), align 4
88 ; SSE-NEXT: [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]]
89 ; SSE-NEXT: store i32 [[TMP4]], ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 3), align 4
92 ; AVX-LABEL: @same_opcode_on_one_side(
93 ; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
94 ; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
95 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
96 ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
97 ; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
98 ; AVX-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 poison, i32 4, i32 0>
99 ; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[B:%.*]], i32 1
100 ; AVX-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP5]], [[TMP7]]
101 ; AVX-NEXT: store <4 x i32> [[TMP8]], ptr @cle32, align 16
; Scalar input: four adds of %a and %c (the third with its operands swapped),
; each feeding one xor.
104 %add1 = add i32 %c, %a
105 %add2 = add i32 %c, %a
106 %add3 = add i32 %a, %c
107 %add4 = add i32 %c, %a
; The add result is the left xor operand in the first xor and the right
; operand in the other three, so the operands must be commuted to line up
; all add results in one vector operand.
108 %1 = xor i32 %add1, %a
109 store i32 %1, ptr @cle32, align 16
110 %2 = xor i32 %b, %add2
111 store i32 %2, ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 1)
112 %3 = xor i32 %c, %add3
113 store i32 %3, ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 2)
114 %4 = xor i32 %a, %add4
115 store i32 %4, ptr getelementptr inbounds ([32 x i32], ptr @cle32, i64 0, i64 3)