; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64 -S | FileCheck %s --check-prefixes=SSE2
; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v2 -S | FileCheck %s --check-prefixes=SSE42
; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX2
; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX512
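; Both tests below feed a scalar AND-reduction of four <4 x i32> vectors plus a
; scalar accumulator to the SLP vectorizer. The checks expect the extractelement/and
; chains to be rewritten into @llvm.vector.reduce.and calls: a single <16 x i32>
; reduction for the SSE2/SSE42/AVX2 configurations and two <8 x i32> reductions
; for AVX512, with the accumulator folded in by a trailing scalar 'and'.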
; typedef int v4si __attribute__ ((vector_size (16)));
;
; inline int reduce_and4(int acc, v4si v1, v4si v2, v4si v3, v4si v4) {
;   acc &= v1[0] & v1[1] & v1[2] & v1[3];
;   acc &= v2[0] & v2[1] & v2[2] & v2[3];
;   acc &= v3[0] & v3[1] & v3[2] & v3[3];
;   acc &= v4[0] & v4[1] & v4[2] & v4[3];
;   return acc;
; }
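; In this variant every statement reduces all four lanes of a single vector, so
; the scalar 'and' chain below walks v1..v4 one whole vector at a time. Because
; 'and' is associative and commutative, the vectorizer is free to regroup the
; chain into one wide reduction regardless of that order.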
define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
; SSE2-LABEL: @reduce_and4(
; SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; SSE2-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]])
; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP5]], [[ACC:%.*]]
; SSE2-NEXT:    ret i32 [[OP_RDX]]
;
; SSE42-LABEL: @reduce_and4(
; SSE42-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; SSE42-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]])
; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP5]], [[ACC:%.*]]
; SSE42-NEXT:    ret i32 [[OP_RDX]]
;
; AVX2-LABEL: @reduce_and4(
; AVX2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP2]])
; AVX2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[ACC:%.*]]
; AVX2-NEXT:    ret i32 [[OP_RDX]]
;
; AVX512-LABEL: @reduce_and4(
; AVX512-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
; AVX512-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
; AVX512-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
; AVX512-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
; AVX512-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
; AVX512-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
; AVX512-NEXT:    ret i32 [[OP_RDX1]]
  %vecext = extractelement <4 x i32> %v1, i64 0
  %vecext1 = extractelement <4 x i32> %v1, i64 1
  %vecext2 = extractelement <4 x i32> %v1, i64 2
  %vecext4 = extractelement <4 x i32> %v1, i64 3
  %vecext7 = extractelement <4 x i32> %v2, i64 0
  %vecext8 = extractelement <4 x i32> %v2, i64 1
  %vecext10 = extractelement <4 x i32> %v2, i64 2
  %vecext12 = extractelement <4 x i32> %v2, i64 3
  %vecext15 = extractelement <4 x i32> %v3, i64 0
  %vecext16 = extractelement <4 x i32> %v3, i64 1
  %vecext18 = extractelement <4 x i32> %v3, i64 2
  %vecext20 = extractelement <4 x i32> %v3, i64 3
  %vecext23 = extractelement <4 x i32> %v4, i64 0
  %vecext24 = extractelement <4 x i32> %v4, i64 1
  %vecext26 = extractelement <4 x i32> %v4, i64 2
  %vecext28 = extractelement <4 x i32> %v4, i64 3
  %and25 = and i32 %vecext1, %acc
  %and27 = and i32 %and25, %vecext
  %and29 = and i32 %and27, %vecext2
  %and17 = and i32 %and29, %vecext4
  %and19 = and i32 %and17, %vecext8
  %and21 = and i32 %and19, %vecext7
  %and9 = and i32 %and21, %vecext10
  %and11 = and i32 %and9, %vecext12
  %and13 = and i32 %and11, %vecext16
  %and = and i32 %and13, %vecext15
  %and3 = and i32 %and, %vecext18
  %and5 = and i32 %and3, %vecext20
  %and6 = and i32 %and5, %vecext24
  %and14 = and i32 %and6, %vecext23
  %and22 = and i32 %and14, %vecext26
  %and30 = and i32 %and22, %vecext28
  ret i32 %and30
}

; int reduce_and4_transpose(int acc, v4si v1, v4si v2, v4si v3, v4si v4) {
;   acc &= v1[0] & v2[0] & v3[0] & v4[0];
;   acc &= v1[1] & v2[1] & v3[1] & v4[1];
;   acc &= v1[2] & v2[2] & v3[2] & v4[2];
;   acc &= v1[3] & v2[3] & v3[3] & v4[3];
;   return acc;
; }
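; Here the reduction is "transposed": each statement combines the same lane of all
; four vectors, so consecutive scalar 'and's below pull from different source
; vectors. The expected vectorized form matches reduce_and4 (the SSE2/SSE42 check
; lines are identical); on AVX2/AVX512 only the lane order in the feeding shuffles
; differs.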
define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
; SSE2-LABEL: @reduce_and4_transpose(
; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; SSE2-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP5]])
; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP6]], [[ACC:%.*]]
; SSE2-NEXT:    ret i32 [[OP_RDX]]
;
; SSE42-LABEL: @reduce_and4_transpose(
; SSE42-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE42-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; SSE42-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP5]])
; SSE42-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP6]], [[ACC:%.*]]
; SSE42-NEXT:    ret i32 [[OP_RDX]]
;
; AVX2-LABEL: @reduce_and4_transpose(
; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
; AVX2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP4]], [[ACC:%.*]]
; AVX2-NEXT:    ret i32 [[OP_RDX]]
;
; AVX512-LABEL: @reduce_and4_transpose(
; AVX512-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; AVX512-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
; AVX512-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; AVX512-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
; AVX512-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
; AVX512-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
; AVX512-NEXT:    ret i32 [[OP_RDX1]]
  %vecext = extractelement <4 x i32> %v1, i64 0
  %vecext1 = extractelement <4 x i32> %v2, i64 0
  %vecext2 = extractelement <4 x i32> %v3, i64 0
  %vecext4 = extractelement <4 x i32> %v4, i64 0
  %vecext7 = extractelement <4 x i32> %v1, i64 1
  %vecext8 = extractelement <4 x i32> %v2, i64 1
  %vecext10 = extractelement <4 x i32> %v3, i64 1
  %vecext12 = extractelement <4 x i32> %v4, i64 1
  %vecext15 = extractelement <4 x i32> %v1, i64 2
  %vecext16 = extractelement <4 x i32> %v2, i64 2
  %vecext18 = extractelement <4 x i32> %v3, i64 2
  %vecext20 = extractelement <4 x i32> %v4, i64 2
  %vecext23 = extractelement <4 x i32> %v1, i64 3
  %vecext24 = extractelement <4 x i32> %v2, i64 3
  %vecext26 = extractelement <4 x i32> %v3, i64 3
  %vecext28 = extractelement <4 x i32> %v4, i64 3
  %and = and i32 %vecext23, %acc
  %and3 = and i32 %and, %vecext15
  %and5 = and i32 %and3, %vecext7
  %and6 = and i32 %and5, %vecext
  %and9 = and i32 %and6, %vecext24
  %and11 = and i32 %and9, %vecext16
  %and13 = and i32 %and11, %vecext8
  %and14 = and i32 %and13, %vecext1
  %and17 = and i32 %and14, %vecext26
  %and19 = and i32 %and17, %vecext18
  %and21 = and i32 %and19, %vecext10
  %and22 = and i32 %and21, %vecext2
  %and25 = and i32 %and22, %vecext28
  %and27 = and i32 %and25, %vecext20
  %and29 = and i32 %and27, %vecext12
  %and30 = and i32 %and29, %vecext4