1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
6 declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
7 declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
; pmadd.wd with an all-zero second operand: the expected output is only a register-zeroing xor.
9 define <4 x i32> @combine_pmaddwd_zero(<8 x i16> %a0, <8 x i16> %a1) {
10 ; SSE-LABEL: combine_pmaddwd_zero:
12 ; SSE-NEXT: xorps %xmm0, %xmm0
15 ; AVX-LABEL: combine_pmaddwd_zero:
17 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
19 %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> zeroinitializer)
; Same as the zero-operand case, but with the all-zero vector as the first operand of pmadd.wd.
23 define <4 x i32> @combine_pmaddwd_zero_commute(<8 x i16> %a0, <8 x i16> %a1) {
24 ; SSE-LABEL: combine_pmaddwd_zero_commute:
26 ; SSE-NEXT: xorps %xmm0, %xmm0
29 ; AVX-LABEL: combine_pmaddwd_zero_commute:
31 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
33 %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> zeroinitializer, <8 x i16> %a0)
; Two 128-bit pmadd.wd results are concatenated into an <8 x i32> by the final shufflevector.
; The AVX2 run expects the inputs to be widened with vinserti128 and a single 256-bit vpmaddwd;
; the SSE4.1 and AVX1 runs expect two separate 128-bit multiplies.
37 define <8 x i32> @combine_pmaddwd_concat(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
38 ; SSE-LABEL: combine_pmaddwd_concat:
40 ; SSE-NEXT: pmaddwd %xmm1, %xmm0
41 ; SSE-NEXT: pmaddwd %xmm3, %xmm2
42 ; SSE-NEXT: movdqa %xmm2, %xmm1
45 ; AVX1-LABEL: combine_pmaddwd_concat:
47 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
48 ; AVX1-NEXT: vpmaddwd %xmm3, %xmm2, %xmm1
49 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
52 ; AVX2-LABEL: combine_pmaddwd_concat:
54 ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
55 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
56 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
57 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
58 ; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
60 %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
61 %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a2, <8 x i16> %a3)
62 %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Concatenation of two pmadd.wd results (each multiplied by a splat of 1) with a freeze on each
; half before the concatenating shufflevector. The AVX2 run still expects a single 256-bit
; vpmaddwd against a constant-pool operand, i.e. the freezes do not block the merge.
66 define <8 x i32> @combine_pmaddwd_concat_freeze(<8 x i16> %a0, <8 x i16> %a1) {
67 ; SSE-LABEL: combine_pmaddwd_concat_freeze:
69 ; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
70 ; SSE-NEXT: pmaddwd %xmm2, %xmm0
71 ; SSE-NEXT: pmaddwd %xmm2, %xmm1
74 ; AVX1-LABEL: combine_pmaddwd_concat_freeze:
76 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
77 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
78 ; AVX1-NEXT: vpmaddwd %xmm2, %xmm1, %xmm1
79 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
82 ; AVX2-LABEL: combine_pmaddwd_concat_freeze:
84 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
85 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
86 ; AVX2-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
88 %lo = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
89 %hi = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
90 %flo = freeze <4 x i32> %lo
91 %fhi = freeze <4 x i32> %hi
92 %res = shufflevector <4 x i32> %flo, <4 x i32> %fhi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; Demanded-elements test: the final shuffle splats result element 0 only, and the two input
; shuffles leave i16 lanes 0-3 unchanged (they only rewrite lanes 4-7). The expected output
; therefore applies pmaddwd directly to the incoming registers, with no input shuffles.
96 define <4 x i32> @combine_pmaddwd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
97 ; SSE-LABEL: combine_pmaddwd_demandedelts:
99 ; SSE-NEXT: pmaddwd %xmm1, %xmm0
100 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
103 ; AVX1-LABEL: combine_pmaddwd_demandedelts:
105 ; AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
106 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
109 ; AVX2-LABEL: combine_pmaddwd_demandedelts:
111 ; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
112 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
114 %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
115 %2 = shufflevector <8 x i16> %a1, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 7, i32 7>
116 %3 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %1, <8 x i16> %2)
117 %4 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> zeroinitializer
121 ; [2]: (-5*13)+(6*-15) = -155 = 4294967141
; Both pmadd.wd operands are constant vectors; the expected output is a single load of the
; precomputed result [19,17,4294967141,271] with no multiply instruction.
122 define <4 x i32> @combine_pmaddwd_constant() {
123 ; SSE-LABEL: combine_pmaddwd_constant:
125 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [19,17,4294967141,271]
128 ; AVX-LABEL: combine_pmaddwd_constant:
130 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [19,17,4294967141,271]
132 %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -1, i16 2, i16 3, i16 -4, i16 -5, i16 6, i16 7, i16 -8>, <8 x i16> <i16 -5, i16 7, i16 -9, i16 -11, i16 13, i16 -15, i16 17, i16 -19>)
136 ; ensure we don't assume pmaddwd performs add nsw
137 ; [0]: (-32768*-32768)+(-32768*-32768) = 0x80000000 = 2147483648
; A splat of i16 32768 (bit pattern 0x8000) is multiplied with itself; the expected output is a
; constant splat of 2147483648 in every i32 lane, i.e. the pairwise sum wraps rather than being
; treated as non-overflowing.
138 define <4 x i32> @combine_pmaddwd_constant_nsw() {
139 ; SSE-LABEL: combine_pmaddwd_constant_nsw:
141 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
144 ; AVX-LABEL: combine_pmaddwd_constant_nsw:
146 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
148 %1 = insertelement <8 x i16> undef, i16 32768, i32 0
149 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
150 %3 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %2, <8 x i16> %2)
; pmadd.ub.sw with an all-zero second operand: the expected output is only a register-zeroing xor.
154 define <8 x i16> @combine_pmaddubsw_zero(<16 x i8> %a0, <16 x i8> %a1) {
155 ; SSE-LABEL: combine_pmaddubsw_zero:
157 ; SSE-NEXT: xorps %xmm0, %xmm0
160 ; AVX-LABEL: combine_pmaddubsw_zero:
162 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
164 %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> zeroinitializer)
; Same as the zero-operand case, but with the all-zero vector as the first operand of pmadd.ub.sw.
168 define <8 x i16> @combine_pmaddubsw_zero_commute(<16 x i8> %a0, <16 x i8> %a1) {
169 ; SSE-LABEL: combine_pmaddubsw_zero_commute:
171 ; SSE-NEXT: xorps %xmm0, %xmm0
174 ; AVX-LABEL: combine_pmaddubsw_zero_commute:
176 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
178 %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> zeroinitializer, <16 x i8> %a0)
; Two 128-bit pmadd.ub.sw results are concatenated into a <16 x i16> by the final shufflevector.
; The AVX2 run expects the inputs to be widened with vinserti128 and a single 256-bit vpmaddubsw;
; the SSE4.1 and AVX1 runs expect two separate 128-bit multiplies.
182 define <16 x i16> @combine_pmaddubsw_concat(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> %a3) {
183 ; SSE-LABEL: combine_pmaddubsw_concat:
185 ; SSE-NEXT: pmaddubsw %xmm1, %xmm0
186 ; SSE-NEXT: pmaddubsw %xmm3, %xmm2
187 ; SSE-NEXT: movdqa %xmm2, %xmm1
190 ; AVX1-LABEL: combine_pmaddubsw_concat:
192 ; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
193 ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm1
194 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
197 ; AVX2-LABEL: combine_pmaddubsw_concat:
199 ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
200 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
201 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
202 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
203 ; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
205 %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
206 %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a2, <16 x i8> %a3)
207 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Concatenation of two pmadd.ub.sw results (each multiplied by a splat of 1) with a freeze on
; each half before the concatenating shufflevector. The AVX2 run still expects a single 256-bit
; vpmaddubsw against a constant-pool operand, i.e. the freezes do not block the merge.
211 define <16 x i16> @combine_pmaddubsw_concat_freeze(<16 x i8> %a0, <16 x i8> %a1) {
212 ; SSE-LABEL: combine_pmaddubsw_concat_freeze:
214 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
215 ; SSE-NEXT: pmaddubsw %xmm2, %xmm0
216 ; SSE-NEXT: pmaddubsw %xmm2, %xmm1
219 ; AVX1-LABEL: combine_pmaddubsw_concat_freeze:
221 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
222 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
223 ; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
224 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
227 ; AVX2-LABEL: combine_pmaddubsw_concat_freeze:
229 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
230 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
231 ; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
233 %lo = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
234 %hi = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
235 %flo = freeze <8 x i16> %lo
236 %fhi = freeze <8 x i16> %hi
237 %res = shufflevector <8 x i16> %flo, <8 x i16> %fhi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; Demanded-elements test: the final shuffle repeats result elements 0 and 1 only, and the two
; input shuffles leave i8 lanes 0-7 unchanged (they only rewrite lanes 8-15). The expected
; output therefore applies pmaddubsw directly to the incoming registers, with no input shuffles.
241 define <8 x i16> @combine_pmaddubsw_demandedelts(<16 x i8> %a0, <16 x i8> %a1) {
242 ; SSE-LABEL: combine_pmaddubsw_demandedelts:
244 ; SSE-NEXT: pmaddubsw %xmm1, %xmm0
245 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
248 ; AVX1-LABEL: combine_pmaddubsw_demandedelts:
250 ; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
251 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
254 ; AVX2-LABEL: combine_pmaddubsw_demandedelts:
256 ; AVX2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
257 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
259 %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
260 %2 = shufflevector <16 x i8> %a1, <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
261 %3 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %1, <16 x i8> %2)
262 %4 = shufflevector <8 x i16> %3, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
266 ; [3]: ((uint8_t)-6*7)+(7*-8) = (250*7)+(7*-8) = 1694
; Both pmadd.ub.sw operands are constant vectors; result element 3 is extracted and sign-extended
; to i32. The expected output is a single immediate move of 1694 with no multiply instruction.
267 define i32 @combine_pmaddubsw_constant() {
268 ; CHECK-LABEL: combine_pmaddubsw_constant:
270 ; CHECK-NEXT: movl $1694, %eax # imm = 0x69E
272 %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
273 %2 = extractelement <8 x i16> %1, i32 3
274 %3 = sext i16 %2 to i32
278 ; [0]: add_sat_i16(((uint8_t)-1*-128),((uint8_t)-1*-128)) = add_sat_i16((255*-128),(255*-128)) = sat_i16(-65280) = -32768
; Constant-folding case where result element 0 exceeds the signed i16 range: the first two byte
; pairs are (-1, -128) in both positions. The expected output is an immediate move of -32768,
; i.e. the folded value is saturated rather than wrapped.
279 define i32 @combine_pmaddubsw_constant_sat() {
280 ; CHECK-LABEL: combine_pmaddubsw_constant_sat:
282 ; CHECK-NEXT: movl $-32768, %eax # imm = 0x8000
284 %1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> <i8 -1, i8 -1, i8 2, i8 3, i8 4, i8 5, i8 -6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8> <i8 -128, i8 -128, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>)
285 %2 = extractelement <8 x i16> %1, i32 0
286 %3 = sext i16 %2 to i32
290 ; Constant folding PMADDWD was causing an infinite loop in the PCMPGT commuting between 2 constant values.
; Regression input built entirely from constants: a pmadd.wd of two all -32768 vectors feeds a
; chain of compare/select/add/trunc/compare operations ending in an i4 bitcast compared with 0.
; The expected output is a single "movb $1, %al", so the whole chain must fold to a constant.
291 define i1 @pmaddwd_pcmpgt_infinite_loop() {
292 ; CHECK-LABEL: pmaddwd_pcmpgt_infinite_loop:
294 ; CHECK-NEXT: movb $1, %al
296 %1 = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>, <8 x i16> <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>)
297 %2 = icmp eq <4 x i32> %1, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
298 %3 = select <4 x i1> %2, <4 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, <4 x i32> zeroinitializer
299 %4 = add <4 x i32> %3, <i32 -8, i32 -9, i32 -10, i32 -11>
300 %.not = trunc <4 x i32> %3 to <4 x i1>
301 %5 = icmp sgt <4 x i32> %4, <i32 2147483640, i32 2147483639, i32 2147483638, i32 2147483637>
302 %6 = select <4 x i1> %.not, <4 x i1> %5, <4 x i1> zeroinitializer
303 %7 = bitcast <4 x i1> %6 to i4
304 %8 = icmp eq i4 %7, 0