; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
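;
; Checks for InstCombine handling of the x86 PMULUDQ/PMULDQ intrinsics: undef and
; zero operands, full constant folding, and demanded-elements simplification.
;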
define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef)
  ret <8 x i64> %1
}

define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef)
  ret <8 x i64> %1
}
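;
; A zero operand makes every 64-bit product zero, so these calls fold to
; zeroinitializer even though the other operand is undef.
;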
define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
  ret <2 x i64> %1
}

define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
  ret <8 x i64> %1
}

define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
  ret <4 x i64> %1
}

define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
  ret <8 x i64> %1
}
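;
; Constant folding - only the even elements of the constant operands contribute.
;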
define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_128(
; CHECK-NEXT:    ret <2 x i64> <i64 9223372030412324865, i64 4294967295>
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>)
  ret <2 x i64> %1
}
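; Lane 0 above is zext(0xFFFFFFFF) * zext(0x7FFFFFFF) = 9223372030412324865 and
; lane 1 is zext(0xFFFFFFFF) * zext(1) = 4294967295 (input elements 0 and 2 only).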
define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
  ret <4 x i64> %1
}

define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_512(
; CHECK-NEXT:    ret <8 x i64> <i64 0, i64 0, i64 255, i64 131070, i64 0, i64 -281474976645121, i64 140737488289792, i64 281470681743360>
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
  ret <8 x i64> %1
}
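; Spot checks for the fold above: result lane 2 = zext(1) * zext(255) = 255 and
; lane 3 = zext(2) * zext(65535) = 131070; lanes 0, 1 and 4 read a zero or undef
; input element and fold to 0.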
define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_128(
; CHECK-NEXT:    ret <2 x i64> <i64 0, i64 2>
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>)
  ret <2 x i64> %1
}
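; Lane 1 is sext(-1) * sext(-2) = 2; lane 0 reads the undef input elements and folds to 0.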
define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_256(
; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 4294836225, i64 140737488289792, i64 -140737488355328>
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
  ret <4 x i64> %1
}
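; E.g. lane 1 is (-65535) * (-65535) = 4294836225 and lane 3 is
; (-2147483648) * 65536 = -140737488355328, with both inputs sign-extended to 64 bits.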
define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>)
  ret <8 x i64> %1
}

;
; PMULUDQ/PMULDQ - only the even elements (0, 2, 4, 6) of the vXi32 inputs are required.
;
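; The checks below show the expansion InstCombine produces: the (possibly simplified)
; input shuffles are bitcast to vXi64, the low 32 bits of each 64-bit lane are
; zero-extended with an 0xFFFFFFFF mask (PMULUDQ) or sign-extended with a shl/ashr-exact
; pair (PMULDQ), and the intrinsic becomes a plain 64-bit multiply.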
define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_128(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP3]], <i64 4294967295, i64 poison>
; CHECK-NEXT:    [[TMP6:%.*]] = and <2 x i64> [[TMP4]], <i64 4294967295, i64 poison>
; CHECK-NEXT:    [[TMP7:%.*]] = mul <2 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT:    ret <2 x i64> [[TMP8]]
;
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %3 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %1, <4 x i32> %2)
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %4
}
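; The and with 4294967295 (0xFFFFFFFF) keeps only the low 32 bits of each 64-bit lane.
; Because the trailing splat shuffle demands only result lane 0, the unused mask lane is
; poison and the input shuffles are narrowed to elements 0 and 1 respectively.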
define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_256(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
;
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %3 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %1, <8 x i32> %2)
  ret <4 x i64> %3
}

define <8 x i64> @test_demanded_elts_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_512(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <8 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    ret <8 x i64> [[TMP7]]
;
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %3 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %1, <16 x i32> %2)
  ret <8 x i64> %3
}

define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_128(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i64> [[TMP3]], <i64 32, i64 32>
; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <2 x i64> [[TMP5]], <i64 32, i64 32>
; CHECK-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP4]], <i64 32, i64 32>
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <2 x i64> [[TMP7]], <i64 32, i64 32>
; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <2 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %3 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %1, <4 x i32> %2)
  ret <2 x i64> %3
}
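; For the signed PMULDQ form the low 32 bits of each 64-bit lane are sign-extended by the
; shl/ashr-exact 32 pair before the multiply; the nsw flag is valid because the product of
; two 32-bit signed values cannot overflow 64 bits.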
define <4 x i64> @test_demanded_elts_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_256(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <4 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
; CHECK-NEXT:    ret <4 x i64> [[TMP10]]
;
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %3 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %1, <8 x i32> %2)
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
  ret <4 x i64> %4
}

define <8 x i64> @test_demanded_elts_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_512(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = shl <8 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <8 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP7:%.*]] = shl <8 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <8 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
; CHECK-NEXT:    ret <8 x i64> [[TMP10]]
;
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %3 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %1, <16 x i32> %2)
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
  ret <8 x i64> %4
}

declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone

declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) nounwind readnone
declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) nounwind readnone