; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=+false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=ENABLE
; RUN: llc -verify-machineinstrs -mcpu=sapphirerapids -mattr=-false-deps-mullq -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefixes=DISABLE
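; With +false-deps-mullq (ENABLE) the compiler is expected to emit a zeroing
; xor (vxorps/vpxor) of the destination register ahead of vpmullq to break the
; instruction's false dependency on that register's previous value; with
; -false-deps-mullq (DISABLE) no dependency-breaking xor should be emitted.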
define <2 x i64> @pmullq_128(<2 x i64> %a0, <2 x i64> %a1) {
; ENABLE-LABEL: pmullq_128:
; ENABLE: # %bb.0:
; ENABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq %xmm2, %xmm0, %xmm1
; ENABLE-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_128:
; DISABLE: # %bb.0:
; DISABLE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; DISABLE-NEXT: vpmullq %xmm2, %xmm0, %xmm1
; DISABLE-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %3 = add <2 x i64> %a0, %a1
  %res = add <2 x i64> %2, %3
  ret <2 x i64> %res
}

define <2 x i64> @pmullq_mem_128(<2 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_128:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq (%rdi), %xmm0, %xmm1
; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_mem_128:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpmullq (%rdi), %xmm0, %xmm1
; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <2 x i64>, ptr %p1, align 64
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %res = add <2 x i64> %2, %a0
  ret <2 x i64> %res
}

define <2 x i64> @pmullq_broadcast_128(<2 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_128:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1
; ENABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_broadcast_128:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1
; DISABLE-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <2 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <2 x i64> %t0, <2 x i64> undef, <2 x i32> zeroinitializer
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
  %res = add <2 x i64> %2, %a0
  ret <2 x i64> %res
}

define <2 x i64> @pmullq_maskz_128(<2 x i64> %a0, <2 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_128:
; ENABLE: # %bb.0:
; ENABLE-NEXT: vpmullq %xmm1, %xmm0, %xmm2
; ENABLE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: kmovb (%rdi), %k1
; ENABLE-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_maskz_128:
; DISABLE: # %bb.0:
; DISABLE-NEXT: vpmullq %xmm1, %xmm0, %xmm2
; DISABLE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: kmovb (%rdi), %k1
; DISABLE-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} # 16-byte Folded Reload
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> zeroinitializer, i8 %mask)
  %3 = add <2 x i64> %a0, %a1
  %res = add <2 x i64> %2, %3
  ret <2 x i64> %res
}

declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask)

define <4 x i64> @pmullq_256(<4 x i64> %a0, <4 x i64> %a1) {
; ENABLE-LABEL: pmullq_256:
; ENABLE: # %bb.0:
; ENABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq %ymm2, %ymm0, %ymm1
; ENABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_256:
; DISABLE: # %bb.0:
; DISABLE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; DISABLE-NEXT: vpmullq %ymm2, %ymm0, %ymm1
; DISABLE-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %3 = add <4 x i64> %a0, %a1
  %res = add <4 x i64> %2, %3
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_mem_256(<4 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_256:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq (%rdi), %ymm0, %ymm1
; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_mem_256:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpmullq (%rdi), %ymm0, %ymm1
; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <4 x i64>, ptr %p1, align 64
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_broadcast_256(<4 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_256:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1
; ENABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_broadcast_256:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1
; DISABLE-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <4 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <4 x i64> %t0, <4 x i64> undef, <4 x i32> zeroinitializer
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
  %res = add <4 x i64> %2, %a0
  ret <4 x i64> %res
}

define <4 x i64> @pmullq_maskz_256(<4 x i64> %a0, <4 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_256:
; ENABLE: # %bb.0:
; ENABLE-NEXT: vpmullq %ymm1, %ymm0, %ymm2
; ENABLE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: kmovb (%rdi), %k1
; ENABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_maskz_256:
; DISABLE: # %bb.0:
; DISABLE-NEXT: vpmullq %ymm1, %ymm0, %ymm2
; DISABLE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: kmovb (%rdi), %k1
; DISABLE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} # 32-byte Folded Reload
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> zeroinitializer, i8 %mask)
  %3 = add <4 x i64> %a0, %a1
  %res = add <4 x i64> %2, %3
  ret <4 x i64> %res
}

declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask)

define <8 x i64> @pmullq_512(<8 x i64> %a0, <8 x i64> %a1) {
; ENABLE-LABEL: pmullq_512:
; ENABLE: # %bb.0:
; ENABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq %zmm2, %zmm0, %zmm1
; ENABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_512:
; DISABLE: # %bb.0:
; DISABLE-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; DISABLE-NEXT: vpmullq %zmm2, %zmm0, %zmm1
; DISABLE-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %3 = add <8 x i64> %a0, %a1
  %res = add <8 x i64> %2, %3
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_mem_512(<8 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_mem_512:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq (%rdi), %zmm0, %zmm1
; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_mem_512:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpmullq (%rdi), %zmm0, %zmm1
; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a1 = load <8 x i64>, ptr %p1, align 64
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %res = add <8 x i64> %2, %a0
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_broadcast_512(<8 x i64> %a0, ptr %p1) {
; ENABLE-LABEL: pmullq_broadcast_512:
; ENABLE: # %bb.0:
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; ENABLE-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1
; ENABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_broadcast_512:
; DISABLE: # %bb.0:
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1
; DISABLE-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %v1 = load i64, ptr %p1, align 4
  %t0 = insertelement <8 x i64> undef, i64 %v1, i64 0
  %a1 = shufflevector <8 x i64> %t0, <8 x i64> undef, <8 x i32> zeroinitializer
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> undef, i8 -1)
  %res = add <8 x i64> %2, %a0
  ret <8 x i64> %res
}

define <8 x i64> @pmullq_maskz_512(<8 x i64> %a0, <8 x i64> %a1, ptr %pmask) {
; ENABLE-LABEL: pmullq_maskz_512:
; ENABLE: # %bb.0:
; ENABLE-NEXT: vpmullq %zmm1, %zmm0, %zmm2
; ENABLE-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; ENABLE-NEXT: #APP
; ENABLE-NEXT: nop
; ENABLE-NEXT: #NO_APP
; ENABLE-NEXT: kmovb (%rdi), %k1
; ENABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; ENABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; ENABLE-NEXT: retq
;
; DISABLE-LABEL: pmullq_maskz_512:
; DISABLE: # %bb.0:
; DISABLE-NEXT: vpmullq %zmm1, %zmm0, %zmm2
; DISABLE-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; DISABLE-NEXT: #APP
; DISABLE-NEXT: nop
; DISABLE-NEXT: #NO_APP
; DISABLE-NEXT: kmovb (%rdi), %k1
; DISABLE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; DISABLE-NEXT: vpaddq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} # 64-byte Folded Reload
; DISABLE-NEXT: retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %mask = load i8, ptr %pmask
  %2 = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
  %3 = add <8 x i64> %a0, %a1
  %res = add <8 x i64> %2, %3
  ret <8 x i64> %res
}

declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)