; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl < %s | FileCheck %s
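
; Stack reload folding tests for the 512-bit AVX512IFMA vpmadd52 intrinsics.
;
; By including a nop inline asm call with side effects that clobbers almost all of the
; vector registers, we force a spill of one of the source operands and check that the
; 64-byte reload is folded into vpmadd52huq/vpmadd52luq as a memory operand.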

declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
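
; The operand spilled around the asm clobber should be reloaded via a folded 64-byte
; memory operand on vpmadd52huq rather than through a separate load.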
define <8 x i64> @stack_fold_vpmadd52huq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52huq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  ret <8 x i64> %2
}
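
; Swapping the two multiplicand operands must not prevent the fold: the 52x52-bit
; multiply is commutative in its second and third operands.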
define <8 x i64> @stack_fold_vpmadd52huq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52huq_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  ret <8 x i64> %2
}
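
; Merge-masked variant: the accumulator is loaded from memory, the mask arrives in %esi,
; and the loaded value is also the pass-through for lanes whose mask bit is clear.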
define <8 x i64> @stack_fold_vpmadd52huq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}
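
; Merge-masked variant with the multiplicand operands swapped.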
define <8 x i64> @stack_fold_vpmadd52huq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}
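
; Zero-masked variant: the mask is loaded from memory and lanes whose mask bit is clear
; are zeroed in the result.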
define <8 x i64> @stack_fold_vpmadd52huq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}
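
; Zero-masked variant with the multiplicand operands swapped.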
define <8 x i64> @stack_fold_vpmadd52huq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52huq_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52huq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}
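
; The same folding checks, repeated for the low-half instruction, vpmadd52luq.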
define <8 x i64> @stack_fold_vpmadd52luq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52luq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  ret <8 x i64> %2
}
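
; vpmadd52luq with the multiplicand operands swapped.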
define <8 x i64> @stack_fold_vpmadd52luq_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2) {
; CHECK-LABEL: stack_fold_vpmadd52luq_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  ret <8 x i64> %2
}
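
; Merge-masked vpmadd52luq.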
define <8 x i64> @stack_fold_vpmadd52luq_mask(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a1, <8 x i64> %a2)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}
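
; Merge-masked vpmadd52luq with the multiplicand operands swapped.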
define <8 x i64> @stack_fold_vpmadd52luq_mask_commuted(ptr %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_mask_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2
; CHECK-NEXT:    kmovw %esi, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = load <8 x i64>, ptr %a0
  %3 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %2, <8 x i64> %a2, <8 x i64> %a1)
  %4 = bitcast i8 %mask to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %2
  ret <8 x i64> %5
}
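
; Zero-masked vpmadd52luq.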
define <8 x i64> @stack_fold_vpmadd52luq_maskz(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}
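
; Zero-masked vpmadd52luq with the multiplicand operands swapped.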
define <8 x i64> @stack_fold_vpmadd52luq_maskz_commuted(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_vpmadd52luq_maskz_commuted:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovw %eax, %k1
; CHECK-NEXT:    vpmadd52luq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %a0, <8 x i64> %a2, <8 x i64> %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = select <8 x i1> %4, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %5
}