; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
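;
; As a minimal sketch (hypothetical, for illustration only; the wrapper name
; is invented), the clang-level source these tests mirror looks like:
;
;   __m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U,
;                                 __m128d __B, __m128d __C) {
;     return _mm_mask_fmadd_pd(__A, __U, __B, __C);
;   }
;
; clang lowers each such intrinsic call to a plain llvm.fma.* call plus a
; mask select, which is the pattern each function below hand-codes in IR.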
define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}
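
; The mask idiom used throughout: the scalar i8 mask is bitcast to <8 x i1>,
; the low lanes are extracted with a shufflevector, and a select blends the
; FMA result with the passthru value (the first operand for _mask, the third
; operand for _mask3, zero for _maskz).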
define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9a,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1]
; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbc,0xd1]
; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa8,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaa,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xac,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xae,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
  ret <2 x double> %2
}
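
; 256-bit double-precision variants of the same mask/mask3/maskz patterns.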
define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9a,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1]
; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbc,0xd1]
; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa8,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xaa,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xac,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xae,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
  ret <4 x double> %2
}
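
; 128-bit single-precision variants.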
define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1]
; X86-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9a,0xc1]
; X64-NEXT:    # xmm0 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
; X86-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
; X64-NEXT:    # xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1]
; X86-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbc,0xd1]
; X64-NEXT:    # xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT:    vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa8,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaa,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xac,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2]
; X86-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xae,0xc2]
; X64-NEXT:    # xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
  ret <4 x float> %2
}
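
; 256-bit single-precision variants. With eight float lanes the <8 x i1>
; mask is used directly, so no shufflevector extract precedes the select.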
define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1]
; X86-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9a,0xc1]
; X64-NEXT:    # ymm0 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
; X86-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
; X64-NEXT:    # ymm2 {%k1} = (ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1]
; X86-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbc,0xd1]
; X64-NEXT:    # ymm2 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT:    vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa8,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xaa,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xac,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2]
; X86-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xae,0xc2]
; X64-NEXT:    # ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
  ret <8 x float> %2
}
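
; fmaddsub/fmsubadd tests. There is no dedicated IR intrinsic here: each test
; computes both fma(a, b, c) and fma(a, b, -c) and interleaves even/odd lanes
; with a shufflevector, which the backend is expected to fold into a single
; vfmaddsub/vfmsubadd instruction.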
775 define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
776 ; X86-LABEL: test_mm_mask_fmaddsub_pd:
777 ; X86: # %bb.0: # %entry
778 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
779 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
780 ; X86-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
781 ; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
782 ; X86-NEXT: retl # encoding: [0xc3]
784 ; X64-LABEL: test_mm_mask_fmaddsub_pd:
785 ; X64: # %bb.0: # %entry
786 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
787 ; X64-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
788 ; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
789 ; X64-NEXT: retq # encoding: [0xc3]
791 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
792 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
793 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
794 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
795 %4 = bitcast i8 %__U to <8 x i1>
796 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
797 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A
801 define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
802 ; X86-LABEL: test_mm_mask_fmsubadd_pd:
803 ; X86: # %bb.0: # %entry
804 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
805 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
806 ; X86-NEXT: vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1]
807 ; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
808 ; X86-NEXT: retl # encoding: [0xc3]
810 ; X64-LABEL: test_mm_mask_fmsubadd_pd:
811 ; X64: # %bb.0: # %entry
812 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
813 ; X64-NEXT: vfmsubadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x97,0xc1]
814 ; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
815 ; X64-NEXT: retq # encoding: [0xc3]
817 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
818 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
819 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
820 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
821 %3 = bitcast i8 %__U to <8 x i1>
822 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
823 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A
827 define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
828 ; X86-LABEL: test_mm_mask3_fmaddsub_pd:
829 ; X86: # %bb.0: # %entry
830 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
831 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
832 ; X86-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
833 ; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
834 ; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
835 ; X86-NEXT: retl # encoding: [0xc3]
837 ; X64-LABEL: test_mm_mask3_fmaddsub_pd:
838 ; X64: # %bb.0: # %entry
839 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
840 ; X64-NEXT: vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
841 ; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
842 ; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
843 ; X64-NEXT: retq # encoding: [0xc3]
845 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
846 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
847 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
848 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
849 %4 = bitcast i8 %__U to <8 x i1>
850 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
851 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C
855 define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
856 ; X86-LABEL: test_mm_maskz_fmaddsub_pd:
857 ; X86: # %bb.0: # %entry
858 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
859 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
860 ; X86-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2]
861 ; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
862 ; X86-NEXT: retl # encoding: [0xc3]
864 ; X64-LABEL: test_mm_maskz_fmaddsub_pd:
865 ; X64: # %bb.0: # %entry
866 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
867 ; X64-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa6,0xc2]
868 ; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
869 ; X64-NEXT: retq # encoding: [0xc3]
871 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
872 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
873 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9
874 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3>
875 %4 = bitcast i8 %__U to <8 x i1>
876 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
877 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer
881 define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
882 ; X86-LABEL: test_mm_maskz_fmsubadd_pd:
883 ; X86: # %bb.0: # %entry
884 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
885 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
886 ; X86-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2]
887 ; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
888 ; X86-NEXT: retl # encoding: [0xc3]
890 ; X64-LABEL: test_mm_maskz_fmsubadd_pd:
891 ; X64: # %bb.0: # %entry
892 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
893 ; X64-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa7,0xc2]
894 ; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
895 ; X64-NEXT: retq # encoding: [0xc3]
897 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
898 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
899 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
900 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
901 %3 = bitcast i8 %__U to <8 x i1>
902 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
903 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> zeroinitializer
907 define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
908 ; X86-LABEL: test_mm256_mask_fmaddsub_pd:
909 ; X86: # %bb.0: # %entry
910 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
911 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
912 ; X86-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
913 ; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
914 ; X86-NEXT: retl # encoding: [0xc3]
916 ; X64-LABEL: test_mm256_mask_fmaddsub_pd:
917 ; X64: # %bb.0: # %entry
918 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
919 ; X64-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
920 ; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
921 ; X64-NEXT: retq # encoding: [0xc3]
923 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
924 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
925 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
926 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
927 %4 = bitcast i8 %__U to <8 x i1>
928 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
929 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A
933 define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
934 ; X86-LABEL: test_mm256_mask_fmsubadd_pd:
935 ; X86: # %bb.0: # %entry
936 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
937 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
938 ; X86-NEXT: vfmsubadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1]
939 ; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
940 ; X86-NEXT: retl # encoding: [0xc3]
942 ; X64-LABEL: test_mm256_mask_fmsubadd_pd:
943 ; X64: # %bb.0: # %entry
944 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
945 ; X64-NEXT: vfmsubadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x97,0xc1]
946 ; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
947 ; X64-NEXT: retq # encoding: [0xc3]
949 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
950 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
951 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
952 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
953 %3 = bitcast i8 %__U to <8 x i1>
954 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
955 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A
959 define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
960 ; X86-LABEL: test_mm256_mask3_fmaddsub_pd:
961 ; X86: # %bb.0: # %entry
962 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
963 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
964 ; X86-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
965 ; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
966 ; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
967 ; X86-NEXT: retl # encoding: [0xc3]
969 ; X64-LABEL: test_mm256_mask3_fmaddsub_pd:
970 ; X64: # %bb.0: # %entry
971 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
972 ; X64-NEXT: vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
973 ; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
974 ; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
975 ; X64-NEXT: retq # encoding: [0xc3]
977 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
978 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
979 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
980 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
981 %4 = bitcast i8 %__U to <8 x i1>
982 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
983 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C
define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2]
; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa6,0xc2]
; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9
  %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer
  ret <4 x double> %5
}

define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2]
; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0xa7,0xc2]
; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer
  ret <4 x double> %4
}
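
; Note: fmsubadd is the mirror pattern: even elements take (a*b)+c and odd
; elements take (a*b)-c, so the shufflevector operands are swapped relative
; to fmaddsub.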
define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A
  ret <4 x float> %5
}

define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
; X86-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x97,0xc1]
; X64-NEXT: # xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A
  ret <4 x float> %4
}

define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa6,0xc2]
; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9
  %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %4 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer
  ret <4 x float> %5
}

define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
; X86-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa7,0xc2]
; X64-NEXT: # xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A
  ret <8 x float> %5
}

define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
; X86-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x97,0xc1]
; X64-NEXT: # ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A
  ret <8 x float> %4
}

define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa6,0xc2]
; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9
  %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer
  ret <8 x float> %5
}

define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
; X86-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0xa7,0xc2]
; X64-NEXT: # ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer
  ret <8 x float> %4
}
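
; The fmsub tests that follow negate only the addend: %sub.i = -%__C feeds the
; third llvm.fma operand, giving (a*b)-c in every element.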
define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) - ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9
  %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9
  %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C
  ret <2 x double> %4
}

define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9
  %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9
  %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C
  ret <4 x double> %4
}

define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
; X86-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
; X64-NEXT: # xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9
  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9
  %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %3 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C
  ret <4 x float> %4
}

define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
; X86-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
; X64-NEXT: # ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9
  %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9
  %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C
  ret <8 x float> %4
}
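
; The fnmadd tests negate one multiplicand instead: %sub.i = -%__B feeds the
; second llvm.fma operand, giving -(a*b)+c in every element.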
define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) + xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) + ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}
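
; The fnmsub tests negate both a multiplicand and the addend, giving
; -(a*b)-c in every element.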
define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A
  ret <2 x double> %2
}

define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
; X86-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
; X64-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C
  ret <2 x double> %2
}

define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A
  ret <4 x double> %2
}

define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
; X86-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
; X64-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
  %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C
  ret <4 x double> %2
}

define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; X86-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; X64-NEXT: # xmm0 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A
  ret <4 x float> %2
}

define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
; X86-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
; X64-NEXT: # xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C
  ret <4 x float> %2
}

define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) {
; X86-LABEL: test_mm256_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; X86-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; X64-NEXT: # ymm0 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A
  ret <8 x float> %2
}

define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm256_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
; X86-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X64-NEXT: vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
; X64-NEXT: # ymm2 {%k1} = -(ymm0 * ymm1) - ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
entry:
  %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
  %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C
  ret <8 x float> %2
}

declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8