; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c

define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovdqa64 136(%ebp), %zmm3
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: kunpckbw %k0, %k1, %k1
; X86-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $-1, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestc:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $-1, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $0, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestz:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $0, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}

define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permute_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}
define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
%arg1 = bitcast i8 %a1 to <8 x i1>
%res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
%arg0 = bitcast i8 %a0 to <8 x i1>
%res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
%res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT: retq
%arg1 = bitcast i16 %a1 to <16 x i1>
%res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
%res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
%arg0 = bitcast i16 %a0 to <16 x i1>
%res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
%res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
ret <16 x float> %res1
}

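; The zext tests below widen a 128/256-bit vector into a zeroed 512-bit
; register; a same-register vmovaps suffices because VEX/EVEX moves
; implicitly clear the upper bits of the destination zmm.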
define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd128_pd512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
ret <8 x double> %res
}

define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd256_pd512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %res
}

define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps128_ps512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
ret <16 x float> %res
}

define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps256_ps512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %res
}

define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi128_si512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
ret <8 x i64> %res
}

define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi256_si512:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %res
}

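; _mm512_mul_epi32 sign-extends the even 32-bit elements of each operand;
; the shl-by-32/ashr-by-32 pairs below should fold into a single vpmuldq.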
define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
ret <8 x i64> %tmp4
}

define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%4 = mul nsw <8 x i64> %3, %1
%5 = bitcast i8 %__k to <8 x i1>
%6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
%4 = mul nsw <8 x i64> %3, %1
%5 = bitcast i8 %__k to <8 x i1>
%6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epu32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%tmp2 = mul nuw <8 x i64> %tmp1, %tmp
ret <8 x i64> %tmp2
}

define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%2 = mul nuw <8 x i64> %1, %0
%3 = bitcast i8 %__k to <8 x i1>
%4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT: vmovdqa64 %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_mul_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%2 = mul nuw <8 x i64> %1, %0
%3 = bitcast i8 %__k to <8 x i1>
%4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
ret <8 x i64> %4
}

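; Without AVX512BW a 512-bit byte broadcast is assembled from a ymm
; vpbroadcastb plus vinserti64x4.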
define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
; X86-LABEL: test_mm512_set1_epi8:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd %eax, %xmm0
; X86-NEXT: vpbroadcastb %xmm0, %ymm0
; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_set1_epi8:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpbroadcastb %xmm0, %ymm0
; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-NEXT: retq
entry:
%vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
%vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
%0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
ret <8 x double> %0
}

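; Unsigned scalar conversions: i386 has no 64-bit GPRs, so the u64 cases
; are expanded manually, while x86-64 uses vcvtusi2sd/vcvtusi2ss directly.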
define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu32_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
%conv.i = uitofp i32 %__B to double
%vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu64_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
%conv.i = uitofp i64 %__B to double
%vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu32_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
%conv.i = uitofp i32 %__B to float
%vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 12(%ebp), %eax
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; X86-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setns %cl
; X86-NEXT: fildll {{[0-9]+}}(%esp)
; X86-NEXT: fadds {{\.LCPI.*}}(,%ecx,4)
; X86-NEXT: fstps {{[0-9]+}}(%esp)
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: retl
;
; X64-LABEL: test_mm_cvtu64_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0
; X64-NEXT: retq
entry:
%conv.i = uitofp i64 %__B to float
%vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
ret <4 x float> %vecins.i
}

define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtps_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = fpext <8 x float> %__A to <8 x double>
ret <8 x double> %conv.i
}

define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtpslo_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
ret <8 x double> %conv.i.i
}

define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%conv.i.i = fpext <8 x float> %__A to <8 x double>
%0 = bitcast i8 %__U to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
ret <8 x double> %1
}

define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtpslo_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtpslo_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
%0 = bitcast i8 %__U to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_maskz_cvtps_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtps_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%conv.i.i = fpext <8 x float> %__A to <8 x double>
%0 = bitcast i8 %__U to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
ret <8 x double> %1
}

define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovdb %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%conv.i = trunc <16 x i32> %0 to <16 x i8>
%1 = bitcast <16 x i8> %conv.i to <2 x i64>
ret <2 x i64> %1
}

define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <2 x i64> %__O to <16 x i8>
%2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
%3 = bitcast <16 x i8> %2 to <2 x i64>
ret <2 x i64> %3
}

define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
%2 = bitcast <16 x i8> %1 to <2 x i64>
ret <2 x i64> %2
}

define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqd %zmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <8 x i64> %__A to <8 x i32>
%0 = bitcast <8 x i32> %conv.i to <4 x i64>
ret <4 x i64> %0
}

define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
%conv.i.i = trunc <8 x i64> %__A to <8 x i32>
%0 = bitcast <4 x i64> %__O to <8 x i32>
%1 = bitcast i8 %__M to <8 x i1>
%2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}

define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%conv.i.i = trunc <8 x i64> %__A to <8 x i32>
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
%2 = bitcast <8 x i32> %1 to <4 x i64>
ret <4 x i64> %2
}

define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpmovqw %zmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <8 x i64> %__A to <8 x i16>
%0 = bitcast <8 x i16> %conv.i to <2 x i64>
ret <2 x i64> %0
}

define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__O to <8 x i16>
%1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
%2 = bitcast <8 x i16> %1 to <2 x i64>
ret <2 x i64> %2
}

define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
%1 = bitcast <8 x i16> %0 to <2 x i64>
ret <2 x i64> %1
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
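
; The ternary-logic tests below use immediate 4, a single minterm of the
; three inputs, so each should fold to one vpternlogd/vpternlogq.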
define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = bitcast <8 x i64> %__C to <16 x i32>
%3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}

declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = bitcast <8 x i64> %__C to <16 x i32>
%3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = bitcast <8 x i64> %__C to <16 x i32>
%3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__I to <16 x i32>
%2 = bitcast <8 x i64> %__B to <16 x i32>
%3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovapd %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovapd %zmm1, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
%1 = bitcast <8 x i64> %__I to <8 x double>
%2 = bitcast i8 %__U to <8 x i1>
%3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
ret <8 x double> %3
}

declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovaps %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovaps %zmm1, %zmm0
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__I to <16 x i32>
%1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
%2 = bitcast <8 x i64> %__I to <16 x float>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
ret <16 x float> %4
}

declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__I to <16 x i32>
%2 = bitcast <8 x i64> %__B to <16 x i32>
%3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__I to <16 x i32>
%2 = bitcast <8 x i64> %__B to <16 x i32>
%3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__I to <16 x i32>
%2 = bitcast <8 x i64> %__B to <16 x i32>
%3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__I to <16 x i32>
%1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
ret <16 x float> %1
}

define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__I to <16 x i32>
%1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__I to <16 x i32>
%1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
ret <16 x float> %3
}

define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}
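
; Masked scalar fp arithmetic: only bit 0 of the mask matters, selecting
; between the computed scalar and the passthru (or zero) in element 0.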
define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_add_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_add_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <4 x float> %__B, i32 0
%vecext1.i.i = extractelement <4 x float> %__A, i32 0
%add.i.i = fadd float %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_add_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_add_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <4 x float> %__B, i32 0
%vecext1.i.i = extractelement <4 x float> %__A, i32 0
%add.i.i = fadd float %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_add_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_add_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <2 x double> %__B, i32 0
%vecext1.i.i = extractelement <2 x double> %__A, i32 0
%add.i.i = fadd double %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%vecext1.i = extractelement <2 x double> %__W, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_add_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_add_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <2 x double> %__B, i32 0
%vecext1.i.i = extractelement <2 x double> %__A, i32 0
%add.i.i = fadd double %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_sub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_sub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <4 x float> %__B, i32 0
%vecext1.i.i = extractelement <4 x float> %__A, i32 0
%sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_sub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_sub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <4 x float> %__B, i32 0
%vecext1.i.i = extractelement <4 x float> %__A, i32 0
%sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_sub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_sub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <2 x double> %__B, i32 0
%vecext1.i.i = extractelement <2 x double> %__A, i32 0
%sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%vecext1.i = extractelement <2 x double> %__W, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_sub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_sub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <2 x double> %__B, i32 0
%vecext1.i.i = extractelement <2 x double> %__A, i32 0
%sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_mul_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <4 x float> %__B, i32 0
%vecext1.i.i = extractelement <4 x float> %__A, i32 0
%mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_mul_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <4 x float> %__B, i32 0
%vecext1.i.i = extractelement <4 x float> %__A, i32 0
%mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_mul_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_mul_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <2 x double> %__B, i32 0
%vecext1.i.i = extractelement <2 x double> %__A, i32 0
%mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%vecext1.i = extractelement <2 x double> %__W, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_mul_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_mul_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%vecext.i.i = extractelement <2 x double> %__B, i32 0
%vecext1.i.i = extractelement <2 x double> %__A, i32 0
%mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
%0 = and i8 %__U, 1
%tobool.i = icmp eq i8 %0, 0
%cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__W, i64 0
  %3 = fdiv float %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = fdiv float %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, float %2, float 0.000000e+00
  %6 = insertelement <4 x float> %__A, float %5, i64 0
  ret <4 x float> %6
}

define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__W, i64 0
  %3 = fdiv double %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = fdiv double %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, double %2, double 0.000000e+00
  %6 = insertelement <2 x double> %__A, double %5, i64 0
  ret <2 x double> %6
}

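; The *_round_* FMA tests below call the @llvm.x86.avx512.vfmadd.*.512
; intrinsics with a rounding argument of i32 8, i.e.
; _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, which is printed as
; {rn-sae} in the assembly. fmsub/fnmadd/fnmsub have no intrinsic of their
; own here: the relevant operands are negated with an fsub from -0.0
; before the call.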
define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

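; The plain (non-rounding) 512-bit FMA tests below go through the generic
; @llvm.fma.* intrinsics instead. Negated operands again appear as an fsub
; from -0.0, which llc materializes as a vpxorq against a broadcast -0.0
; constant: a constant-pool reference on X86 and a RIP-relative load on
; X64, hence the different {{\.LCPI.*}} / {{.*}}(%rip) patterns.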
define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

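; The single-precision *_round_ps tests below mirror the pd versions with
; sixteen float lanes, an i16 mask (loaded with movzwl on X86), and the
; dword forms of the broadcast xor (vpxord ... {1to16}).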
define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

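; The remaining single-precision FMA tests use @llvm.fma.v16f32 without
; rounding control; the masked forms select against the i16 mask exactly as
; the pd versions do with i8.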
define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

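; The fmaddsub/fmsubadd rounding tests below use the dedicated
; @llvm.x86.avx512.vfmaddsub.pd.512 intrinsic, again with i32 8 for
; {rn-sae}; fmsubadd is expressed as fmaddsub with a negated C operand.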
define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

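; Without rounding control there is no dedicated intrinsic: fmaddsub is
; written as two @llvm.fma calls (one on C, one on -C) interleaved by a
; shufflevector, with the even result lanes taken from the subtracted
; variant and the odd lanes from the added one. A sketch of the pattern
; (value names are illustrative only):
;   %add = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %A, <8 x double> %B, <8 x double> %C)
;   %negc = fsub <8 x double> <double -0.0, ...>, %C
;   %sub = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %A, <8 x double> %B, <8 x double> %negc)
;   %res = shufflevector <8 x double> %sub, <8 x double> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
; fmsubadd swaps the two shuffle inputs, so the even lanes add and the odd
; lanes subtract; llc matches both shuffles to vfmaddsub/vfmsubadd.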
define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
ret <8 x double> %3
}

define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
ret <8 x double> %5
}

define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
ret <8 x double> %5
}

define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
ret <8 x double> %5
}

define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
ret <8 x double> %4
}

define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
ret <8 x double> %4
}
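
; The *_round_ps variants call the target intrinsic
; llvm.x86.avx512.vfmaddsub.ps.512 directly; the trailing i32 8 operand
; encodes round-to-nearest with suppress-all-exceptions, which is why the
; checks expect the {rn-sae} form of the instruction.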
define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}
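
; There is no separate IR intrinsic for a rounded fmsubadd: %__C is
; negated with an fsub from -0.0 and fed to the vfmaddsub intrinsic. In
; the unmasked test below the negation survives as a vpxord with a
; broadcast sign-bit constant; in the masked tests it is folded into a
; vfmsubadd instruction.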
define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}
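
; Non-rounded single-precision versions of the fmaddsub/fmsubadd pattern,
; built from llvm.fma.v16f32 plus a shufflevector, mirroring the pd tests
; above.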
define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
ret <16 x float> %5
}

define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
ret <16 x float> %5
}

define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
ret <16 x float> %5
}

define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
ret <16 x float> %4
}

define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
ret <16 x float> %4
}
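
; mask3 fmsub tests: fmsub is fma with %__C negated, and the mask3 form
; blends into the accumulator operand, hence the 231 encoding followed by
; a move of zmm2 into the return register.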
define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}
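
; mask3 fmsubadd tests: the round forms negate %__C and reuse the
; vfmaddsub intrinsic, the non-round forms use the two-fma-plus-shuffle
; pattern; both select against %__C as the passthrough.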
define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
ret <8 x double> %4
}

define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
ret <16 x float> %4
}
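
; fnmadd tests: %__A is negated before the fma/vfmadd call, which the
; backend is expected to fold into a vfnmadd instruction.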
define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}
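
; fnmsub tests: both the second multiplicand and the addend are negated
; before the fma/vfmadd call, matching vfnmsub.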
define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}
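
; Scalar ss tests: these operate on element 0 only. The non-round forms
; use scalar llvm.fma.f32 with an explicit test of bit 0 of %__U, while
; the round forms call llvm.x86.avx512.vfmadd.f32 and select on the low
; bit of the bitcast mask.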
define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %0
%7 = insertelement <4 x float> %__W, float %6, i64 0
ret <4 x float> %7
}

declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1

define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float 0.000000e+00
%7 = insertelement <4 x float> %__A, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__Y, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %2
%7 = insertelement <4 x float> %__Y, float %6, i64 0
ret <4 x float> %7
}
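
; fmsub ss tests: element 0 of the subtrahend is negated with a scalar
; fsub before the fma call, matching vfmsub213ss/vfmsub231ss.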
5086 define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5087 ; X86-LABEL: test_mm_mask_fmsub_ss:
5088 ; X86: # %bb.0: # %entry
5089 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5090 ; X86-NEXT: kmovw %eax, %k1
5091 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5094 ; X64-LABEL: test_mm_mask_fmsub_ss:
5095 ; X64: # %bb.0: # %entry
5096 ; X64-NEXT: kmovw %edi, %k1
5097 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5100 %0 = extractelement <4 x float> %__W, i64 0
5101 %1 = extractelement <4 x float> %__A, i64 0
5102 %.rhs.i = extractelement <4 x float> %__B, i64 0
5103 %2 = fsub float -0.000000e+00, %.rhs.i
5104 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5106 %tobool.i = icmp eq i8 %4, 0
5107 %vecext1.i = extractelement <4 x float> %__W, i32 0
5108 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5109 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5110 ret <4 x float> %vecins.i
5113 define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5114 ; X86-LABEL: test_mm_mask_fmsub_round_ss:
5115 ; X86: # %bb.0: # %entry
5116 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5117 ; X86-NEXT: kmovw %eax, %k1
5118 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5121 ; X64-LABEL: test_mm_mask_fmsub_round_ss:
5122 ; X64: # %bb.0: # %entry
5123 ; X64-NEXT: kmovw %edi, %k1
5124 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5127 %0 = extractelement <4 x float> %__W, i64 0
5128 %1 = extractelement <4 x float> %__A, i64 0
5129 %.rhs = extractelement <4 x float> %__B, i64 0
5130 %2 = fsub float -0.000000e+00, %.rhs
5131 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5132 %4 = bitcast i8 %__U to <8 x i1>
5133 %5 = extractelement <8 x i1> %4, i64 0
5134 %6 = select i1 %5, float %3, float %0
5135 %7 = insertelement <4 x float> %__W, float %6, i64 0
5139 define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5140 ; X86-LABEL: test_mm_maskz_fmsub_ss:
5141 ; X86: # %bb.0: # %entry
5142 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5143 ; X86-NEXT: kmovw %eax, %k1
5144 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5147 ; X64-LABEL: test_mm_maskz_fmsub_ss:
5148 ; X64: # %bb.0: # %entry
5149 ; X64-NEXT: kmovw %edi, %k1
5150 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5153 %0 = extractelement <4 x float> %__A, i64 0
5154 %1 = extractelement <4 x float> %__B, i64 0
5155 %.rhs.i = extractelement <4 x float> %__C, i64 0
5156 %2 = fsub float -0.000000e+00, %.rhs.i
5157 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5159 %tobool.i = icmp eq i8 %4, 0
5160 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5161 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5162 ret <4 x float> %vecins.i
5165 define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5166 ; X86-LABEL: test_mm_maskz_fmsub_round_ss:
5167 ; X86: # %bb.0: # %entry
5168 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5169 ; X86-NEXT: kmovw %eax, %k1
5170 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5173 ; X64-LABEL: test_mm_maskz_fmsub_round_ss:
5174 ; X64: # %bb.0: # %entry
5175 ; X64-NEXT: kmovw %edi, %k1
5176 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5179 %0 = extractelement <4 x float> %__A, i64 0
5180 %1 = extractelement <4 x float> %__B, i64 0
5181 %.rhs = extractelement <4 x float> %__C, i64 0
5182 %2 = fsub float -0.000000e+00, %.rhs
5183 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5184 %4 = bitcast i8 %__U to <8 x i1>
5185 %5 = extractelement <8 x i1> %4, i64 0
5186 %6 = select i1 %5, float %3, float 0.000000e+00
5187 %7 = insertelement <4 x float> %__A, float %6, i64 0
5191 define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5192 ; X86-LABEL: test_mm_mask3_fmsub_ss:
5193 ; X86: # %bb.0: # %entry
5194 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5195 ; X86-NEXT: kmovw %eax, %k1
5196 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5197 ; X86-NEXT: vmovaps %xmm2, %xmm0
5200 ; X64-LABEL: test_mm_mask3_fmsub_ss:
5201 ; X64: # %bb.0: # %entry
5202 ; X64-NEXT: kmovw %edi, %k1
5203 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5204 ; X64-NEXT: vmovaps %xmm2, %xmm0
5207 %0 = extractelement <4 x float> %__W, i64 0
5208 %1 = extractelement <4 x float> %__X, i64 0
5209 %.rhs.i = extractelement <4 x float> %__Y, i64 0
5210 %2 = fsub float -0.000000e+00, %.rhs.i
5211 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5213 %tobool.i = icmp eq i8 %4, 0
5214 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5215 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5216 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5217 ret <4 x float> %vecins.i
5220 define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5221 ; X86-LABEL: test_mm_mask3_fmsub_round_ss:
5222 ; X86: # %bb.0: # %entry
5223 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5224 ; X86-NEXT: kmovw %eax, %k1
5225 ; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5226 ; X86-NEXT: vmovaps %xmm2, %xmm0
5229 ; X64-LABEL: test_mm_mask3_fmsub_round_ss:
5230 ; X64: # %bb.0: # %entry
5231 ; X64-NEXT: kmovw %edi, %k1
5232 ; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5233 ; X64-NEXT: vmovaps %xmm2, %xmm0
5236 %0 = extractelement <4 x float> %__W, i64 0
5237 %1 = extractelement <4 x float> %__X, i64 0
5238 %.rhs = extractelement <4 x float> %__Y, i64 0
5239 %2 = fsub float -0.000000e+00, %.rhs
5240 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5241 %4 = bitcast i8 %__U to <8 x i1>
5242 %5 = extractelement <8 x i1> %4, i64 0
5243 %6 = select i1 %5, float %3, float %.rhs
5244 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5245 ret <4 x float> %7
5246 }
5248 define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5249 ; X86-LABEL: test_mm_mask_fnmadd_ss:
5250 ; X86: # %bb.0: # %entry
5251 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5252 ; X86-NEXT: kmovw %eax, %k1
5253 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5254 ; X86-NEXT: retl
5255 ;
5256 ; X64-LABEL: test_mm_mask_fnmadd_ss:
5257 ; X64: # %bb.0: # %entry
5258 ; X64-NEXT: kmovw %edi, %k1
5259 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5260 ; X64-NEXT: retq
5261 entry:
5262 %0 = extractelement <4 x float> %__W, i64 0
5263 %.rhs.i = extractelement <4 x float> %__A, i64 0
5264 %1 = fsub float -0.000000e+00, %.rhs.i
5265 %2 = extractelement <4 x float> %__B, i64 0
5266 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5267 %4 = and i8 %__U, 1
5268 %tobool.i = icmp eq i8 %4, 0
5269 %vecext1.i = extractelement <4 x float> %__W, i32 0
5270 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5271 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5272 ret <4 x float> %vecins.i
5273 }
5275 define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5276 ; X86-LABEL: test_mm_mask_fnmadd_round_ss:
5277 ; X86: # %bb.0: # %entry
5278 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5279 ; X86-NEXT: kmovw %eax, %k1
5280 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5281 ; X86-NEXT: retl
5282 ;
5283 ; X64-LABEL: test_mm_mask_fnmadd_round_ss:
5284 ; X64: # %bb.0: # %entry
5285 ; X64-NEXT: kmovw %edi, %k1
5286 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5287 ; X64-NEXT: retq
5288 entry:
5289 %0 = extractelement <4 x float> %__W, i64 0
5290 %.rhs = extractelement <4 x float> %__A, i64 0
5291 %1 = fsub float -0.000000e+00, %.rhs
5292 %2 = extractelement <4 x float> %__B, i64 0
5293 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5294 %4 = bitcast i8 %__U to <8 x i1>
5295 %5 = extractelement <8 x i1> %4, i64 0
5296 %6 = select i1 %5, float %3, float %0
5297 %7 = insertelement <4 x float> %__W, float %6, i64 0
5298 ret <4 x float> %7
5299 }
5301 define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5302 ; X86-LABEL: test_mm_maskz_fnmadd_ss:
5303 ; X86: # %bb.0: # %entry
5304 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5305 ; X86-NEXT: kmovw %eax, %k1
5306 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5307 ; X86-NEXT: retl
5308 ;
5309 ; X64-LABEL: test_mm_maskz_fnmadd_ss:
5310 ; X64: # %bb.0: # %entry
5311 ; X64-NEXT: kmovw %edi, %k1
5312 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5313 ; X64-NEXT: retq
5314 entry:
5315 %0 = extractelement <4 x float> %__A, i64 0
5316 %.rhs.i = extractelement <4 x float> %__B, i64 0
5317 %1 = fsub float -0.000000e+00, %.rhs.i
5318 %2 = extractelement <4 x float> %__C, i64 0
5319 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5320 %4 = and i8 %__U, 1
5321 %tobool.i = icmp eq i8 %4, 0
5322 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5323 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5324 ret <4 x float> %vecins.i
5325 }
5327 define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5328 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
5329 ; X86: # %bb.0: # %entry
5330 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5331 ; X86-NEXT: kmovw %eax, %k1
5332 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5333 ; X86-NEXT: retl
5334 ;
5335 ; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
5336 ; X64: # %bb.0: # %entry
5337 ; X64-NEXT: kmovw %edi, %k1
5338 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5339 ; X64-NEXT: retq
5340 entry:
5341 %0 = extractelement <4 x float> %__A, i64 0
5342 %.rhs = extractelement <4 x float> %__B, i64 0
5343 %1 = fsub float -0.000000e+00, %.rhs
5344 %2 = extractelement <4 x float> %__C, i64 0
5345 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5346 %4 = bitcast i8 %__U to <8 x i1>
5347 %5 = extractelement <8 x i1> %4, i64 0
5348 %6 = select i1 %5, float %3, float 0.000000e+00
5349 %7 = insertelement <4 x float> %__A, float %6, i64 0
5350 ret <4 x float> %7
5351 }
5353 define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5354 ; X86-LABEL: test_mm_mask3_fnmadd_ss:
5355 ; X86: # %bb.0: # %entry
5356 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5357 ; X86-NEXT: kmovw %eax, %k1
5358 ; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5359 ; X86-NEXT: vmovaps %xmm2, %xmm0
5360 ; X86-NEXT: retl
5361 ;
5362 ; X64-LABEL: test_mm_mask3_fnmadd_ss:
5363 ; X64: # %bb.0: # %entry
5364 ; X64-NEXT: kmovw %edi, %k1
5365 ; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5366 ; X64-NEXT: vmovaps %xmm2, %xmm0
5367 ; X64-NEXT: retq
5368 entry:
5369 %0 = extractelement <4 x float> %__W, i64 0
5370 %.rhs.i = extractelement <4 x float> %__X, i64 0
5371 %1 = fsub float -0.000000e+00, %.rhs.i
5372 %2 = extractelement <4 x float> %__Y, i64 0
5373 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5374 %4 = and i8 %__U, 1
5375 %tobool.i = icmp eq i8 %4, 0
5376 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5377 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5378 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5379 ret <4 x float> %vecins.i
5380 }
5382 define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5383 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
5384 ; X86: # %bb.0: # %entry
5385 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5386 ; X86-NEXT: kmovw %eax, %k1
5387 ; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5388 ; X86-NEXT: vmovaps %xmm2, %xmm0
5389 ; X86-NEXT: retl
5390 ;
5391 ; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
5392 ; X64: # %bb.0: # %entry
5393 ; X64-NEXT: kmovw %edi, %k1
5394 ; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5395 ; X64-NEXT: vmovaps %xmm2, %xmm0
5396 ; X64-NEXT: retq
5397 entry:
5398 %0 = extractelement <4 x float> %__W, i64 0
5399 %.rhs = extractelement <4 x float> %__X, i64 0
5400 %1 = fsub float -0.000000e+00, %.rhs
5401 %2 = extractelement <4 x float> %__Y, i64 0
5402 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5403 %4 = bitcast i8 %__U to <8 x i1>
5404 %5 = extractelement <8 x i1> %4, i64 0
5405 %6 = select i1 %5, float %3, float %2
5406 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5407 ret <4 x float> %7
5408 }
5410 define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5411 ; X86-LABEL: test_mm_mask_fnmsub_ss:
5412 ; X86: # %bb.0: # %entry
5413 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5414 ; X86-NEXT: kmovw %eax, %k1
5415 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5416 ; X86-NEXT: retl
5417 ;
5418 ; X64-LABEL: test_mm_mask_fnmsub_ss:
5419 ; X64: # %bb.0: # %entry
5420 ; X64-NEXT: kmovw %edi, %k1
5421 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5422 ; X64-NEXT: retq
5423 entry:
5424 %0 = extractelement <4 x float> %__W, i64 0
5425 %.rhs.i = extractelement <4 x float> %__A, i64 0
5426 %1 = fsub float -0.000000e+00, %.rhs.i
5427 %.rhs7.i = extractelement <4 x float> %__B, i64 0
5428 %2 = fsub float -0.000000e+00, %.rhs7.i
5429 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5430 %4 = and i8 %__U, 1
5431 %tobool.i = icmp eq i8 %4, 0
5432 %vecext2.i = extractelement <4 x float> %__W, i32 0
5433 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5434 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5435 ret <4 x float> %vecins.i
5436 }
5438 define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5439 ; X86-LABEL: test_mm_mask_fnmsub_round_ss:
5440 ; X86: # %bb.0: # %entry
5441 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5442 ; X86-NEXT: kmovw %eax, %k1
5443 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5444 ; X86-NEXT: retl
5445 ;
5446 ; X64-LABEL: test_mm_mask_fnmsub_round_ss:
5447 ; X64: # %bb.0: # %entry
5448 ; X64-NEXT: kmovw %edi, %k1
5449 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5450 ; X64-NEXT: retq
5451 entry:
5452 %0 = extractelement <4 x float> %__W, i64 0
5453 %.rhs = extractelement <4 x float> %__A, i64 0
5454 %1 = fsub float -0.000000e+00, %.rhs
5455 %.rhs2 = extractelement <4 x float> %__B, i64 0
5456 %2 = fsub float -0.000000e+00, %.rhs2
5457 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5458 %4 = bitcast i8 %__U to <8 x i1>
5459 %5 = extractelement <8 x i1> %4, i64 0
5460 %6 = select i1 %5, float %3, float %0
5461 %7 = insertelement <4 x float> %__W, float %6, i64 0
5462 ret <4 x float> %7
5463 }
5465 define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5466 ; X86-LABEL: test_mm_maskz_fnmsub_ss:
5467 ; X86: # %bb.0: # %entry
5468 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5469 ; X86-NEXT: kmovw %eax, %k1
5470 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5471 ; X86-NEXT: retl
5472 ;
5473 ; X64-LABEL: test_mm_maskz_fnmsub_ss:
5474 ; X64: # %bb.0: # %entry
5475 ; X64-NEXT: kmovw %edi, %k1
5476 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5477 ; X64-NEXT: retq
5478 entry:
5479 %0 = extractelement <4 x float> %__A, i64 0
5480 %.rhs.i = extractelement <4 x float> %__B, i64 0
5481 %1 = fsub float -0.000000e+00, %.rhs.i
5482 %.rhs5.i = extractelement <4 x float> %__C, i64 0
5483 %2 = fsub float -0.000000e+00, %.rhs5.i
5484 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5485 %4 = and i8 %__U, 1
5486 %tobool.i = icmp eq i8 %4, 0
5487 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5488 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5489 ret <4 x float> %vecins.i
5490 }
5492 define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5493 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
5494 ; X86: # %bb.0: # %entry
5495 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5496 ; X86-NEXT: kmovw %eax, %k1
5497 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5498 ; X86-NEXT: retl
5499 ;
5500 ; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
5501 ; X64: # %bb.0: # %entry
5502 ; X64-NEXT: kmovw %edi, %k1
5503 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5504 ; X64-NEXT: retq
5505 entry:
5506 %0 = extractelement <4 x float> %__A, i64 0
5507 %.rhs = extractelement <4 x float> %__B, i64 0
5508 %1 = fsub float -0.000000e+00, %.rhs
5509 %.rhs2 = extractelement <4 x float> %__C, i64 0
5510 %2 = fsub float -0.000000e+00, %.rhs2
5511 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5512 %4 = bitcast i8 %__U to <8 x i1>
5513 %5 = extractelement <8 x i1> %4, i64 0
5514 %6 = select i1 %5, float %3, float 0.000000e+00
5515 %7 = insertelement <4 x float> %__A, float %6, i64 0
5516 ret <4 x float> %7
5517 }
5519 define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5520 ; X86-LABEL: test_mm_mask3_fnmsub_ss:
5521 ; X86: # %bb.0: # %entry
5522 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5523 ; X86-NEXT: kmovw %eax, %k1
5524 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5525 ; X86-NEXT: vmovaps %xmm2, %xmm0
5526 ; X86-NEXT: retl
5527 ;
5528 ; X64-LABEL: test_mm_mask3_fnmsub_ss:
5529 ; X64: # %bb.0: # %entry
5530 ; X64-NEXT: kmovw %edi, %k1
5531 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5532 ; X64-NEXT: vmovaps %xmm2, %xmm0
5533 ; X64-NEXT: retq
5534 entry:
5535 %0 = extractelement <4 x float> %__W, i64 0
5536 %.rhs.i = extractelement <4 x float> %__X, i64 0
5537 %1 = fsub float -0.000000e+00, %.rhs.i
5538 %.rhs7.i = extractelement <4 x float> %__Y, i64 0
5539 %2 = fsub float -0.000000e+00, %.rhs7.i
5540 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5541 %4 = and i8 %__U, 1
5542 %tobool.i = icmp eq i8 %4, 0
5543 %vecext2.i = extractelement <4 x float> %__Y, i32 0
5544 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5545 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5546 ret <4 x float> %vecins.i
5547 }
5549 define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5550 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
5551 ; X86: # %bb.0: # %entry
5552 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5553 ; X86-NEXT: kmovw %eax, %k1
5554 ; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5555 ; X86-NEXT: vmovaps %xmm2, %xmm0
5556 ; X86-NEXT: retl
5557 ;
5558 ; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
5559 ; X64: # %bb.0: # %entry
5560 ; X64-NEXT: kmovw %edi, %k1
5561 ; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5562 ; X64-NEXT: vmovaps %xmm2, %xmm0
5563 ; X64-NEXT: retq
5564 entry:
5565 %0 = extractelement <4 x float> %__W, i64 0
5566 %.rhs = extractelement <4 x float> %__X, i64 0
5567 %1 = fsub float -0.000000e+00, %.rhs
5568 %.rhs1 = extractelement <4 x float> %__Y, i64 0
5569 %2 = fsub float -0.000000e+00, %.rhs1
5570 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5571 %4 = bitcast i8 %__U to <8 x i1>
5572 %5 = extractelement <8 x i1> %4, i64 0
5573 %6 = select i1 %5, float %3, float %.rhs1
5574 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5575 ret <4 x float> %7
5576 }
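; NOTE: The <2 x double> (sd) tests below mirror the <4 x float> (ss) masked FMA tests above, using llvm.fma.f64 and llvm.x86.avx512.vfmadd.f64 in place of the f32 intrinsics.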
5578 define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5579 ; X86-LABEL: test_mm_mask_fmadd_sd:
5580 ; X86: # %bb.0: # %entry
5581 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5582 ; X86-NEXT: kmovw %eax, %k1
5583 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5584 ; X86-NEXT: retl
5585 ;
5586 ; X64-LABEL: test_mm_mask_fmadd_sd:
5587 ; X64: # %bb.0: # %entry
5588 ; X64-NEXT: kmovw %edi, %k1
5589 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5590 ; X64-NEXT: retq
5591 entry:
5592 %0 = extractelement <2 x double> %__W, i64 0
5593 %1 = extractelement <2 x double> %__A, i64 0
5594 %2 = extractelement <2 x double> %__B, i64 0
5595 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5596 %4 = and i8 %__U, 1
5597 %tobool.i = icmp eq i8 %4, 0
5598 %vecext1.i = extractelement <2 x double> %__W, i32 0
5599 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5600 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5601 ret <2 x double> %vecins.i
5602 }
5604 define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5605 ; X86-LABEL: test_mm_mask_fmadd_round_sd:
5606 ; X86: # %bb.0: # %entry
5607 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5608 ; X86-NEXT: kmovw %eax, %k1
5609 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5610 ; X86-NEXT: retl
5611 ;
5612 ; X64-LABEL: test_mm_mask_fmadd_round_sd:
5613 ; X64: # %bb.0: # %entry
5614 ; X64-NEXT: kmovw %edi, %k1
5615 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5616 ; X64-NEXT: retq
5617 entry:
5618 %0 = extractelement <2 x double> %__W, i64 0
5619 %1 = extractelement <2 x double> %__A, i64 0
5620 %2 = extractelement <2 x double> %__B, i64 0
5621 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5622 %4 = bitcast i8 %__U to <8 x i1>
5623 %5 = extractelement <8 x i1> %4, i64 0
5624 %6 = select i1 %5, double %3, double %0
5625 %7 = insertelement <2 x double> %__W, double %6, i64 0
5626 ret <2 x double> %7
5627 }
5629 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1
5631 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5632 ; X86-LABEL: test_mm_maskz_fmadd_sd:
5633 ; X86: # %bb.0: # %entry
5634 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5635 ; X86-NEXT: kmovw %eax, %k1
5636 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5637 ; X86-NEXT: retl
5638 ;
5639 ; X64-LABEL: test_mm_maskz_fmadd_sd:
5640 ; X64: # %bb.0: # %entry
5641 ; X64-NEXT: kmovw %edi, %k1
5642 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5643 ; X64-NEXT: retq
5644 entry:
5645 %0 = extractelement <2 x double> %__A, i64 0
5646 %1 = extractelement <2 x double> %__B, i64 0
5647 %2 = extractelement <2 x double> %__C, i64 0
5648 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5649 %4 = and i8 %__U, 1
5650 %tobool.i = icmp eq i8 %4, 0
5651 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5652 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5653 ret <2 x double> %vecins.i
5654 }
5656 define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5657 ; X86-LABEL: test_mm_maskz_fmadd_round_sd:
5658 ; X86: # %bb.0: # %entry
5659 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5660 ; X86-NEXT: kmovw %eax, %k1
5661 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5662 ; X86-NEXT: retl
5663 ;
5664 ; X64-LABEL: test_mm_maskz_fmadd_round_sd:
5665 ; X64: # %bb.0: # %entry
5666 ; X64-NEXT: kmovw %edi, %k1
5667 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5668 ; X64-NEXT: retq
5669 entry:
5670 %0 = extractelement <2 x double> %__A, i64 0
5671 %1 = extractelement <2 x double> %__B, i64 0
5672 %2 = extractelement <2 x double> %__C, i64 0
5673 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5674 %4 = bitcast i8 %__U to <8 x i1>
5675 %5 = extractelement <8 x i1> %4, i64 0
5676 %6 = select i1 %5, double %3, double 0.000000e+00
5677 %7 = insertelement <2 x double> %__A, double %6, i64 0
5678 ret <2 x double> %7
5679 }
5681 define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5682 ; X86-LABEL: test_mm_mask3_fmadd_sd:
5683 ; X86: # %bb.0: # %entry
5684 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5685 ; X86-NEXT: kmovw %eax, %k1
5686 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5687 ; X86-NEXT: vmovapd %xmm2, %xmm0
5688 ; X86-NEXT: retl
5689 ;
5690 ; X64-LABEL: test_mm_mask3_fmadd_sd:
5691 ; X64: # %bb.0: # %entry
5692 ; X64-NEXT: kmovw %edi, %k1
5693 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5694 ; X64-NEXT: vmovapd %xmm2, %xmm0
5695 ; X64-NEXT: retq
5696 entry:
5697 %0 = extractelement <2 x double> %__W, i64 0
5698 %1 = extractelement <2 x double> %__X, i64 0
5699 %2 = extractelement <2 x double> %__Y, i64 0
5700 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5701 %4 = and i8 %__U, 1
5702 %tobool.i = icmp eq i8 %4, 0
5703 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5704 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5705 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5706 ret <2 x double> %vecins.i
5707 }
5709 define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5710 ; X86-LABEL: test_mm_mask3_fmadd_round_sd:
5711 ; X86: # %bb.0: # %entry
5712 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5713 ; X86-NEXT: kmovw %eax, %k1
5714 ; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5715 ; X86-NEXT: vmovapd %xmm2, %xmm0
5716 ; X86-NEXT: retl
5717 ;
5718 ; X64-LABEL: test_mm_mask3_fmadd_round_sd:
5719 ; X64: # %bb.0: # %entry
5720 ; X64-NEXT: kmovw %edi, %k1
5721 ; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5722 ; X64-NEXT: vmovapd %xmm2, %xmm0
5723 ; X64-NEXT: retq
5724 entry:
5725 %0 = extractelement <2 x double> %__W, i64 0
5726 %1 = extractelement <2 x double> %__X, i64 0
5727 %2 = extractelement <2 x double> %__Y, i64 0
5728 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5729 %4 = bitcast i8 %__U to <8 x i1>
5730 %5 = extractelement <8 x i1> %4, i64 0
5731 %6 = select i1 %5, double %3, double %2
5732 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5733 ret <2 x double> %7
5734 }
5736 define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5737 ; X86-LABEL: test_mm_mask_fmsub_sd:
5738 ; X86: # %bb.0: # %entry
5739 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5740 ; X86-NEXT: kmovw %eax, %k1
5741 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5742 ; X86-NEXT: retl
5743 ;
5744 ; X64-LABEL: test_mm_mask_fmsub_sd:
5745 ; X64: # %bb.0: # %entry
5746 ; X64-NEXT: kmovw %edi, %k1
5747 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5748 ; X64-NEXT: retq
5749 entry:
5750 %0 = extractelement <2 x double> %__W, i64 0
5751 %1 = extractelement <2 x double> %__A, i64 0
5752 %.rhs.i = extractelement <2 x double> %__B, i64 0
5753 %2 = fsub double -0.000000e+00, %.rhs.i
5754 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5755 %4 = and i8 %__U, 1
5756 %tobool.i = icmp eq i8 %4, 0
5757 %vecext1.i = extractelement <2 x double> %__W, i32 0
5758 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5759 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5760 ret <2 x double> %vecins.i
5761 }
5763 define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5764 ; X86-LABEL: test_mm_mask_fmsub_round_sd:
5765 ; X86: # %bb.0: # %entry
5766 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5767 ; X86-NEXT: kmovw %eax, %k1
5768 ; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5769 ; X86-NEXT: retl
5770 ;
5771 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
5772 ; X64: # %bb.0: # %entry
5773 ; X64-NEXT: kmovw %edi, %k1
5774 ; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5775 ; X64-NEXT: retq
5776 entry:
5777 %0 = extractelement <2 x double> %__W, i64 0
5778 %1 = extractelement <2 x double> %__A, i64 0
5779 %.rhs = extractelement <2 x double> %__B, i64 0
5780 %2 = fsub double -0.000000e+00, %.rhs
5781 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5782 %4 = bitcast i8 %__U to <8 x i1>
5783 %5 = extractelement <8 x i1> %4, i64 0
5784 %6 = select i1 %5, double %3, double %0
5785 %7 = insertelement <2 x double> %__W, double %6, i64 0
5786 ret <2 x double> %7
5787 }
5789 define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5790 ; X86-LABEL: test_mm_maskz_fmsub_sd:
5791 ; X86: # %bb.0: # %entry
5792 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5793 ; X86-NEXT: kmovw %eax, %k1
5794 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5795 ; X86-NEXT: retl
5796 ;
5797 ; X64-LABEL: test_mm_maskz_fmsub_sd:
5798 ; X64: # %bb.0: # %entry
5799 ; X64-NEXT: kmovw %edi, %k1
5800 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5801 ; X64-NEXT: retq
5802 entry:
5803 %0 = extractelement <2 x double> %__A, i64 0
5804 %1 = extractelement <2 x double> %__B, i64 0
5805 %.rhs.i = extractelement <2 x double> %__C, i64 0
5806 %2 = fsub double -0.000000e+00, %.rhs.i
5807 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5808 %4 = and i8 %__U, 1
5809 %tobool.i = icmp eq i8 %4, 0
5810 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5811 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5812 ret <2 x double> %vecins.i
5813 }
5815 define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5816 ; X86-LABEL: test_mm_maskz_fmsub_round_sd:
5817 ; X86: # %bb.0: # %entry
5818 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5819 ; X86-NEXT: kmovw %eax, %k1
5820 ; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5821 ; X86-NEXT: retl
5822 ;
5823 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
5824 ; X64: # %bb.0: # %entry
5825 ; X64-NEXT: kmovw %edi, %k1
5826 ; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5827 ; X64-NEXT: retq
5828 entry:
5829 %0 = extractelement <2 x double> %__A, i64 0
5830 %1 = extractelement <2 x double> %__B, i64 0
5831 %.rhs = extractelement <2 x double> %__C, i64 0
5832 %2 = fsub double -0.000000e+00, %.rhs
5833 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5834 %4 = bitcast i8 %__U to <8 x i1>
5835 %5 = extractelement <8 x i1> %4, i64 0
5836 %6 = select i1 %5, double %3, double 0.000000e+00
5837 %7 = insertelement <2 x double> %__A, double %6, i64 0
5838 ret <2 x double> %7
5839 }
5841 define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5842 ; X86-LABEL: test_mm_mask3_fmsub_sd:
5843 ; X86: # %bb.0: # %entry
5844 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5845 ; X86-NEXT: kmovw %eax, %k1
5846 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5847 ; X86-NEXT: vmovapd %xmm2, %xmm0
5848 ; X86-NEXT: retl
5849 ;
5850 ; X64-LABEL: test_mm_mask3_fmsub_sd:
5851 ; X64: # %bb.0: # %entry
5852 ; X64-NEXT: kmovw %edi, %k1
5853 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5854 ; X64-NEXT: vmovapd %xmm2, %xmm0
5855 ; X64-NEXT: retq
5856 entry:
5857 %0 = extractelement <2 x double> %__W, i64 0
5858 %1 = extractelement <2 x double> %__X, i64 0
5859 %.rhs.i = extractelement <2 x double> %__Y, i64 0
5860 %2 = fsub double -0.000000e+00, %.rhs.i
5861 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5862 %4 = and i8 %__U, 1
5863 %tobool.i = icmp eq i8 %4, 0
5864 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5865 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5866 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5867 ret <2 x double> %vecins.i
5868 }
5870 define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5871 ; X86-LABEL: test_mm_mask3_fmsub_round_sd:
5872 ; X86: # %bb.0: # %entry
5873 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5874 ; X86-NEXT: kmovw %eax, %k1
5875 ; X86-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5876 ; X86-NEXT: vmovapd %xmm2, %xmm0
5877 ; X86-NEXT: retl
5878 ;
5879 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
5880 ; X64: # %bb.0: # %entry
5881 ; X64-NEXT: kmovw %edi, %k1
5882 ; X64-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5883 ; X64-NEXT: vmovapd %xmm2, %xmm0
5884 ; X64-NEXT: retq
5885 entry:
5886 %0 = extractelement <2 x double> %__W, i64 0
5887 %1 = extractelement <2 x double> %__X, i64 0
5888 %.rhs = extractelement <2 x double> %__Y, i64 0
5889 %2 = fsub double -0.000000e+00, %.rhs
5890 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5891 %4 = bitcast i8 %__U to <8 x i1>
5892 %5 = extractelement <8 x i1> %4, i64 0
5893 %6 = select i1 %5, double %3, double %.rhs
5894 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5895 ret <2 x double> %7
5896 }
5898 define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5899 ; X86-LABEL: test_mm_mask_fnmadd_sd:
5900 ; X86: # %bb.0: # %entry
5901 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5902 ; X86-NEXT: kmovw %eax, %k1
5903 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5904 ; X86-NEXT: retl
5905 ;
5906 ; X64-LABEL: test_mm_mask_fnmadd_sd:
5907 ; X64: # %bb.0: # %entry
5908 ; X64-NEXT: kmovw %edi, %k1
5909 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5910 ; X64-NEXT: retq
5911 entry:
5912 %0 = extractelement <2 x double> %__W, i64 0
5913 %.rhs.i = extractelement <2 x double> %__A, i64 0
5914 %1 = fsub double -0.000000e+00, %.rhs.i
5915 %2 = extractelement <2 x double> %__B, i64 0
5916 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5917 %4 = and i8 %__U, 1
5918 %tobool.i = icmp eq i8 %4, 0
5919 %vecext1.i = extractelement <2 x double> %__W, i32 0
5920 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5921 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5922 ret <2 x double> %vecins.i
5923 }
5925 define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5926 ; X86-LABEL: test_mm_mask_fnmadd_round_sd:
5927 ; X86: # %bb.0: # %entry
5928 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5929 ; X86-NEXT: kmovw %eax, %k1
5930 ; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5931 ; X86-NEXT: retl
5932 ;
5933 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
5934 ; X64: # %bb.0: # %entry
5935 ; X64-NEXT: kmovw %edi, %k1
5936 ; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5937 ; X64-NEXT: retq
5938 entry:
5939 %0 = extractelement <2 x double> %__W, i64 0
5940 %.rhs = extractelement <2 x double> %__A, i64 0
5941 %1 = fsub double -0.000000e+00, %.rhs
5942 %2 = extractelement <2 x double> %__B, i64 0
5943 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5944 %4 = bitcast i8 %__U to <8 x i1>
5945 %5 = extractelement <8 x i1> %4, i64 0
5946 %6 = select i1 %5, double %3, double %0
5947 %7 = insertelement <2 x double> %__W, double %6, i64 0
5948 ret <2 x double> %7
5949 }
5951 define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5952 ; X86-LABEL: test_mm_maskz_fnmadd_sd:
5953 ; X86: # %bb.0: # %entry
5954 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5955 ; X86-NEXT: kmovw %eax, %k1
5956 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5957 ; X86-NEXT: retl
5958 ;
5959 ; X64-LABEL: test_mm_maskz_fnmadd_sd:
5960 ; X64: # %bb.0: # %entry
5961 ; X64-NEXT: kmovw %edi, %k1
5962 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5963 ; X64-NEXT: retq
5964 entry:
5965 %0 = extractelement <2 x double> %__A, i64 0
5966 %.rhs.i = extractelement <2 x double> %__B, i64 0
5967 %1 = fsub double -0.000000e+00, %.rhs.i
5968 %2 = extractelement <2 x double> %__C, i64 0
5969 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5970 %4 = and i8 %__U, 1
5971 %tobool.i = icmp eq i8 %4, 0
5972 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5973 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5974 ret <2 x double> %vecins.i
5975 }
5977 define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5978 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
5979 ; X86: # %bb.0: # %entry
5980 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5981 ; X86-NEXT: kmovw %eax, %k1
5982 ; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5983 ; X86-NEXT: retl
5984 ;
5985 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
5986 ; X64: # %bb.0: # %entry
5987 ; X64-NEXT: kmovw %edi, %k1
5988 ; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5989 ; X64-NEXT: retq
5990 entry:
5991 %0 = extractelement <2 x double> %__A, i64 0
5992 %.rhs = extractelement <2 x double> %__B, i64 0
5993 %1 = fsub double -0.000000e+00, %.rhs
5994 %2 = extractelement <2 x double> %__C, i64 0
5995 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5996 %4 = bitcast i8 %__U to <8 x i1>
5997 %5 = extractelement <8 x i1> %4, i64 0
5998 %6 = select i1 %5, double %3, double 0.000000e+00
5999 %7 = insertelement <2 x double> %__A, double %6, i64 0
6000 ret <2 x double> %7
6001 }
6003 define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6004 ; X86-LABEL: test_mm_mask3_fnmadd_sd:
6005 ; X86: # %bb.0: # %entry
6006 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6007 ; X86-NEXT: kmovw %eax, %k1
6008 ; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
6009 ; X86-NEXT: vmovapd %xmm2, %xmm0
6010 ; X86-NEXT: retl
6011 ;
6012 ; X64-LABEL: test_mm_mask3_fnmadd_sd:
6013 ; X64: # %bb.0: # %entry
6014 ; X64-NEXT: kmovw %edi, %k1
6015 ; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
6016 ; X64-NEXT: vmovapd %xmm2, %xmm0
6017 ; X64-NEXT: retq
6018 entry:
6019 %0 = extractelement <2 x double> %__W, i64 0
6020 %.rhs.i = extractelement <2 x double> %__X, i64 0
6021 %1 = fsub double -0.000000e+00, %.rhs.i
6022 %2 = extractelement <2 x double> %__Y, i64 0
6023 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6024 %4 = and i8 %__U, 1
6025 %tobool.i = icmp eq i8 %4, 0
6026 %vecext1.i = extractelement <2 x double> %__Y, i32 0
6027 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
6028 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6029 ret <2 x double> %vecins.i
6030 }
6032 define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6033 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
6034 ; X86: # %bb.0: # %entry
6035 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6036 ; X86-NEXT: kmovw %eax, %k1
6037 ; X86-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6038 ; X86-NEXT: vmovapd %xmm2, %xmm0
6039 ; X86-NEXT: retl
6040 ;
6041 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
6042 ; X64: # %bb.0: # %entry
6043 ; X64-NEXT: kmovw %edi, %k1
6044 ; X64-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6045 ; X64-NEXT: vmovapd %xmm2, %xmm0
6046 ; X64-NEXT: retq
6047 entry:
6048 %0 = extractelement <2 x double> %__W, i64 0
6049 %.rhs = extractelement <2 x double> %__X, i64 0
6050 %1 = fsub double -0.000000e+00, %.rhs
6051 %2 = extractelement <2 x double> %__Y, i64 0
6052 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6053 %4 = bitcast i8 %__U to <8 x i1>
6054 %5 = extractelement <8 x i1> %4, i64 0
6055 %6 = select i1 %5, double %3, double %2
6056 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6057 ret <2 x double> %7
6058 }
6060 define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6061 ; X86-LABEL: test_mm_mask_fnmsub_sd:
6062 ; X86: # %bb.0: # %entry
6063 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6064 ; X86-NEXT: kmovw %eax, %k1
6065 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6066 ; X86-NEXT: retl
6067 ;
6068 ; X64-LABEL: test_mm_mask_fnmsub_sd:
6069 ; X64: # %bb.0: # %entry
6070 ; X64-NEXT: kmovw %edi, %k1
6071 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6072 ; X64-NEXT: retq
6073 entry:
6074 %0 = extractelement <2 x double> %__W, i64 0
6075 %.rhs.i = extractelement <2 x double> %__A, i64 0
6076 %1 = fsub double -0.000000e+00, %.rhs.i
6077 %.rhs7.i = extractelement <2 x double> %__B, i64 0
6078 %2 = fsub double -0.000000e+00, %.rhs7.i
6079 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6080 %4 = and i8 %__U, 1
6081 %tobool.i = icmp eq i8 %4, 0
6082 %vecext2.i = extractelement <2 x double> %__W, i32 0
6083 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6084 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
6085 ret <2 x double> %vecins.i
6086 }
6088 define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6089 ; X86-LABEL: test_mm_mask_fnmsub_round_sd:
6090 ; X86: # %bb.0: # %entry
6091 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6092 ; X86-NEXT: kmovw %eax, %k1
6093 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6094 ; X86-NEXT: retl
6095 ;
6096 ; X64-LABEL: test_mm_mask_fnmsub_round_sd:
6097 ; X64: # %bb.0: # %entry
6098 ; X64-NEXT: kmovw %edi, %k1
6099 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6100 ; X64-NEXT: retq
6101 entry:
6102 %0 = extractelement <2 x double> %__W, i64 0
6103 %.rhs = extractelement <2 x double> %__A, i64 0
6104 %1 = fsub double -0.000000e+00, %.rhs
6105 %.rhs2 = extractelement <2 x double> %__B, i64 0
6106 %2 = fsub double -0.000000e+00, %.rhs2
6107 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6108 %4 = bitcast i8 %__U to <8 x i1>
6109 %5 = extractelement <8 x i1> %4, i64 0
6110 %6 = select i1 %5, double %3, double %0
6111 %7 = insertelement <2 x double> %__W, double %6, i64 0
6112 ret <2 x double> %7
6113 }
6115 define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6116 ; X86-LABEL: test_mm_maskz_fnmsub_sd:
6117 ; X86: # %bb.0: # %entry
6118 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6119 ; X86-NEXT: kmovw %eax, %k1
6120 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6121 ; X86-NEXT: retl
6122 ;
6123 ; X64-LABEL: test_mm_maskz_fnmsub_sd:
6124 ; X64: # %bb.0: # %entry
6125 ; X64-NEXT: kmovw %edi, %k1
6126 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6127 ; X64-NEXT: retq
6128 entry:
6129 %0 = extractelement <2 x double> %__A, i64 0
6130 %.rhs.i = extractelement <2 x double> %__B, i64 0
6131 %1 = fsub double -0.000000e+00, %.rhs.i
6132 %.rhs5.i = extractelement <2 x double> %__C, i64 0
6133 %2 = fsub double -0.000000e+00, %.rhs5.i
6134 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6135 %4 = and i8 %__U, 1
6136 %tobool.i = icmp eq i8 %4, 0
6137 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
6138 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
6139 ret <2 x double> %vecins.i
6140 }
6142 define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6143 ; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
6144 ; X86: # %bb.0: # %entry
6145 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6146 ; X86-NEXT: kmovw %eax, %k1
6147 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6148 ; X86-NEXT: retl
6149 ;
6150 ; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
6151 ; X64: # %bb.0: # %entry
6152 ; X64-NEXT: kmovw %edi, %k1
6153 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6154 ; X64-NEXT: retq
6155 entry:
6156 %0 = extractelement <2 x double> %__A, i64 0
6157 %.rhs = extractelement <2 x double> %__B, i64 0
6158 %1 = fsub double -0.000000e+00, %.rhs
6159 %.rhs2 = extractelement <2 x double> %__C, i64 0
6160 %2 = fsub double -0.000000e+00, %.rhs2
6161 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6162 %4 = bitcast i8 %__U to <8 x i1>
6163 %5 = extractelement <8 x i1> %4, i64 0
6164 %6 = select i1 %5, double %3, double 0.000000e+00
6165 %7 = insertelement <2 x double> %__A, double %6, i64 0
6166 ret <2 x double> %7
6167 }
6169 define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6170 ; X86-LABEL: test_mm_mask3_fnmsub_sd:
6171 ; X86: # %bb.0: # %entry
6172 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6173 ; X86-NEXT: kmovw %eax, %k1
6174 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
6175 ; X86-NEXT: vmovapd %xmm2, %xmm0
6176 ; X86-NEXT: retl
6177 ;
6178 ; X64-LABEL: test_mm_mask3_fnmsub_sd:
6179 ; X64: # %bb.0: # %entry
6180 ; X64-NEXT: kmovw %edi, %k1
6181 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
6182 ; X64-NEXT: vmovapd %xmm2, %xmm0
6183 ; X64-NEXT: retq
6184 entry:
6185 %0 = extractelement <2 x double> %__W, i64 0
6186 %.rhs.i = extractelement <2 x double> %__X, i64 0
6187 %1 = fsub double -0.000000e+00, %.rhs.i
6188 %.rhs7.i = extractelement <2 x double> %__Y, i64 0
6189 %2 = fsub double -0.000000e+00, %.rhs7.i
6190 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6191 %4 = and i8 %__U, 1
6192 %tobool.i = icmp eq i8 %4, 0
6193 %vecext2.i = extractelement <2 x double> %__Y, i32 0
6194 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6195 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6196 ret <2 x double> %vecins.i
6197 }
6199 define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6200 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
6201 ; X86: # %bb.0: # %entry
6202 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6203 ; X86-NEXT: kmovw %eax, %k1
6204 ; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6205 ; X86-NEXT: vmovapd %xmm2, %xmm0
6206 ; X86-NEXT: retl
6207 ;
6208 ; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
6209 ; X64: # %bb.0: # %entry
6210 ; X64-NEXT: kmovw %edi, %k1
6211 ; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6212 ; X64-NEXT: vmovapd %xmm2, %xmm0
6213 ; X64-NEXT: retq
6214 entry:
6215 %0 = extractelement <2 x double> %__W, i64 0
6216 %.rhs = extractelement <2 x double> %__X, i64 0
6217 %1 = fsub double -0.000000e+00, %.rhs
6218 %.rhs1 = extractelement <2 x double> %__Y, i64 0
6219 %2 = fsub double -0.000000e+00, %.rhs1
6220 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6221 %4 = bitcast i8 %__U to <8 x i1>
6222 %5 = extractelement <8 x i1> %4, i64 0
6223 %6 = select i1 %5, double %3, double %.rhs1
6224 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6225 ret <2 x double> %7
6226 }
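; NOTE: Masked expand-load tests follow; each bitcasts the i8* pointer and integer mask arguments and calls an llvm.masked.expandload.* intrinsic.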
6228 define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6229 ; X86-LABEL: test_mm512_mask_expandloadu_epi64:
6230 ; X86: # %bb.0: # %entry
6231 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6232 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6233 ; X86-NEXT: kmovw %ecx, %k1
6234 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
6235 ; X86-NEXT: retl
6236 ;
6237 ; X64-LABEL: test_mm512_mask_expandloadu_epi64:
6238 ; X64: # %bb.0: # %entry
6239 ; X64-NEXT: kmovw %edi, %k1
6240 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
6241 ; X64-NEXT: retq
6242 entry:
6243 %0 = bitcast i8* %__P to i64*
6244 %1 = bitcast i8 %__U to <8 x i1>
6245 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
6246 ret <8 x i64> %2
6247 }
6249 define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6250 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
6251 ; X86: # %bb.0: # %entry
6252 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6253 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6254 ; X86-NEXT: kmovw %ecx, %k1
6255 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
6256 ; X86-NEXT: retl
6257 ;
6258 ; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
6259 ; X64: # %bb.0: # %entry
6260 ; X64-NEXT: kmovw %edi, %k1
6261 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
6262 ; X64-NEXT: retq
6263 entry:
6264 %0 = bitcast i8* %__P to i64*
6265 %1 = bitcast i8 %__U to <8 x i1>
6266 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
6267 ret <8 x i64> %2
6268 }
6270 define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
6271 ; X86-LABEL: test_mm512_mask_expandloadu_pd:
6272 ; X86: # %bb.0: # %entry
6273 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6274 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6275 ; X86-NEXT: kmovw %ecx, %k1
6276 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
6277 ; X86-NEXT: retl
6278 ;
6279 ; X64-LABEL: test_mm512_mask_expandloadu_pd:
6280 ; X64: # %bb.0: # %entry
6281 ; X64-NEXT: kmovw %edi, %k1
6282 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
6283 ; X64-NEXT: retq
6284 entry:
6285 %0 = bitcast i8* %__P to double*
6286 %1 = bitcast i8 %__U to <8 x i1>
6287 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
6288 ret <8 x double> %2
6289 }
6291 define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
6292 ; X86-LABEL: test_mm512_maskz_expandloadu_pd:
6293 ; X86: # %bb.0: # %entry
6294 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6295 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6296 ; X86-NEXT: kmovw %ecx, %k1
6297 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
6298 ; X86-NEXT: retl
6299 ;
6300 ; X64-LABEL: test_mm512_maskz_expandloadu_pd:
6301 ; X64: # %bb.0: # %entry
6302 ; X64-NEXT: kmovw %edi, %k1
6303 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
6304 ; X64-NEXT: retq
6305 entry:
6306 %0 = bitcast i8* %__P to double*
6307 %1 = bitcast i8 %__U to <8 x i1>
6308 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
6309 ret <8 x double> %2
6310 }
6312 define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
6313 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
6314 ; X86: # %bb.0: # %entry
6315 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6316 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6317 ; X86-NEXT: kmovw %ecx, %k1
6318 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
6319 ; X86-NEXT: retl
6320 ;
6321 ; X64-LABEL: test_mm512_mask_expandloadu_epi32:
6322 ; X64: # %bb.0: # %entry
6323 ; X64-NEXT: kmovw %edi, %k1
6324 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
6325 ; X64-NEXT: retq
6326 entry:
6327 %0 = bitcast <8 x i64> %__W to <16 x i32>
6328 %1 = bitcast i8* %__P to i32*
6329 %2 = bitcast i16 %__U to <16 x i1>
6330 %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
6331 %4 = bitcast <16 x i32> %3 to <8 x i64>
6332 ret <8 x i64> %4
6333 }
6335 define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
6336 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
6337 ; X86: # %bb.0: # %entry
6338 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6339 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6340 ; X86-NEXT: kmovw %ecx, %k1
6341 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
6342 ; X86-NEXT: retl
6343 ;
6344 ; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
6345 ; X64: # %bb.0: # %entry
6346 ; X64-NEXT: kmovw %edi, %k1
6347 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
6348 ; X64-NEXT: retq
6349 entry:
6350 %0 = bitcast i8* %__P to i32*
6351 %1 = bitcast i16 %__U to <16 x i1>
6352 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
6353 %3 = bitcast <16 x i32> %2 to <8 x i64>
6354 ret <8 x i64> %3
6355 }
6357 define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
6358 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
6359 ; X86: # %bb.0: # %entry
6360 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6361 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6362 ; X86-NEXT: kmovw %ecx, %k1
6363 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
6364 ; X86-NEXT: retl
6365 ;
6366 ; X64-LABEL: test_mm512_mask_expandloadu_ps:
6367 ; X64: # %bb.0: # %entry
6368 ; X64-NEXT: kmovw %edi, %k1
6369 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
6370 ; X64-NEXT: retq
6371 entry:
6372 %0 = bitcast i8* %__P to float*
6373 %1 = bitcast i16 %__U to <16 x i1>
6374 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
6375 ret <16 x float> %2
6376 }
6378 define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
6379 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
6380 ; X86: # %bb.0: # %entry
6381 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6382 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6383 ; X86-NEXT: kmovw %ecx, %k1
6384 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
6385 ; X86-NEXT: retl
6386 ;
6387 ; X64-LABEL: test_mm512_maskz_expandloadu_ps:
6388 ; X64: # %bb.0: # %entry
6389 ; X64-NEXT: kmovw %edi, %k1
6390 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
6391 ; X64-NEXT: retq
6392 entry:
6393 %0 = bitcast i8* %__P to float*
6394 %1 = bitcast i16 %__U to <16 x i1>
6395 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
6396 ret <16 x float> %2
6397 }
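; NOTE: Masked compress-store tests follow; these return void and call llvm.masked.compressstore.* intrinsics, so both targets end with vzeroupper.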
6399 define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
6400 ; X86-LABEL: test_mm512_mask_compressstoreu_pd:
6401 ; X86: # %bb.0: # %entry
6402 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6403 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6404 ; X86-NEXT: kmovw %eax, %k1
6405 ; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
6406 ; X86-NEXT: vzeroupper
6407 ; X86-NEXT: retl
6408 ;
6409 ; X64-LABEL: test_mm512_mask_compressstoreu_pd:
6410 ; X64: # %bb.0: # %entry
6411 ; X64-NEXT: kmovw %esi, %k1
6412 ; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
6413 ; X64-NEXT: vzeroupper
6414 ; X64-NEXT: retq
6415 entry:
6416 %0 = bitcast i8* %__P to double*
6417 %1 = bitcast i8 %__U to <8 x i1>
6418 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
6419 ret void
6420 }
6422 define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
6423 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
6424 ; X86: # %bb.0: # %entry
6425 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6426 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6427 ; X86-NEXT: kmovw %eax, %k1
6428 ; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
6429 ; X86-NEXT: vzeroupper
6430 ; X86-NEXT: retl
6431 ;
6432 ; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
6433 ; X64: # %bb.0: # %entry
6434 ; X64-NEXT: kmovw %esi, %k1
6435 ; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
6436 ; X64-NEXT: vzeroupper
6437 ; X64-NEXT: retq
6438 entry:
6439 %0 = bitcast i8* %__P to i64*
6440 %1 = bitcast i8 %__U to <8 x i1>
6441 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
6442 ret void
6443 }
6445 define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
6446 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
6447 ; X86: # %bb.0: # %entry
6448 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6449 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6450 ; X86-NEXT: kmovw %eax, %k1
6451 ; X86-NEXT: vcompressps %zmm0, (%ecx) {%k1}
6452 ; X86-NEXT: vzeroupper
6453 ; X86-NEXT: retl
6454 ;
6455 ; X64-LABEL: test_mm512_mask_compressstoreu_ps:
6456 ; X64: # %bb.0: # %entry
6457 ; X64-NEXT: kmovw %esi, %k1
6458 ; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
6459 ; X64-NEXT: vzeroupper
6460 ; X64-NEXT: retq
6461 entry:
6462 %0 = bitcast i8* %__P to float*
6463 %1 = bitcast i16 %__U to <16 x i1>
6464 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
6465 ret void
6466 }
6468 define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
6469 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
6470 ; X86: # %bb.0: # %entry
6471 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6472 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6473 ; X86-NEXT: kmovw %eax, %k1
6474 ; X86-NEXT: vpcompressd %zmm0, (%ecx) {%k1}
6475 ; X86-NEXT: vzeroupper
6476 ; X86-NEXT: retl
6477 ;
6478 ; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
6479 ; X64: # %bb.0: # %entry
6480 ; X64-NEXT: kmovw %esi, %k1
6481 ; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
6482 ; X64-NEXT: vzeroupper
6483 ; X64-NEXT: retq
6484 entry:
6485 %0 = bitcast <8 x i64> %__A to <16 x i32>
6486 %1 = bitcast i8* %__P to i32*
6487 %2 = bitcast i16 %__U to <16 x i1>
6488 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
6489 ret void
6490 }
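; NOTE: The i64 reduction tests below repeatedly halve the vector with shufflevector; on X86 the i64 result is returned in edx:eax (vmovd/vpextrd), on X64 in rax (vmovq).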
6492 define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
6493 ; X86-LABEL: test_mm512_reduce_add_epi64:
6494 ; X86: # %bb.0: # %entry
6495 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6496 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6497 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6498 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6499 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6500 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6501 ; X86-NEXT: vmovd %xmm0, %eax
6502 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6503 ; X86-NEXT: vzeroupper
6504 ; X86-NEXT: retl
6505 ;
6506 ; X64-LABEL: test_mm512_reduce_add_epi64:
6507 ; X64: # %bb.0: # %entry
6508 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6509 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6510 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6511 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6512 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6513 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6514 ; X64-NEXT: vmovq %xmm0, %rax
6515 ; X64-NEXT: vzeroupper
6516 ; X64-NEXT: retq
6517 entry:
6518 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6519 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6520 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6521 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6522 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6523 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6524 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6525 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6526 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
6527 ret i64 %vecext.i
6528 }
6530 define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
6531 ; X86-LABEL: test_mm512_reduce_mul_epi64:
6532 ; X86: # %bb.0: # %entry
6533 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6534 ; X86-NEXT: vpsrlq $32, %ymm0, %ymm2
6535 ; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6536 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm3
6537 ; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6538 ; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6539 ; X86-NEXT: vpsllq $32, %ymm2, %ymm2
6540 ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6541 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6542 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6543 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6544 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6545 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6546 ; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6547 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6548 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6549 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6550 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6551 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6552 ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6553 ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6554 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
6555 ; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6556 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6557 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6558 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6559 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6560 ; X86-NEXT: vmovd %xmm0, %eax
6561 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6562 ; X86-NEXT: vzeroupper
6563 ; X86-NEXT: retl
6564 ;
6565 ; X64-LABEL: test_mm512_reduce_mul_epi64:
6566 ; X64: # %bb.0: # %entry
6567 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6568 ; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
6569 ; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6570 ; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
6571 ; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6572 ; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6573 ; X64-NEXT: vpsllq $32, %ymm2, %ymm2
6574 ; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6575 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6576 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6577 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6578 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6579 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6580 ; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6581 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6582 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6583 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6584 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6585 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6586 ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6587 ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6588 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
6589 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6590 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6591 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6592 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6593 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6594 ; X64-NEXT: vmovq %xmm0, %rax
6595 ; X64-NEXT: vzeroupper
6596 ; X64-NEXT: retq
6597 entry:
6598 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6599 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6600 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
6601 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6602 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6603 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
6604 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6605 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
6606 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
6607 ret i64 %vecext.i
6608 }
6610 define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
6611 ; X86-LABEL: test_mm512_reduce_or_epi64:
6612 ; X86: # %bb.0: # %entry
6613 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6614 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6615 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6616 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6617 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6618 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6619 ; X86-NEXT: vmovd %xmm0, %eax
6620 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6621 ; X86-NEXT: vzeroupper
6622 ; X86-NEXT: retl
6623 ;
6624 ; X64-LABEL: test_mm512_reduce_or_epi64:
6625 ; X64: # %bb.0: # %entry
6626 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6627 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6628 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6629 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6630 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6631 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6632 ; X64-NEXT: vmovq %xmm0, %rax
6633 ; X64-NEXT: vzeroupper
6634 ; X64-NEXT: retq
6635 entry:
6636 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6637 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6638 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
6639 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6640 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6641 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
6642 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6643 %or7.i = or <2 x i64> %shuffle6.i, %or4.i
6644 %vecext.i = extractelement <2 x i64> %or7.i, i32 0
6645 ret i64 %vecext.i
6646 }
6648 define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
6649 ; X86-LABEL: test_mm512_reduce_and_epi64:
6650 ; X86: # %bb.0: # %entry
6651 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6652 ; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
6653 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6654 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
6655 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6656 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
6657 ; X86-NEXT: vmovd %xmm0, %eax
6658 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6659 ; X86-NEXT: vzeroupper
6660 ; X86-NEXT: retl
6661 ;
6662 ; X64-LABEL: test_mm512_reduce_and_epi64:
6663 ; X64: # %bb.0: # %entry
6664 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6665 ; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
6666 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6667 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
6668 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6669 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
6670 ; X64-NEXT: vmovq %xmm0, %rax
6671 ; X64-NEXT: vzeroupper
6672 ; X64-NEXT: retq
6673 entry:
6674 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6675 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6676 %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
6677 %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6678 %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6679 %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
6680 %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6681 %and7.i = and <2 x i64> %shuffle6.i, %and4.i
6682 %vecext.i = extractelement <2 x i64> %and7.i, i32 0
6683 ret i64 %vecext.i
6684 }
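; NOTE: The masked reduction tests first substitute the identity element for masked-off lanes (0 for add via a zeroing move, 1 for mul via a broadcast/constant of ones), then reduce as above.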
6686 define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6687 ; X86-LABEL: test_mm512_mask_reduce_add_epi64:
6688 ; X86: # %bb.0: # %entry
6689 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6690 ; X86-NEXT: kmovw %eax, %k1
6691 ; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6692 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6693 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6694 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6695 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6696 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6697 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6698 ; X86-NEXT: vmovd %xmm0, %eax
6699 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6700 ; X86-NEXT: vzeroupper
6701 ; X86-NEXT: retl
6702 ;
6703 ; X64-LABEL: test_mm512_mask_reduce_add_epi64:
6704 ; X64: # %bb.0: # %entry
6705 ; X64-NEXT: kmovw %edi, %k1
6706 ; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6707 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6708 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6709 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6710 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6711 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6712 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6713 ; X64-NEXT: vmovq %xmm0, %rax
6714 ; X64-NEXT: vzeroupper
6717 %0 = bitcast i8 %__M to <8 x i1>
6718 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
6719 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6720 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6721 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6722 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6723 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6724 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6725 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6726 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6727 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
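; AVX-512F has no 64-bit vpmullq (that instruction requires AVX512DQ), so the
; mul reduction below is legalized into vpmuludq/shift/add sequences that build
; each 64x64-bit product from 32-bit partial products at every tree level.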
define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT: vpsrlq $32, %ymm0, %ymm3
; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT: vpsllq $32, %ymm2, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpsrlq $32, %ymm1, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT: vpsllq $32, %ymm2, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
%vecext.i = extractelement <2 x i64> %mul7.i, i32 0
ret i64 %vecext.i
}
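; For the and-reduction, the all-ones identity vector is materialized with
; vpternlogd $255 (truth table 0xFF), which sets every bit of the destination
; register without a memory load.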
define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or.i = or <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%or7.i = or <2 x i64> %shuffle6.i, %or4.i
%vecext.i = extractelement <2 x i64> %or7.i, i32 0
ret i64 %vecext.i
}
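; The 32-bit reductions below lower identically on both triples (the scalar
; i32 result fits in %eax either way), so a single shared CHECK prefix covers
; X86 and X64, with ret{{[l|q]}} absorbing the only difference.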
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_add_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%add.i = add <8 x i32> %0, %1
%2 = bitcast <8 x i32> %add.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%add5.i = add <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = add <4 x i32> %shuffle.i, %add5.i
%shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add8.i = add <4 x i32> %shuffle7.i, %add6.i
%vecext.i = extractelement <4 x i32> %add8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_mul_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%mul.i = mul <8 x i32> %0, %1
%2 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%mul5.i = mul <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
%shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
%vecext.i = extractelement <4 x i32> %mul8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_or_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or25.i = or <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or526.i = or <2 x i64> %extract3.i, %extract4.i
%or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or6.i = or <4 x i32> %shuffle.i, %or5.i
%shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or8.i = or <4 x i32> %shuffle7.i, %or6.i
%vecext.i = extractelement <4 x i32> %or8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_and_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and25.i = and <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and526.i = and <2 x i64> %extract3.i, %extract4.i
%and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and6.i = and <4 x i32> %shuffle.i, %and5.i
%shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and8.i = and <4 x i32> %shuffle7.i, %and6.i
%vecext.i = extractelement <4 x i32> %and8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%add.i = add <8 x i32> %4, %5
%6 = bitcast <8 x i32> %add.i to <4 x i64>
%extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract4.i to <4 x i32>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract5.i to <4 x i32>
%add6.i = add <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = add <4 x i32> %shuffle.i, %add6.i
%shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add9.i = add <4 x i32> %shuffle8.i, %add7.i
%vecext.i = extractelement <4 x i32> %add9.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%mul.i = mul <8 x i32> %4, %5
%6 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract5.i to <4 x i32>
%extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract6.i to <4 x i32>
%mul7.i = mul <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
%shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
%vecext.i = extractelement <4 x i32> %mul10.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and28.i = and <4 x i64> %extract.i, %extract4.i
%extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and729.i = and <2 x i64> %extract5.i, %extract6.i
%and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and8.i = and <4 x i32> %shuffle.i, %and7.i
%shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and10.i = and <4 x i32> %shuffle9.i, %and8.i
%vecext.i = extractelement <4 x i32> %and10.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or27.i = or <4 x i64> %extract.i, %extract3.i
%extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or628.i = or <2 x i64> %extract4.i, %extract5.i
%or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or7.i = or <4 x i32> %shuffle.i, %or6.i
%shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or9.i = or <4 x i32> %shuffle8.i, %or7.i
%vecext.i = extractelement <4 x i32> %or9.i, i32 0
ret i32 %vecext.i
}
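; The FP reductions below follow the same halving tree with vaddpd/vmulpd and
; finish with a scalar vaddsd/vmulsd (or the ps equivalents). On X86 the
; result is returned in x87 st(0), hence the store to an aligned stack slot
; followed by fldl/flds in the 32-bit checks.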
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}
define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}
define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%add.i = fadd <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add5.i = fadd <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = fadd <4 x float> %add5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add8.i = fadd <4 x float> %add6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %add8.i, i32 0
ret float %vecext.i
}
define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%mul.i = fmul <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul5.i = fmul <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %mul8.i, i32 0
ret float %vecext.i
}
define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}
define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}
define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract3.i to <8 x float>
%add.i = fadd <8 x float> %3, %4
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add6.i = fadd <4 x float> %extract4.i, %extract5.i
%shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = fadd <4 x float> %add6.i, %shuffle.i
%shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add9.i = fadd <4 x float> %add7.i, %shuffle8.i
%vecext.i = extractelement <4 x float> %add9.i, i32 0
ret float %vecext.i
}
define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%mul.i = fmul <8 x float> %3, %4
%extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul7.i = fmul <4 x float> %extract5.i, %extract6.i
%shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
%shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
%vecext.i = extractelement <4 x float> %mul10.i, i32 0
ret float %vecext.i
}
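; The min/max reductions below are written as icmp+select in IR (or via the
; avx/sse max/min intrinsics for FP) and shuffle whole 512-bit lane groups,
; which lets the backend match vpmaxsq/vpmaxuq/vpminsq/vpminuq at full zmm
; width instead of narrowing first.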
define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp slt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp sgt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ult <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ugt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ugt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
%extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
%shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %2, i32 0
ret double %vecext.i
}
define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp sgt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp slt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp slt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ugt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ult <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ult <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
%extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
%shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %2, i32 0
ret double %vecext.i
}

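; The masked reductions below first select between %__W and a vector of the
; operation's identity element (INT64_MIN for signed max, 0 for unsigned max,
; INT64_MAX for signed min, all-ones for unsigned min, -Inf/+Inf for the fp
; max/min variants), so lanes cleared in the mask cannot affect the result.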
define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp sgt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp sgt <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp ugt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp ugt <8 x i64> %3, %shuffle2.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
%shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp ugt <8 x i64> %5, %shuffle4.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
%extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
%extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
%shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
%vecext.i = extractelement <2 x double> %4, i32 0
ret double %vecext.i
}

define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp slt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp slt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp slt <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp ult <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp ult <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp ult <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
%extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
%extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
%shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %4, i32 0
ret double %vecext.i
}

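; The 32-bit and packed-float reductions below take one extra narrowing step:
; the 512-bit vector is reduced to 256 and then 128 bits before the in-register
; vpshufd/vpermilps ladder. The integer variants share a single CHECK prefix
; because X86 and X64 codegen differ only in the ret{{[l|q]}} instruction.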
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp sgt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp sgt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp sgt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp sgt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp ugt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp ugt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp ugt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp ugt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
%extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
%shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
%shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
%vecext.i = extractelement <4 x float> %6, i32 0
ret float %vecext.i
}

define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp slt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp slt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp slt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp slt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp ult <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp ult <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp ult <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp ult <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
%extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
%shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
%shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
%vecext.i = extractelement <4 x float> %6, i32 0
ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp sgt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp sgt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp sgt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp sgt <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%6 = icmp ugt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract5.i to <4 x i32>
%10 = bitcast <2 x i64> %extract6.i to <4 x i32>
%11 = icmp ugt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp ugt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp ugt <4 x i32> %14, %shuffle9.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
%extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
%shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
%shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
%vecext.i = extractelement <4 x float> %8, i32 0
ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp slt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp slt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp slt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp slt <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

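; In the unsigned-min test below, vpternlogd $255 (all truth-table bits set)
; materializes the all-ones vector, i.e. the UINT32_MAX identity element,
; without loading a constant from memory.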
define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp ult <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp ult <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp ult <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp ult <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
%extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
%shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
%shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
%vecext.i = extractelement <4 x float> %8, i32 0
ret float %vecext.i
}

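; The remaining tests cover the plain mask/maskz max and min intrinsics. The
; *_round_* variants pass rounding mode 4 (_MM_FROUND_CUR_DIRECTION), so they
; lower to the same vmax/vmin instructions as the non-round forms, with the
; mask applied via a select against the passthrough operand or zero.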
define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_max_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
ret <8 x double> %0
}

define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_max_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
ret <16 x float> %0
}

define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

9060 define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9061 ; X86-LABEL: test_mm512_maskz_min_pd:
9062 ; X86: # %bb.0: # %entry
9063 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9064 ; X86-NEXT: kmovw %eax, %k1
9065 ; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9068 ; X64-LABEL: test_mm512_maskz_min_pd:
9069 ; X64: # %bb.0: # %entry
9070 ; X64-NEXT: kmovw %edi, %k1
9071 ; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9074 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9075 %1 = bitcast i8 %__U to <8 x i1>
9076 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
9080 define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9081 ; X86-LABEL: test_mm512_mask_min_round_pd:
9082 ; X86: # %bb.0: # %entry
9083 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9084 ; X86-NEXT: kmovw %eax, %k1
9085 ; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9088 ; X64-LABEL: test_mm512_mask_min_round_pd:
9089 ; X64: # %bb.0: # %entry
9090 ; X64-NEXT: kmovw %edi, %k1
9091 ; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9094 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9095 %1 = bitcast i8 %__U to <8 x i1>
9096 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
9100 declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
9102 define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9103 ; X86-LABEL: test_mm512_maskz_min_round_pd:
9104 ; X86: # %bb.0: # %entry
9105 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9106 ; X86-NEXT: kmovw %eax, %k1
9107 ; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9110 ; X64-LABEL: test_mm512_maskz_min_round_pd:
9111 ; X64: # %bb.0: # %entry
9112 ; X64-NEXT: kmovw %edi, %k1
9113 ; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9116 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9117 %1 = bitcast i8 %__U to <8 x i1>
9118 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
9122 define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
9123 ; CHECK-LABEL: test_mm512_min_round_pd:
9124 ; CHECK: # %bb.0: # %entry
9125 ; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
9126 ; CHECK-NEXT: ret{{[l|q]}}
9128 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_min_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
ret <16 x float> %0
}

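; Unmasked sqrt uses the generic llvm.sqrt.* intrinsics, which lower directly
; to vsqrtpd/vsqrtps. The *_round variants instead call
; llvm.x86.avx512.sqrt.{pd,ps}.512 with a rounding argument of 8
; (_MM_FROUND_TO_NEAREST_INT combined with _MM_FROUND_NO_EXC), which selects
; the embedded-rounding form {rn-sae}.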
define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
; CHECK-LABEL: test_mm512_sqrt_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)

define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
ret <8 x double> %0
}

define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
; CHECK-LABEL: test_mm512_sqrt_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)

define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
ret <16 x float> %0
}

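; Rotates are expressed with the generic funnel-shift intrinsics: a funnel
; shift whose two value operands are the same vector is a rotate, so
; llvm.fshl.v16i32(%x, %x, <splat of 5>) rotates each 32-bit lane left by 5.
; A constant splat amount selects the immediate forms (vprold/vprolq $5
; below); a vector amount selects the variable forms (vprolvd/vprolvq).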
define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_rol_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprold $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <16 x i32> %1 to <8 x i64>
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i64> %__W to <16 x i32>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}

define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_rol_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

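; The rolv tests below are the variable-amount counterparts: the rotate
; amounts come from %__B instead of a constant splat, so lowering picks
; vprolvd/vprolvq with the amounts in a zmm register.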
define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <8 x i64> %__W to <16 x i32>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

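; Rotate-right mirrors rotate-left with llvm.fshr in place of llvm.fshl,
; lowering to vprord/vprorq for constant amounts and vprorvd/vprorvq for
; vector amounts.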
define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <16 x i32> %1 to <8 x i64>
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i64> %__W to <16 x i32>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}

define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <8 x i64> %__W to <16 x i32>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

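; Declarations for the intrinsics used above. The fma, expandload,
; compressstore and 128/256-bit max/min declarations are referenced by tests
; in earlier parts of this file.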
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)

declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)