; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
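
; _mm512_kunpackb concatenates the low 8 bits of two 16-bit masks into one
; 16-bit mask (kunpckbw); here the masks come from vpcmpneqd compares and the
; result feeds a masked compare.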
define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckbw %k0, %k1, %k1
; X86-NEXT:    vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckbw %k0, %k1, %k1
; X64-NEXT:    vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

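; _mm512_kortestc returns 1 if the OR of the two masks is all ones;
; _mm512_kortestz returns 1 if the OR is all zero. This lowering uses korw
; plus an explicit compare rather than kortestw.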
define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    korw %k0, %k1, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    cmpw $-1, %ax
; X86-NEXT:    sete %al
; X86-NEXT:    andb $1, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kortestc:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    korw %k0, %k1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    cmpw $-1, %ax
; X64-NEXT:    sete %al
; X64-NEXT:    andb $1, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    korw %k0, %k1, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    cmpw $0, %ax
; X86-NEXT:    sete %al
; X86-NEXT:    andb $1, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kortestz:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    korw %k0, %k1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    cmpw $0, %ax
; X64-NEXT:    sete %al
; X64-NEXT:    andb $1, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}

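; The shuffle_f32x4/f64x2/i32x4/i64x2 intrinsics select whole 128-bit lanes
; from the two sources; the unmasked forms canonicalize to a 64x2 shuffle.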
define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}

define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

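; testn sets a mask bit where the AND of the operands is zero (vptestnm);
; test sets a mask bit where it is nonzero (vptestm).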
define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzwl %ax, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

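; set1 broadcasts a scalar under merge or zero masking; on 32-bit targets the
; i64 scalar is first assembled in an xmm register.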
define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    kmovw %ecx, %k1
; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

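; Broadcasts of the lowest element of an xmm source to every element of the
; zmm destination.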
define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

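; movedup/moveldup duplicate the even-indexed elements in place; movehdup
; duplicates the odd-indexed elements.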
define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_moveldup_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

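; permute_pd/ps are in-lane permutes (vpermilpd/vpermilps); permutex is a
; cross-lane permute of 64-bit elements.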
define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permute_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permute_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_permute_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_permute_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_permutex_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_permutex_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permutex_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permutex_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

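; Immediate shuffles; the unmasked shuffle_epi32 is emitted in the FP domain
; (vpermilps), the masked forms as vpshufd.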
define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_shuffle_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_shuffle_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_shuffle_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_shuffle_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

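; unpackhi/unpacklo interleave the high or low elements within each 128-bit
; lane of the two sources.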
define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

1462 define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1463 ; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
1465 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1466 ; X86-NEXT: kmovw %eax, %k1
1467 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1470 ; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
1472 ; X64-NEXT: kmovw %edi, %k1
1473 ; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1475 %arg0 = bitcast i16 %a0 to <16 x i1>
1476 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1477 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1478 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1479 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1480 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1484 define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
1485 ; CHECK-LABEL: test_mm512_unpacklo_epi64:
1487 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1488 ; CHECK-NEXT: ret{{[l|q]}}
1489 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1493 define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1494 ; X86-LABEL: test_mm512_mask_unpacklo_epi64:
1496 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1497 ; X86-NEXT: kmovw %eax, %k1
1498 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1501 ; X64-LABEL: test_mm512_mask_unpacklo_epi64:
1503 ; X64-NEXT: kmovw %edi, %k1
1504 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1506 %arg1 = bitcast i8 %a1 to <8 x i1>
1507 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1508 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
1512 define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1513 ; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
1515 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1516 ; X86-NEXT: kmovw %eax, %k1
1517 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1520 ; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
1522 ; X64-NEXT: kmovw %edi, %k1
1523 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1525 %arg0 = bitcast i8 %a0 to <8 x i1>
1526 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1527 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
1531 define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
1532 ; CHECK-LABEL: test_mm512_unpacklo_pd:
1534 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1535 ; CHECK-NEXT: ret{{[l|q]}}
1536 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1537 ret <8 x double> %res
1540 define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1541 ; X86-LABEL: test_mm512_mask_unpacklo_pd:
1543 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1544 ; X86-NEXT: kmovw %eax, %k1
1545 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1548 ; X64-LABEL: test_mm512_mask_unpacklo_pd:
1550 ; X64-NEXT: kmovw %edi, %k1
1551 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1553 %arg1 = bitcast i8 %a1 to <8 x i1>
1554 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1555 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1556 ret <8 x double> %res1
1559 define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1560 ; X86-LABEL: test_mm512_maskz_unpacklo_pd:
1562 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1563 ; X86-NEXT: kmovw %eax, %k1
1564 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1567 ; X64-LABEL: test_mm512_maskz_unpacklo_pd:
1569 ; X64-NEXT: kmovw %edi, %k1
1570 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1572 %arg0 = bitcast i8 %a0 to <8 x i1>
1573 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1574 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1575 ret <8 x double> %res1
1578 define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
1579 ; CHECK-LABEL: test_mm512_unpacklo_ps:
1581 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1582 ; CHECK-NEXT: ret{{[l|q]}}
1583 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1584 ret <16 x float> %res
1587 define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
1588 ; X86-LABEL: test_mm512_mask_unpacklo_ps:
1590 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1591 ; X86-NEXT: kmovw %eax, %k1
1592 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1595 ; X64-LABEL: test_mm512_mask_unpacklo_ps:
1597 ; X64-NEXT: kmovw %edi, %k1
1598 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1600 %arg1 = bitcast i16 %a1 to <16 x i1>
1601 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1602 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
1603 ret <16 x float> %res1
1606 define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
1607 ; X86-LABEL: test_mm512_maskz_unpacklo_ps:
1609 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1610 ; X86-NEXT: kmovw %eax, %k1
1611 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1614 ; X64-LABEL: test_mm512_maskz_unpacklo_ps:
1616 ; X64-NEXT: kmovw %edi, %k1
1617 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1619 %arg0 = bitcast i16 %a0 to <16 x i1>
1620 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1621 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1622 ret <16 x float> %res1
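; NOTE: The zext tests below should need no explicit zeroing instruction: a
; VEX/EVEX-encoded move such as 'vmovaps %xmm0, %xmm0' already clears the
; destination register's bits above the written width, so a plain self-move
; is the expected lowering.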
1625 define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
1626 ; CHECK-LABEL: test_mm512_zextpd128_pd512:
1628 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1629 ; CHECK-NEXT: ret{{[l|q]}}
1630 %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1631 ret <8 x double> %res
1634 define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
1635 ; CHECK-LABEL: test_mm512_zextpd256_pd512:
1637 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1638 ; CHECK-NEXT: ret{{[l|q]}}
1639 %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1640 ret <8 x double> %res
1643 define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
1644 ; CHECK-LABEL: test_mm512_zextps128_ps512:
1646 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1647 ; CHECK-NEXT: ret{{[l|q]}}
1648 %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
1649 ret <16 x float> %res
1652 define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
1653 ; CHECK-LABEL: test_mm512_zextps256_ps512:
1655 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1656 ; CHECK-NEXT: ret{{[l|q]}}
1657 %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1658 ret <16 x float> %res
1661 define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
1662 ; CHECK-LABEL: test_mm512_zextsi128_si512:
1664 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1665 ; CHECK-NEXT: ret{{[l|q]}}
1666 %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1670 define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
1671 ; CHECK-LABEL: test_mm512_zextsi256_si512:
1673 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1674 ; CHECK-NEXT: ret{{[l|q]}}
1675 %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
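; NOTE: _mm512_mul_epi32 multiplies the signed low 32 bits of each 64-bit
; element. The shl-by-32/ashr-by-32 pairs below are the sign-extension idiom
; that the backend is expected to match to a single vpmuldq.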
1679 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1680 ; CHECK-LABEL: test_mm512_mul_epi32:
1682 ; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
1683 ; CHECK-NEXT: ret{{[l|q]}}
1684 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1685 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1686 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1687 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1688 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
1692 define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1693 ; X86-LABEL: test_mm512_maskz_mul_epi32:
1694 ; X86: # %bb.0: # %entry
1695 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1696 ; X86-NEXT: kmovw %eax, %k1
1697 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1700 ; X64-LABEL: test_mm512_maskz_mul_epi32:
1701 ; X64: # %bb.0: # %entry
1702 ; X64-NEXT: kmovw %edi, %k1
1703 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1706 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1707 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1708 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1709 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1710 %4 = mul nsw <8 x i64> %3, %1
1711 %5 = bitcast i8 %__k to <8 x i1>
1712 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
1716 define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1717 ; X86-LABEL: test_mm512_mask_mul_epi32:
1718 ; X86: # %bb.0: # %entry
1719 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1720 ; X86-NEXT: kmovw %eax, %k1
1721 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1722 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1725 ; X64-LABEL: test_mm512_mask_mul_epi32:
1726 ; X64: # %bb.0: # %entry
1727 ; X64-NEXT: kmovw %edi, %k1
1728 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1729 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1732 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1733 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1734 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1735 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1736 %4 = mul nsw <8 x i64> %3, %1
1737 %5 = bitcast i8 %__k to <8 x i1>
1738 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
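; NOTE: The unsigned variant instead masks each element with 0xffffffff
; (zero-extending the low half), which should fold to vpmuludq in the same
; way.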
1742 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1743 ; CHECK-LABEL: test_mm512_mul_epu32:
1745 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
1746 ; CHECK-NEXT: ret{{[l|q]}}
1747 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1748 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1749 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
1753 define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1754 ; X86-LABEL: test_mm512_maskz_mul_epu32:
1755 ; X86: # %bb.0: # %entry
1756 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1757 ; X86-NEXT: kmovw %eax, %k1
1758 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1761 ; X64-LABEL: test_mm512_maskz_mul_epu32:
1762 ; X64: # %bb.0: # %entry
1763 ; X64-NEXT: kmovw %edi, %k1
1764 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1767 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1768 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1769 %2 = mul nuw <8 x i64> %1, %0
1770 %3 = bitcast i8 %__k to <8 x i1>
1771 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1775 define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1776 ; X86-LABEL: test_mm512_mask_mul_epu32:
1777 ; X86: # %bb.0: # %entry
1778 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1779 ; X86-NEXT: kmovw %eax, %k1
1780 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1781 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1784 ; X64-LABEL: test_mm512_mask_mul_epu32:
1785 ; X64: # %bb.0: # %entry
1786 ; X64-NEXT: kmovw %edi, %k1
1787 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1788 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1791 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1792 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1793 %2 = mul nuw <8 x i64> %1, %0
1794 %3 = bitcast i8 %__k to <8 x i1>
1795 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
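; NOTE: With only +avx512f (no AVX512BW), there is no byte broadcast to a
; full zmm register, so the i8 splat is expected to be assembled from a ymm
; vpbroadcastb plus vinserti64x4.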
1799 define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
1800 ; X86-LABEL: test_mm512_set1_epi8:
1801 ; X86: # %bb.0: # %entry
1802 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1803 ; X86-NEXT: vmovd %eax, %xmm0
1804 ; X86-NEXT: vpbroadcastb %xmm0, %ymm0
1805 ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1808 ; X64-LABEL: test_mm512_set1_epi8:
1809 ; X64: # %bb.0: # %entry
1810 ; X64-NEXT: vmovd %edi, %xmm0
1811 ; X64-NEXT: vpbroadcastb %xmm0, %ymm0
1812 ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1815 %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
1816 %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
1817 %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
1821 define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
1822 ; X86-LABEL: test_mm_cvtu32_sd:
1823 ; X86: # %bb.0: # %entry
1824 ; X86-NEXT: vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
1827 ; X64-LABEL: test_mm_cvtu32_sd:
1828 ; X64: # %bb.0: # %entry
1829 ; X64-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0
1832 %conv.i = uitofp i32 %__B to double
1833 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1834 ret <2 x double> %vecins.i
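; NOTE: The 32-bit target has no 64-bit GPR form of vcvtusi2sd, so the
; i64 -> double conversion in the next test is expanded inline; the expected
; sequence pairs the two 32-bit halves with magic exponent words, subtracts
; the biases, and adds the partial results (a sketch of the standard
; expansion, not an instruction-for-instruction requirement).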
1837 define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
1838 ; X86-LABEL: test_mm_cvtu64_sd:
1839 ; X86: # %bb.0: # %entry
1840 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1841 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1842 ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
1843 ; X86-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1844 ; X86-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1845 ; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
1846 ; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1849 ; X64-LABEL: test_mm_cvtu64_sd:
1850 ; X64: # %bb.0: # %entry
1851 ; X64-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0
1854 %conv.i = uitofp i64 %__B to double
1855 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1856 ret <2 x double> %vecins.i
1859 define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
1860 ; X86-LABEL: test_mm_cvtu32_ss:
1861 ; X86: # %bb.0: # %entry
1862 ; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
1865 ; X64-LABEL: test_mm_cvtu32_ss:
1866 ; X64: # %bb.0: # %entry
1867 ; X64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
1870 %conv.i = uitofp i32 %__B to float
1871 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1872 ret <4 x float> %vecins.i
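; NOTE: Likewise for i64 -> float on the 32-bit target: the value is spilled
; and reloaded through x87 fildll, and the shrl $31 selects a correction
; addend (0 or 2^64) from a two-entry constant table when the input's top
; bit is set.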
1875 define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
1876 ; X86-LABEL: test_mm_cvtu64_ss:
1877 ; X86: # %bb.0: # %entry
1878 ; X86-NEXT: pushl %ebp
1879 ; X86-NEXT: .cfi_def_cfa_offset 8
1880 ; X86-NEXT: .cfi_offset %ebp, -8
1881 ; X86-NEXT: movl %esp, %ebp
1882 ; X86-NEXT: .cfi_def_cfa_register %ebp
1883 ; X86-NEXT: andl $-8, %esp
1884 ; X86-NEXT: subl $16, %esp
1885 ; X86-NEXT: movl 12(%ebp), %eax
1886 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1887 ; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1888 ; X86-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
1889 ; X86-NEXT: shrl $31, %eax
1890 ; X86-NEXT: fildll {{[0-9]+}}(%esp)
1891 ; X86-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
1892 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
1893 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1894 ; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1895 ; X86-NEXT: movl %ebp, %esp
1896 ; X86-NEXT: popl %ebp
1897 ; X86-NEXT: .cfi_def_cfa %esp, 4
1900 ; X64-LABEL: test_mm_cvtu64_ss:
1901 ; X64: # %bb.0: # %entry
1902 ; X64-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0
1905 %conv.i = uitofp i64 %__B to float
1906 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1907 ret <4 x float> %vecins.i
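; NOTE: The cvtph_ps tests express the conversion as a bitcast to
; <16 x half> followed by fpext, which should select vcvtph2ps (with merge
; or zero masking where a mask operand is supplied).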
1910 define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) {
1911 ; CHECK-LABEL: test_mm512_cvtph_ps:
1912 ; CHECK: # %bb.0: # %entry
1913 ; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0
1914 ; CHECK-NEXT: ret{{[l|q]}}
1916 %0 = bitcast <4 x i64> %__A to <16 x i16>
1917 %1 = bitcast <16 x i16> %0 to <16 x half>
1918 %2 = fpext <16 x half> %1 to <16 x float>
1922 define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) {
1923 ; X86-LABEL: test_mm512_mask_cvtph_ps:
1924 ; X86: # %bb.0: # %entry
1925 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1926 ; X86-NEXT: kmovw %eax, %k1
1927 ; X86-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1}
1930 ; X64-LABEL: test_mm512_mask_cvtph_ps:
1931 ; X64: # %bb.0: # %entry
1932 ; X64-NEXT: kmovw %edi, %k1
1933 ; X64-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1}
1936 %0 = bitcast <4 x i64> %__A to <16 x i16>
1937 %1 = bitcast <16 x i16> %0 to <16 x half>
1938 %2 = bitcast i16 %__U to <16 x i1>
1939 %3 = fpext <16 x half> %1 to <16 x float>
1940 %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> %__W
1944 define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) {
1945 ; X86-LABEL: test_mm512_maskz_cvtph_ps:
1946 ; X86: # %bb.0: # %entry
1947 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1948 ; X86-NEXT: kmovw %eax, %k1
1949 ; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
1952 ; X64-LABEL: test_mm512_maskz_cvtph_ps:
1953 ; X64: # %bb.0: # %entry
1954 ; X64-NEXT: kmovw %edi, %k1
1955 ; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
1958 %0 = bitcast <4 x i64> %__A to <16 x i16>
1959 %1 = bitcast <16 x i16> %0 to <16 x half>
1960 %2 = bitcast i16 %__U to <16 x i1>
1961 %3 = fpext <16 x half> %1 to <16 x float>
1962 %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> zeroinitializer
1966 define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
1967 ; CHECK-LABEL: test_mm512_cvtps_pd:
1968 ; CHECK: # %bb.0: # %entry
1969 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1970 ; CHECK-NEXT: ret{{[l|q]}}
1972 %conv.i = fpext <8 x float> %__A to <8 x double>
1973 ret <8 x double> %conv.i
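; NOTE: cvtpslo_pd converts only the low eight floats: the shufflevector
; taking elements 0..7 models the 512-to-256-bit cast, and the ymm0 source
; operand in the checks aliases the low half of zmm0.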
1976 define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
1977 ; CHECK-LABEL: test_mm512_cvtpslo_pd:
1978 ; CHECK: # %bb.0: # %entry
1979 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1980 ; CHECK-NEXT: ret{{[l|q]}}
1982 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1983 %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
1984 ret <8 x double> %conv.i.i
1987 define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
1988 ; X86-LABEL: test_mm512_mask_cvtps_pd:
1989 ; X86: # %bb.0: # %entry
1990 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1991 ; X86-NEXT: kmovw %eax, %k1
1992 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1995 ; X64-LABEL: test_mm512_mask_cvtps_pd:
1996 ; X64: # %bb.0: # %entry
1997 ; X64-NEXT: kmovw %edi, %k1
1998 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
2001 %conv.i.i = fpext <8 x float> %__A to <8 x double>
2002 %0 = bitcast i8 %__U to <8 x i1>
2003 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
2007 define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
2008 ; X86-LABEL: test_mm512_mask_cvtpslo_pd:
2009 ; X86: # %bb.0: # %entry
2010 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2011 ; X86-NEXT: kmovw %eax, %k1
2012 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
2015 ; X64-LABEL: test_mm512_mask_cvtpslo_pd:
2016 ; X64: # %bb.0: # %entry
2017 ; X64-NEXT: kmovw %edi, %k1
2018 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
2021 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2022 %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
2023 %0 = bitcast i8 %__U to <8 x i1>
2024 %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
2028 define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
2029 ; X86-LABEL: test_mm512_maskz_cvtps_pd:
2030 ; X86: # %bb.0: # %entry
2031 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2032 ; X86-NEXT: kmovw %eax, %k1
2033 ; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
2036 ; X64-LABEL: test_mm512_maskz_cvtps_pd:
2037 ; X64: # %bb.0: # %entry
2038 ; X64-NEXT: kmovw %edi, %k1
2039 ; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
2042 %conv.i.i = fpext <8 x float> %__A to <8 x double>
2043 %0 = bitcast i8 %__U to <8 x i1>
2044 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
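; NOTE: The narrowing conversions below mix two styles: plain trunc (plus
; select for the masked epi64 -> epi32 forms) and the
; llvm.x86.avx512.mask.pmov.* intrinsics declared after these tests; both
; are expected to select the vpmovdb/vpmovqd/vpmovqw family.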
2048 define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
2049 ; CHECK-LABEL: test_mm512_cvtepi32_epi8:
2050 ; CHECK: # %bb.0: # %entry
2051 ; CHECK-NEXT: vpmovdb %zmm0, %xmm0
2052 ; CHECK-NEXT: vzeroupper
2053 ; CHECK-NEXT: ret{{[l|q]}}
2055 %0 = bitcast <8 x i64> %__A to <16 x i32>
2056 %conv.i = trunc <16 x i32> %0 to <16 x i8>
2057 %1 = bitcast <16 x i8> %conv.i to <2 x i64>
2061 define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
2062 ; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
2063 ; X86: # %bb.0: # %entry
2064 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2065 ; X86-NEXT: kmovw %eax, %k1
2066 ; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
2067 ; X86-NEXT: vzeroupper
2070 ; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
2071 ; X64: # %bb.0: # %entry
2072 ; X64-NEXT: kmovw %edi, %k1
2073 ; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
2074 ; X64-NEXT: vzeroupper
2077 %0 = bitcast <8 x i64> %__A to <16 x i32>
2078 %1 = bitcast <2 x i64> %__O to <16 x i8>
2079 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
2080 %3 = bitcast <16 x i8> %2 to <2 x i64>
2084 define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
2085 ; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
2086 ; X86: # %bb.0: # %entry
2087 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2088 ; X86-NEXT: kmovw %eax, %k1
2089 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2090 ; X86-NEXT: vzeroupper
2093 ; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
2094 ; X64: # %bb.0: # %entry
2095 ; X64-NEXT: kmovw %edi, %k1
2096 ; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2097 ; X64-NEXT: vzeroupper
2100 %0 = bitcast <8 x i64> %__A to <16 x i32>
2101 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
2102 %2 = bitcast <16 x i8> %1 to <2 x i64>
2106 define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
2107 ; CHECK-LABEL: test_mm512_cvtepi64_epi32:
2108 ; CHECK: # %bb.0: # %entry
2109 ; CHECK-NEXT: vpmovqd %zmm0, %ymm0
2110 ; CHECK-NEXT: ret{{[l|q]}}
2112 %conv.i = trunc <8 x i64> %__A to <8 x i32>
2113 %0 = bitcast <8 x i32> %conv.i to <4 x i64>
2117 define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2118 ; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
2119 ; X86: # %bb.0: # %entry
2120 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2121 ; X86-NEXT: kmovw %eax, %k1
2122 ; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2125 ; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
2126 ; X64: # %bb.0: # %entry
2127 ; X64-NEXT: kmovw %edi, %k1
2128 ; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2131 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2132 %0 = bitcast <4 x i64> %__O to <8 x i32>
2133 %1 = bitcast i8 %__M to <8 x i1>
2134 %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
2135 %3 = bitcast <8 x i32> %2 to <4 x i64>
2139 define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
2140 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
2141 ; X86: # %bb.0: # %entry
2142 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2143 ; X86-NEXT: kmovw %eax, %k1
2144 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2147 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
2148 ; X64: # %bb.0: # %entry
2149 ; X64-NEXT: kmovw %edi, %k1
2150 ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2153 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2154 %0 = bitcast i8 %__M to <8 x i1>
2155 %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
2156 %2 = bitcast <8 x i32> %1 to <4 x i64>
2160 define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
2161 ; CHECK-LABEL: test_mm512_cvtepi64_epi16:
2162 ; CHECK: # %bb.0: # %entry
2163 ; CHECK-NEXT: vpmovqw %zmm0, %xmm0
2164 ; CHECK-NEXT: vzeroupper
2165 ; CHECK-NEXT: ret{{[l|q]}}
2167 %conv.i = trunc <8 x i64> %__A to <8 x i16>
2168 %0 = bitcast <8 x i16> %conv.i to <2 x i64>
2172 define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2173 ; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
2174 ; X86: # %bb.0: # %entry
2175 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2176 ; X86-NEXT: kmovw %eax, %k1
2177 ; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2178 ; X86-NEXT: vzeroupper
2181 ; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
2182 ; X64: # %bb.0: # %entry
2183 ; X64-NEXT: kmovw %edi, %k1
2184 ; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2185 ; X64-NEXT: vzeroupper
2188 %0 = bitcast <2 x i64> %__O to <8 x i16>
2189 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
2190 %2 = bitcast <8 x i16> %1 to <2 x i64>
2194 define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
2195 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
2196 ; X86: # %bb.0: # %entry
2197 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2198 ; X86-NEXT: kmovw %eax, %k1
2199 ; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2200 ; X86-NEXT: vzeroupper
2203 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
2204 ; X64: # %bb.0: # %entry
2205 ; X64-NEXT: kmovw %edi, %k1
2206 ; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2207 ; X64-NEXT: vzeroupper
2210 %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
2211 %1 = bitcast <8 x i16> %0 to <2 x i64>
2215 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
2216 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
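; NOTE: The vpternlog immediate is a 3-input truth table: each bit of the
; imm8 supplies the output for one combination of the three source bits.
; The value 4 used below is an arbitrary sample table.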
2218 define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2219 ; CHECK-LABEL: test_mm512_ternarylogic_epi32:
2220 ; CHECK: # %bb.0: # %entry
2221 ; CHECK-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0
2222 ; CHECK-NEXT: ret{{[l|q]}}
2224 %0 = bitcast <8 x i64> %__A to <16 x i32>
2225 %1 = bitcast <8 x i64> %__B to <16 x i32>
2226 %2 = bitcast <8 x i64> %__C to <16 x i32>
2227 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2228 %4 = bitcast <16 x i32> %3 to <8 x i64>
2232 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
2234 define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2235 ; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
2236 ; X86: # %bb.0: # %entry
2237 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2238 ; X86-NEXT: kmovw %eax, %k1
2239 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2242 ; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
2243 ; X64: # %bb.0: # %entry
2244 ; X64-NEXT: kmovw %edi, %k1
2245 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2248 %0 = bitcast <8 x i64> %__A to <16 x i32>
2249 %1 = bitcast <8 x i64> %__B to <16 x i32>
2250 %2 = bitcast <8 x i64> %__C to <16 x i32>
2251 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2252 %4 = bitcast i16 %__U to <16 x i1>
2253 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2254 %6 = bitcast <16 x i32> %5 to <8 x i64>
2258 define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2259 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
2260 ; X86: # %bb.0: # %entry
2261 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2262 ; X86-NEXT: kmovw %eax, %k1
2263 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2266 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
2267 ; X64: # %bb.0: # %entry
2268 ; X64-NEXT: kmovw %edi, %k1
2269 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2272 %0 = bitcast <8 x i64> %__A to <16 x i32>
2273 %1 = bitcast <8 x i64> %__B to <16 x i32>
2274 %2 = bitcast <8 x i64> %__C to <16 x i32>
2275 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2276 %4 = bitcast i16 %__U to <16 x i1>
2277 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2278 %6 = bitcast <16 x i32> %5 to <8 x i64>
2282 define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2283 ; CHECK-LABEL: test_mm512_ternarylogic_epi64:
2284 ; CHECK: # %bb.0: # %entry
2285 ; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0
2286 ; CHECK-NEXT: ret{{[l|q]}}
2288 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2292 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1
2294 define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2295 ; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
2296 ; X86: # %bb.0: # %entry
2297 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2298 ; X86-NEXT: kmovw %eax, %k1
2299 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2302 ; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
2303 ; X64: # %bb.0: # %entry
2304 ; X64-NEXT: kmovw %edi, %k1
2305 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2308 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2309 %1 = bitcast i8 %__U to <8 x i1>
2310 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2314 define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2315 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
2316 ; X86: # %bb.0: # %entry
2317 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2318 ; X86-NEXT: kmovw %eax, %k1
2319 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2322 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
2323 ; X64: # %bb.0: # %entry
2324 ; X64-NEXT: kmovw %edi, %k1
2325 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2328 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2329 %1 = bitcast i8 %__U to <8 x i1>
2330 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2334 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
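; NOTE: The mask2 variants merge the permute result with the index vector
; rather than with a data operand, so the vpermi2* form (which overwrites
; the index register) is expected, followed by a move out of zmm1.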
2336 define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
2337 ; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
2338 ; X86: # %bb.0: # %entry
2339 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2340 ; X86-NEXT: kmovw %eax, %k1
2341 ; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2342 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2345 ; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
2346 ; X64: # %bb.0: # %entry
2347 ; X64-NEXT: kmovw %edi, %k1
2348 ; X64-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2349 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2352 %0 = bitcast <8 x i64> %__A to <16 x i32>
2353 %1 = bitcast <8 x i64> %__I to <16 x i32>
2354 %2 = bitcast <8 x i64> %__B to <16 x i32>
2355 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2356 %4 = bitcast i16 %__U to <16 x i1>
2357 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
2358 %6 = bitcast <16 x i32> %5 to <8 x i64>
2362 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
2364 define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
2365 ; X86-LABEL: test_mm512_mask2_permutex2var_pd:
2366 ; X86: # %bb.0: # %entry
2367 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2368 ; X86-NEXT: kmovw %eax, %k1
2369 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2370 ; X86-NEXT: vmovapd %zmm1, %zmm0
2373 ; X64-LABEL: test_mm512_mask2_permutex2var_pd:
2374 ; X64: # %bb.0: # %entry
2375 ; X64-NEXT: kmovw %edi, %k1
2376 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2377 ; X64-NEXT: vmovapd %zmm1, %zmm0
2380 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2381 %1 = bitcast <8 x i64> %__I to <8 x double>
2382 %2 = bitcast i8 %__U to <8 x i1>
2383 %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
2387 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
2389 define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
2390 ; X86-LABEL: test_mm512_mask2_permutex2var_ps:
2391 ; X86: # %bb.0: # %entry
2392 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2393 ; X86-NEXT: kmovw %eax, %k1
2394 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2395 ; X86-NEXT: vmovaps %zmm1, %zmm0
2398 ; X64-LABEL: test_mm512_mask2_permutex2var_ps:
2399 ; X64: # %bb.0: # %entry
2400 ; X64-NEXT: kmovw %edi, %k1
2401 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2402 ; X64-NEXT: vmovaps %zmm1, %zmm0
2405 %0 = bitcast <8 x i64> %__I to <16 x i32>
2406 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2407 %2 = bitcast <8 x i64> %__I to <16 x float>
2408 %3 = bitcast i16 %__U to <16 x i1>
2409 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
2413 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2415 define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
2416 ; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
2417 ; X86: # %bb.0: # %entry
2418 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2419 ; X86-NEXT: kmovw %eax, %k1
2420 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2421 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2424 ; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
2425 ; X64: # %bb.0: # %entry
2426 ; X64-NEXT: kmovw %edi, %k1
2427 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2428 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2431 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2432 %1 = bitcast i8 %__U to <8 x i1>
2433 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
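; NOTE: When the result may instead clobber the first data operand (the
; unmasked and merge-into-%__A cases below), the vpermt2* form is expected,
; keeping the result in zmm0.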
2437 define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2438 ; CHECK-LABEL: test_mm512_permutex2var_epi32:
2439 ; CHECK: # %bb.0: # %entry
2440 ; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
2441 ; CHECK-NEXT: ret{{[l|q]}}
2443 %0 = bitcast <8 x i64> %__A to <16 x i32>
2444 %1 = bitcast <8 x i64> %__I to <16 x i32>
2445 %2 = bitcast <8 x i64> %__B to <16 x i32>
2446 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2447 %4 = bitcast <16 x i32> %3 to <8 x i64>
2451 define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2452 ; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
2453 ; X86: # %bb.0: # %entry
2454 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2455 ; X86-NEXT: kmovw %eax, %k1
2456 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2459 ; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
2460 ; X64: # %bb.0: # %entry
2461 ; X64-NEXT: kmovw %edi, %k1
2462 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2465 %0 = bitcast <8 x i64> %__A to <16 x i32>
2466 %1 = bitcast <8 x i64> %__I to <16 x i32>
2467 %2 = bitcast <8 x i64> %__B to <16 x i32>
2468 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2469 %4 = bitcast i16 %__U to <16 x i1>
2470 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2471 %6 = bitcast <16 x i32> %5 to <8 x i64>
2475 define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2476 ; X86-LABEL: test_mm512_mask_permutex2var_epi32:
2477 ; X86: # %bb.0: # %entry
2478 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2479 ; X86-NEXT: kmovw %eax, %k1
2480 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2483 ; X64-LABEL: test_mm512_mask_permutex2var_epi32:
2484 ; X64: # %bb.0: # %entry
2485 ; X64-NEXT: kmovw %edi, %k1
2486 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2489 %0 = bitcast <8 x i64> %__A to <16 x i32>
2490 %1 = bitcast <8 x i64> %__I to <16 x i32>
2491 %2 = bitcast <8 x i64> %__B to <16 x i32>
2492 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2493 %4 = bitcast i16 %__U to <16 x i1>
2494 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2495 %6 = bitcast <16 x i32> %5 to <8 x i64>
2499 define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2500 ; CHECK-LABEL: test_mm512_permutex2var_pd:
2501 ; CHECK: # %bb.0: # %entry
2502 ; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
2503 ; CHECK-NEXT: ret{{[l|q]}}
2505 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2509 define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
2510 ; X86-LABEL: test_mm512_mask_permutex2var_pd:
2511 ; X86: # %bb.0: # %entry
2512 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2513 ; X86-NEXT: kmovw %eax, %k1
2514 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2517 ; X64-LABEL: test_mm512_mask_permutex2var_pd:
2518 ; X64: # %bb.0: # %entry
2519 ; X64-NEXT: kmovw %edi, %k1
2520 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2523 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2524 %1 = bitcast i8 %__U to <8 x i1>
2525 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
2529 define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2530 ; X86-LABEL: test_mm512_maskz_permutex2var_pd:
2531 ; X86: # %bb.0: # %entry
2532 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2533 ; X86-NEXT: kmovw %eax, %k1
2534 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2537 ; X64-LABEL: test_mm512_maskz_permutex2var_pd:
2538 ; X64: # %bb.0: # %entry
2539 ; X64-NEXT: kmovw %edi, %k1
2540 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2543 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2544 %1 = bitcast i8 %__U to <8 x i1>
2545 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
2549 define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2550 ; CHECK-LABEL: test_mm512_permutex2var_ps:
2551 ; CHECK: # %bb.0: # %entry
2552 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
2553 ; CHECK-NEXT: ret{{[l|q]}}
2555 %0 = bitcast <8 x i64> %__I to <16 x i32>
2556 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2560 define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
2561 ; X86-LABEL: test_mm512_mask_permutex2var_ps:
2562 ; X86: # %bb.0: # %entry
2563 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2564 ; X86-NEXT: kmovw %eax, %k1
2565 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2568 ; X64-LABEL: test_mm512_mask_permutex2var_ps:
2569 ; X64: # %bb.0: # %entry
2570 ; X64-NEXT: kmovw %edi, %k1
2571 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2574 %0 = bitcast <8 x i64> %__I to <16 x i32>
2575 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2576 %2 = bitcast i16 %__U to <16 x i1>
2577 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
2581 define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2582 ; X86-LABEL: test_mm512_maskz_permutex2var_ps:
2583 ; X86: # %bb.0: # %entry
2584 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2585 ; X86-NEXT: kmovw %eax, %k1
2586 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2589 ; X64-LABEL: test_mm512_maskz_permutex2var_ps:
2590 ; X64: # %bb.0: # %entry
2591 ; X64-NEXT: kmovw %edi, %k1
2592 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2595 %0 = bitcast <8 x i64> %__I to <16 x i32>
2596 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2597 %2 = bitcast i16 %__U to <16 x i1>
2598 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2602 define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2603 ; CHECK-LABEL: test_mm512_permutex2var_epi64:
2604 ; CHECK: # %bb.0: # %entry
2605 ; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
2606 ; CHECK-NEXT: ret{{[l|q]}}
2608 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2612 define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2613 ; X86-LABEL: test_mm512_mask_permutex2var_epi64:
2614 ; X86: # %bb.0: # %entry
2615 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2616 ; X86-NEXT: kmovw %eax, %k1
2617 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2620 ; X64-LABEL: test_mm512_mask_permutex2var_epi64:
2621 ; X64: # %bb.0: # %entry
2622 ; X64-NEXT: kmovw %edi, %k1
2623 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2626 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2627 %1 = bitcast i8 %__U to <8 x i1>
2628 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2632 define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2633 ; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
2634 ; X86: # %bb.0: # %entry
2635 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2636 ; X86-NEXT: kmovw %eax, %k1
2637 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2640 ; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
2641 ; X64: # %bb.0: # %entry
2642 ; X64-NEXT: kmovw %edi, %k1
2643 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2646 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2647 %1 = bitcast i8 %__U to <8 x i1>
2648 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
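; NOTE: The scalar mask tests consume only bit 0 of the i8 mask. The IR
; models this either as an 'and i8 %__U, 1' feeding an icmp/select (the
; add/sub/mul tests) or as an extractelement of the bitcast <8 x i1> (the
; div tests).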
2651 define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2652 ; X86-LABEL: test_mm_mask_add_ss:
2653 ; X86: # %bb.0: # %entry
2654 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2655 ; X86-NEXT: kmovw %eax, %k1
2656 ; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2659 ; X64-LABEL: test_mm_mask_add_ss:
2660 ; X64: # %bb.0: # %entry
2661 ; X64-NEXT: kmovw %edi, %k1
2662 ; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2665 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2666 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2667 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2668 %0 = and i8 %__U, 1
2669 %tobool.i = icmp eq i8 %0, 0
2670 %vecext1.i = extractelement <4 x float> %__W, i32 0
2671 %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
2672 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2673 ret <4 x float> %vecins.i
2676 define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2677 ; X86-LABEL: test_mm_maskz_add_ss:
2678 ; X86: # %bb.0: # %entry
2679 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2680 ; X86-NEXT: kmovw %eax, %k1
2681 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2684 ; X64-LABEL: test_mm_maskz_add_ss:
2685 ; X64: # %bb.0: # %entry
2686 ; X64-NEXT: kmovw %edi, %k1
2687 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2690 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2691 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2692 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2693 %0 = and i8 %__U, 1
2694 %tobool.i = icmp eq i8 %0, 0
2695 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
2696 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2697 ret <4 x float> %vecins.i
2700 define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2701 ; X86-LABEL: test_mm_mask_add_sd:
2702 ; X86: # %bb.0: # %entry
2703 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2704 ; X86-NEXT: kmovw %eax, %k1
2705 ; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2708 ; X64-LABEL: test_mm_mask_add_sd:
2709 ; X64: # %bb.0: # %entry
2710 ; X64-NEXT: kmovw %edi, %k1
2711 ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2714 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2715 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2716 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2717 %0 = and i8 %__U, 1
2718 %tobool.i = icmp eq i8 %0, 0
2719 %vecext1.i = extractelement <2 x double> %__W, i32 0
2720 %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
2721 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2722 ret <2 x double> %vecins.i
2725 define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2726 ; X86-LABEL: test_mm_maskz_add_sd:
2727 ; X86: # %bb.0: # %entry
2728 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2729 ; X86-NEXT: kmovw %eax, %k1
2730 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2733 ; X64-LABEL: test_mm_maskz_add_sd:
2734 ; X64: # %bb.0: # %entry
2735 ; X64-NEXT: kmovw %edi, %k1
2736 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2739 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2740 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2741 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2742 %0 = and i8 %__U, 1
2743 %tobool.i = icmp eq i8 %0, 0
2744 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
2745 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2746 ret <2 x double> %vecins.i
2749 define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2750 ; X86-LABEL: test_mm_mask_sub_ss:
2751 ; X86: # %bb.0: # %entry
2752 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2753 ; X86-NEXT: kmovw %eax, %k1
2754 ; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2757 ; X64-LABEL: test_mm_mask_sub_ss:
2758 ; X64: # %bb.0: # %entry
2759 ; X64-NEXT: kmovw %edi, %k1
2760 ; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2763 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2764 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2765 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2766 %0 = and i8 %__U, 1
2767 %tobool.i = icmp eq i8 %0, 0
2768 %vecext1.i = extractelement <4 x float> %__W, i32 0
2769 %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
2770 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2771 ret <4 x float> %vecins.i
2774 define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2775 ; X86-LABEL: test_mm_maskz_sub_ss:
2776 ; X86: # %bb.0: # %entry
2777 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2778 ; X86-NEXT: kmovw %eax, %k1
2779 ; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2782 ; X64-LABEL: test_mm_maskz_sub_ss:
2783 ; X64: # %bb.0: # %entry
2784 ; X64-NEXT: kmovw %edi, %k1
2785 ; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2788 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2789 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2790 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2791 %0 = and i8 %__U, 1
2792 %tobool.i = icmp eq i8 %0, 0
2793 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
2794 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2795 ret <4 x float> %vecins.i
2798 define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2799 ; X86-LABEL: test_mm_mask_sub_sd:
2800 ; X86: # %bb.0: # %entry
2801 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2802 ; X86-NEXT: kmovw %eax, %k1
2803 ; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2806 ; X64-LABEL: test_mm_mask_sub_sd:
2807 ; X64: # %bb.0: # %entry
2808 ; X64-NEXT: kmovw %edi, %k1
2809 ; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2812 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2813 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2814 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2815 %0 = and i8 %__U, 1
2816 %tobool.i = icmp eq i8 %0, 0
2817 %vecext1.i = extractelement <2 x double> %__W, i32 0
2818 %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
2819 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2820 ret <2 x double> %vecins.i
2823 define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2824 ; X86-LABEL: test_mm_maskz_sub_sd:
2825 ; X86: # %bb.0: # %entry
2826 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2827 ; X86-NEXT: kmovw %eax, %k1
2828 ; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2831 ; X64-LABEL: test_mm_maskz_sub_sd:
2832 ; X64: # %bb.0: # %entry
2833 ; X64-NEXT: kmovw %edi, %k1
2834 ; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2837 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2838 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2839 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2840 %0 = and i8 %__U, 1
2841 %tobool.i = icmp eq i8 %0, 0
2842 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
2843 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2844 ret <2 x double> %vecins.i
2847 define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2848 ; X86-LABEL: test_mm_mask_mul_ss:
2849 ; X86: # %bb.0: # %entry
2850 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2851 ; X86-NEXT: kmovw %eax, %k1
2852 ; X86-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2855 ; X64-LABEL: test_mm_mask_mul_ss:
2856 ; X64: # %bb.0: # %entry
2857 ; X64-NEXT: kmovw %edi, %k1
2858 ; X64-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2861 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2862 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2863 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2864 %0 = and i8 %__U, 1
2865 %tobool.i = icmp eq i8 %0, 0
2866 %vecext1.i = extractelement <4 x float> %__W, i32 0
2867 %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
2868 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2869 ret <4 x float> %vecins.i
2872 define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2873 ; X86-LABEL: test_mm_maskz_mul_ss:
2874 ; X86: # %bb.0: # %entry
2875 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2876 ; X86-NEXT: kmovw %eax, %k1
2877 ; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2880 ; X64-LABEL: test_mm_maskz_mul_ss:
2881 ; X64: # %bb.0: # %entry
2882 ; X64-NEXT: kmovw %edi, %k1
2883 ; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2886 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2887 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2888 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2889 %0 = and i8 %__U, 1
2890 %tobool.i = icmp eq i8 %0, 0
2891 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
2892 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2893 ret <4 x float> %vecins.i
2896 define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2897 ; X86-LABEL: test_mm_mask_mul_sd:
2898 ; X86: # %bb.0: # %entry
2899 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2900 ; X86-NEXT: kmovw %eax, %k1
2901 ; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2904 ; X64-LABEL: test_mm_mask_mul_sd:
2905 ; X64: # %bb.0: # %entry
2906 ; X64-NEXT: kmovw %edi, %k1
2907 ; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2910 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2911 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2912 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2913 %0 = and i8 %__U, 1
2914 %tobool.i = icmp eq i8 %0, 0
2915 %vecext1.i = extractelement <2 x double> %__W, i32 0
2916 %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
2917 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2918 ret <2 x double> %vecins.i
2921 define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2922 ; X86-LABEL: test_mm_maskz_mul_sd:
2923 ; X86: # %bb.0: # %entry
2924 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2925 ; X86-NEXT: kmovw %eax, %k1
2926 ; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2929 ; X64-LABEL: test_mm_maskz_mul_sd:
2930 ; X64: # %bb.0: # %entry
2931 ; X64-NEXT: kmovw %edi, %k1
2932 ; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2935 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2936 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2937 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2938 %0 = and i8 %__U, 1
2939 %tobool.i = icmp eq i8 %0, 0
2940 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
2941 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2942 ret <2 x double> %vecins.i
2945 define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2946 ; X86-LABEL: test_mm_mask_div_ss:
2947 ; X86: # %bb.0: # %entry
2948 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2949 ; X86-NEXT: kmovw %eax, %k1
2950 ; X86-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1}
2953 ; X64-LABEL: test_mm_mask_div_ss:
2954 ; X64: # %bb.0: # %entry
2955 ; X64-NEXT: kmovw %edi, %k1
2956 ; X64-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1}
2959 %0 = extractelement <4 x float> %__A, i64 0
2960 %1 = extractelement <4 x float> %__B, i64 0
2961 %2 = extractelement <4 x float> %__W, i64 0
2962 %3 = fdiv float %0, %1
2963 %4 = bitcast i8 %__U to <8 x i1>
2964 %5 = extractelement <8 x i1> %4, i64 0
2965 %6 = select i1 %5, float %3, float %2
2966 %7 = insertelement <4 x float> %__A, float %6, i64 0
2970 define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2971 ; X86-LABEL: test_mm_maskz_div_ss:
2972 ; X86: # %bb.0: # %entry
2973 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2974 ; X86-NEXT: kmovw %eax, %k1
2975 ; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
2978 ; X64-LABEL: test_mm_maskz_div_ss:
2979 ; X64: # %bb.0: # %entry
2980 ; X64-NEXT: kmovw %edi, %k1
2981 ; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
2984 %0 = extractelement <4 x float> %__A, i64 0
2985 %1 = extractelement <4 x float> %__B, i64 0
2986 %2 = fdiv float %0, %1
2987 %3 = bitcast i8 %__U to <8 x i1>
2988 %4 = extractelement <8 x i1> %3, i64 0
2989 %5 = select i1 %4, float %2, float 0.000000e+00
2990 %6 = insertelement <4 x float> %__A, float %5, i64 0
2994 define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2995 ; X86-LABEL: test_mm_mask_div_sd:
2996 ; X86: # %bb.0: # %entry
2997 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2998 ; X86-NEXT: kmovw %eax, %k1
2999 ; X86-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1}
3002 ; X64-LABEL: test_mm_mask_div_sd:
3003 ; X64: # %bb.0: # %entry
3004 ; X64-NEXT: kmovw %edi, %k1
3005 ; X64-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1}
3008 %0 = extractelement <2 x double> %__A, i64 0
3009 %1 = extractelement <2 x double> %__B, i64 0
3010 %2 = extractelement <2 x double> %__W, i64 0
3011 %3 = fdiv double %0, %1
3012 %4 = bitcast i8 %__U to <8 x i1>
3013 %5 = extractelement <8 x i1> %4, i64 0
3014 %6 = select i1 %5, double %3, double %2
3015 %7 = insertelement <2 x double> %__A, double %6, i64 0
3019 define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
3020 ; X86-LABEL: test_mm_maskz_div_sd:
3021 ; X86: # %bb.0: # %entry
3022 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3023 ; X86-NEXT: kmovw %eax, %k1
3024 ; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
3027 ; X64-LABEL: test_mm_maskz_div_sd:
3028 ; X64: # %bb.0: # %entry
3029 ; X64-NEXT: kmovw %edi, %k1
3030 ; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
3033 %0 = extractelement <2 x double> %__A, i64 0
3034 %1 = extractelement <2 x double> %__B, i64 0
3035 %2 = fdiv double %0, %1
3036 %3 = bitcast i8 %__U to <8 x i1>
3037 %4 = extractelement <8 x i1> %3, i64 0
3038 %5 = select i1 %4, double %2, double 0.000000e+00
3039 %6 = insertelement <2 x double> %__A, double %5, i64 0
3040 ret <2 x double> %6
3041 }
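; The div tests above use the other scalar-mask idiom in this file: instead
; of and+icmp on the i8 mask, the mask is bitcast to <8 x i1> and bit 0 is
; extracted directly. Both forms are equivalent and lower to the same masked
; vdivss/vdivsd. A sketch of the zero-masking shape (illustrative name, not
; part of the autogenerated checks):
define <2 x double> @sketch_maskz_bitcast_div(i8 %u, <2 x double> %a, <2 x double> %b) {
entry:
  ; illustrative helper, not an autogenerated test
  %x = extractelement <2 x double> %a, i64 0
  %y = extractelement <2 x double> %b, i64 0
  %quot = fdiv double %x, %y
  %m = bitcast i8 %u to <8 x i1>            ; reinterpret the mask as 8 bits
  %bit0 = extractelement <8 x i1> %m, i64 0
  %sel = select i1 %bit0, double %quot, double 0.000000e+00
  %res = insertelement <2 x double> %a, double %sel, i64 0
  ret <2 x double> %res
}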
3044 define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3045 ; CHECK-LABEL: test_mm512_fmadd_round_pd:
3046 ; CHECK: # %bb.0: # %entry
3047 ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3048 ; CHECK-NEXT: ret{{[l|q]}}
3050 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3054 declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
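; The trailing i32 of @llvm.x86.avx512.vfmadd.pd.512 is the rounding-mode
; immediate from the C intrinsics: 8 encodes _MM_FROUND_TO_NEAREST_INT |
; _MM_FROUND_NO_EXC, which the assembly printer renders as {rn-sae}; 4 would
; be _MM_FROUND_CUR_DIRECTION. A minimal sketch of a direct call (the name
; is illustrative, not an autogenerated test):
define <8 x double> @sketch_fmadd_rn_sae(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
entry:
  ; round-to-nearest with suppress-all-exceptions
  %r = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 8)
  ret <8 x double> %r
}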
3056 define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3057 ; X86-LABEL: test_mm512_mask_fmadd_round_pd:
3058 ; X86: # %bb.0: # %entry
3059 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3060 ; X86-NEXT: kmovw %eax, %k1
3061 ; X86-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3064 ; X64-LABEL: test_mm512_mask_fmadd_round_pd:
3065 ; X64: # %bb.0: # %entry
3066 ; X64-NEXT: kmovw %edi, %k1
3067 ; X64-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3070 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3071 %1 = bitcast i8 %__U to <8 x i1>
3072 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3076 define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3077 ; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
3078 ; X86: # %bb.0: # %entry
3079 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3080 ; X86-NEXT: kmovw %eax, %k1
3081 ; X86-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3082 ; X86-NEXT: vmovapd %zmm2, %zmm0
3085 ; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
3086 ; X64: # %bb.0: # %entry
3087 ; X64-NEXT: kmovw %edi, %k1
3088 ; X64-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3089 ; X64-NEXT: vmovapd %zmm2, %zmm0
3092 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3093 %1 = bitcast i8 %__U to <8 x i1>
3094 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3095 ret <8 x double> %2
3096 }
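; The mask3 variants keep the addend as the passthrough: masked-off lanes of
; the select fall back to %__C, so llc accumulates with vfmadd231pd into
; %zmm2 and then copies the result to the return register, as the checks
; above show. Sketch (illustrative name, not an autogenerated test):
define <8 x double> @sketch_mask3_fmadd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %u) {
entry:
  ; illustrative helper, not an autogenerated test
  %fma = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 8)
  %m = bitcast i8 %u to <8 x i1>
  ; masked-off lanes keep %c, matching the 231 (accumulate-into-addend) form
  %r = select <8 x i1> %m, <8 x double> %fma, <8 x double> %c
  ret <8 x double> %r
}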
3098 define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3099 ; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
3100 ; X86: # %bb.0: # %entry
3101 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3102 ; X86-NEXT: kmovw %eax, %k1
3103 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3106 ; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
3107 ; X64: # %bb.0: # %entry
3108 ; X64-NEXT: kmovw %edi, %k1
3109 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3112 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3113 %1 = bitcast i8 %__U to <8 x i1>
3114 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3118 define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3119 ; X86-LABEL: test_mm512_fmsub_round_pd:
3120 ; X86: # %bb.0: # %entry
3121 ; X86-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
3122 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3125 ; X64-LABEL: test_mm512_fmsub_round_pd:
3126 ; X64: # %bb.0: # %entry
3127 ; X64-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
3128 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3131 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3132 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3133 ret <8 x double> %0
3134 }
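; There is no separate fmsub intrinsic in this file: fmsub, fnmadd, and
; fnmsub are all expressed as vfmadd with the relevant operand negated, and
; negation is written as an fsub from a splat of -0.0 (an IR-level fneg).
; The backend either folds the sign flip into a vfmsub variant, as in the
; masked tests below, or keeps an explicit vpxorq against the sign-bit
; constant, as in the unmasked test above. Sketch (illustrative name):
define <8 x double> @sketch_fmsub_as_fmadd(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
entry:
  ; flip the sign of the addend: a*b - c == a*b + (-c)
  %negc = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
  %r = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %negc, i32 8)
  ret <8 x double> %r
}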
3136 define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3137 ; X86-LABEL: test_mm512_mask_fmsub_round_pd:
3138 ; X86: # %bb.0: # %entry
3139 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3140 ; X86-NEXT: kmovw %eax, %k1
3141 ; X86-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3144 ; X64-LABEL: test_mm512_mask_fmsub_round_pd:
3145 ; X64: # %bb.0: # %entry
3146 ; X64-NEXT: kmovw %edi, %k1
3147 ; X64-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3150 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3151 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3152 %1 = bitcast i8 %__U to <8 x i1>
3153 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3157 define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3158 ; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
3159 ; X86: # %bb.0: # %entry
3160 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3161 ; X86-NEXT: kmovw %eax, %k1
3162 ; X86-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3165 ; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
3166 ; X64: # %bb.0: # %entry
3167 ; X64-NEXT: kmovw %edi, %k1
3168 ; X64-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3171 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3172 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3173 %1 = bitcast i8 %__U to <8 x i1>
3174 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3178 define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3179 ; X86-LABEL: test_mm512_fnmadd_round_pd:
3180 ; X86: # %bb.0: # %entry
3181 ; X86-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
3182 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3185 ; X64-LABEL: test_mm512_fnmadd_round_pd:
3186 ; X64: # %bb.0: # %entry
3187 ; X64-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
3188 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3191 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3192 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3196 define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3197 ; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
3198 ; X86: # %bb.0: # %entry
3199 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3200 ; X86-NEXT: kmovw %eax, %k1
3201 ; X86-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3202 ; X86-NEXT: vmovapd %zmm2, %zmm0
3205 ; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
3206 ; X64: # %bb.0: # %entry
3207 ; X64-NEXT: kmovw %edi, %k1
3208 ; X64-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3209 ; X64-NEXT: vmovapd %zmm2, %zmm0
3212 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3213 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3214 %1 = bitcast i8 %__U to <8 x i1>
3215 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3219 define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3220 ; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
3221 ; X86: # %bb.0: # %entry
3222 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3223 ; X86-NEXT: kmovw %eax, %k1
3224 ; X86-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3227 ; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
3228 ; X64: # %bb.0: # %entry
3229 ; X64-NEXT: kmovw %edi, %k1
3230 ; X64-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3233 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3234 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3235 %1 = bitcast i8 %__U to <8 x i1>
3236 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3240 define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3241 ; CHECK-LABEL: test_mm512_fnmsub_round_pd:
3242 ; CHECK: # %bb.0: # %entry
3243 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3244 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3245 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3246 ; CHECK-NEXT: vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
3247 ; CHECK-NEXT: ret{{[l|q]}}
3249 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3250 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3251 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
3252 ret <8 x double> %0
3253 }
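; fnmsub negates both the multiplicand and the addend, so the unmasked form
; above broadcasts the sign-mask constant once and performs two vpxorq flips
; before a plain vfmadd; with a mask (see the maskz test below) the
; negations instead fold into vfnmsub213pd. Sketch (illustrative name):
define <8 x double> @sketch_fnmsub_as_fmadd(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
entry:
  ; -(a*b) - c == (-a)*b + (-c)
  %nega = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a
  %negc = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
  %r = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %nega, <8 x double> %b, <8 x double> %negc, i32 8)
  ret <8 x double> %r
}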
3255 define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3256 ; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
3257 ; X86: # %bb.0: # %entry
3258 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3259 ; X86-NEXT: kmovw %eax, %k1
3260 ; X86-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3263 ; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
3264 ; X64: # %bb.0: # %entry
3265 ; X64-NEXT: kmovw %edi, %k1
3266 ; X64-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3269 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3270 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3271 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
3272 %1 = bitcast i8 %__U to <8 x i1>
3273 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3277 define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3278 ; CHECK-LABEL: test_mm512_fmadd_pd:
3279 ; CHECK: # %bb.0: # %entry
3280 ; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3281 ; CHECK-NEXT: ret{{[l|q]}}
3283 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3284 ret <8 x double> %0
3285 }
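; test_mm512_fmadd_pd above and the tests that follow use the generic
; @llvm.fma.v8f64 intrinsic (assumed declared elsewhere in this file, since
; the surrounding tests call it) rather than the rounded target intrinsic;
; masking is still a plain vector select on the <8 x i1> mask. Sketch of
; the masked shape (illustrative name, not an autogenerated test):
define <8 x double> @sketch_mask_fma(<8 x double> %acc, i8 %u, <8 x double> %b, <8 x double> %c) {
entry:
  %fma = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %acc, <8 x double> %b, <8 x double> %c)
  %m = bitcast i8 %u to <8 x i1>
  ; masked-off lanes keep the accumulator, matching vfmadd132pd {%k1}
  %r = select <8 x i1> %m, <8 x double> %fma, <8 x double> %acc
  ret <8 x double> %r
}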
3287 define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3288 ; X86-LABEL: test_mm512_mask_fmadd_pd:
3289 ; X86: # %bb.0: # %entry
3290 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3291 ; X86-NEXT: kmovw %eax, %k1
3292 ; X86-NEXT: vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
3295 ; X64-LABEL: test_mm512_mask_fmadd_pd:
3296 ; X64: # %bb.0: # %entry
3297 ; X64-NEXT: kmovw %edi, %k1
3298 ; X64-NEXT: vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
3301 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3302 %1 = bitcast i8 %__U to <8 x i1>
3303 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3307 define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3308 ; X86-LABEL: test_mm512_mask3_fmadd_pd:
3309 ; X86: # %bb.0: # %entry
3310 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3311 ; X86-NEXT: kmovw %eax, %k1
3312 ; X86-NEXT: vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
3313 ; X86-NEXT: vmovapd %zmm2, %zmm0
3316 ; X64-LABEL: test_mm512_mask3_fmadd_pd:
3317 ; X64: # %bb.0: # %entry
3318 ; X64-NEXT: kmovw %edi, %k1
3319 ; X64-NEXT: vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
3320 ; X64-NEXT: vmovapd %zmm2, %zmm0
3323 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3324 %1 = bitcast i8 %__U to <8 x i1>
3325 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3329 define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3330 ; X86-LABEL: test_mm512_maskz_fmadd_pd:
3331 ; X86: # %bb.0: # %entry
3332 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3333 ; X86-NEXT: kmovw %eax, %k1
3334 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
3337 ; X64-LABEL: test_mm512_maskz_fmadd_pd:
3338 ; X64: # %bb.0: # %entry
3339 ; X64-NEXT: kmovw %edi, %k1
3340 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
3343 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3344 %1 = bitcast i8 %__U to <8 x i1>
3345 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3349 define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3350 ; X86-LABEL: test_mm512_fmsub_pd:
3351 ; X86: # %bb.0: # %entry
3352 ; X86-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
3353 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3356 ; X64-LABEL: test_mm512_fmsub_pd:
3357 ; X64: # %bb.0: # %entry
3358 ; X64-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
3359 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3362 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3363 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3367 define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3368 ; X86-LABEL: test_mm512_mask_fmsub_pd:
3369 ; X86: # %bb.0: # %entry
3370 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3371 ; X86-NEXT: kmovw %eax, %k1
3372 ; X86-NEXT: vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
3375 ; X64-LABEL: test_mm512_mask_fmsub_pd:
3376 ; X64: # %bb.0: # %entry
3377 ; X64-NEXT: kmovw %edi, %k1
3378 ; X64-NEXT: vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
3381 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3382 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3383 %1 = bitcast i8 %__U to <8 x i1>
3384 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3388 define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3389 ; X86-LABEL: test_mm512_maskz_fmsub_pd:
3390 ; X86: # %bb.0: # %entry
3391 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3392 ; X86-NEXT: kmovw %eax, %k1
3393 ; X86-NEXT: vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
3396 ; X64-LABEL: test_mm512_maskz_fmsub_pd:
3397 ; X64: # %bb.0: # %entry
3398 ; X64-NEXT: kmovw %edi, %k1
3399 ; X64-NEXT: vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
3402 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3403 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3404 %1 = bitcast i8 %__U to <8 x i1>
3405 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3409 define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3410 ; X86-LABEL: test_mm512_fnmadd_pd:
3411 ; X86: # %bb.0: # %entry
3412 ; X86-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
3413 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3416 ; X64-LABEL: test_mm512_fnmadd_pd:
3417 ; X64: # %bb.0: # %entry
3418 ; X64-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
3419 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3422 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3423 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3427 define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3428 ; X86-LABEL: test_mm512_mask3_fnmadd_pd:
3429 ; X86: # %bb.0: # %entry
3430 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3431 ; X86-NEXT: kmovw %eax, %k1
3432 ; X86-NEXT: vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
3433 ; X86-NEXT: vmovapd %zmm2, %zmm0
3436 ; X64-LABEL: test_mm512_mask3_fnmadd_pd:
3437 ; X64: # %bb.0: # %entry
3438 ; X64-NEXT: kmovw %edi, %k1
3439 ; X64-NEXT: vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
3440 ; X64-NEXT: vmovapd %zmm2, %zmm0
3443 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3444 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3445 %1 = bitcast i8 %__U to <8 x i1>
3446 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3450 define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3451 ; X86-LABEL: test_mm512_maskz_fnmadd_pd:
3452 ; X86: # %bb.0: # %entry
3453 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3454 ; X86-NEXT: kmovw %eax, %k1
3455 ; X86-NEXT: vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
3458 ; X64-LABEL: test_mm512_maskz_fnmadd_pd:
3459 ; X64: # %bb.0: # %entry
3460 ; X64-NEXT: kmovw %edi, %k1
3461 ; X64-NEXT: vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
3464 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3465 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3466 %1 = bitcast i8 %__U to <8 x i1>
3467 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3471 define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3472 ; CHECK-LABEL: test_mm512_fnmsub_pd:
3473 ; CHECK: # %bb.0: # %entry
3474 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3475 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3476 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3477 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
3478 ; CHECK-NEXT: ret{{[l|q]}}
3480 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3481 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3482 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
3486 define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3487 ; X86-LABEL: test_mm512_maskz_fnmsub_pd:
3488 ; X86: # %bb.0: # %entry
3489 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3490 ; X86-NEXT: kmovw %eax, %k1
3491 ; X86-NEXT: vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
3494 ; X64-LABEL: test_mm512_maskz_fnmsub_pd:
3495 ; X64: # %bb.0: # %entry
3496 ; X64-NEXT: kmovw %edi, %k1
3497 ; X64-NEXT: vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
3500 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3501 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3502 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
3503 %1 = bitcast i8 %__U to <8 x i1>
3504 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3508 define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3509 ; CHECK-LABEL: test_mm512_fmadd_round_ps:
3510 ; CHECK: # %bb.0: # %entry
3511 ; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3512 ; CHECK-NEXT: ret{{[l|q]}}
3514 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3518 declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
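; The ps variants mirror the pd ones exactly, with a 16-bit mask: the i16 is
; bitcast to <16 x i1> in IR and moved into %k1 with kmovw in the checks.
; Sketch of the masked rounded form, as the following tests exercise it
; (illustrative name, not an autogenerated test):
define <16 x float> @sketch_mask_fmadd_round_ps(<16 x float> %acc, i16 %u, <16 x float> %b, <16 x float> %c) {
entry:
  %fma = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %acc, <16 x float> %b, <16 x float> %c, i32 8)
  %m = bitcast i16 %u to <16 x i1>
  %r = select <16 x i1> %m, <16 x float> %fma, <16 x float> %acc
  ret <16 x float> %r
}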
3520 define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3521 ; X86-LABEL: test_mm512_mask_fmadd_round_ps:
3522 ; X86: # %bb.0: # %entry
3523 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3524 ; X86-NEXT: kmovw %eax, %k1
3525 ; X86-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3528 ; X64-LABEL: test_mm512_mask_fmadd_round_ps:
3529 ; X64: # %bb.0: # %entry
3530 ; X64-NEXT: kmovw %edi, %k1
3531 ; X64-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3534 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3535 %1 = bitcast i16 %__U to <16 x i1>
3536 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3540 define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3541 ; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
3542 ; X86: # %bb.0: # %entry
3543 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3544 ; X86-NEXT: kmovw %eax, %k1
3545 ; X86-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3546 ; X86-NEXT: vmovaps %zmm2, %zmm0
3549 ; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
3550 ; X64: # %bb.0: # %entry
3551 ; X64-NEXT: kmovw %edi, %k1
3552 ; X64-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3553 ; X64-NEXT: vmovaps %zmm2, %zmm0
3556 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3557 %1 = bitcast i16 %__U to <16 x i1>
3558 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3562 define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3563 ; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
3564 ; X86: # %bb.0: # %entry
3565 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3566 ; X86-NEXT: kmovw %eax, %k1
3567 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3570 ; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
3571 ; X64: # %bb.0: # %entry
3572 ; X64-NEXT: kmovw %edi, %k1
3573 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3576 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3577 %1 = bitcast i16 %__U to <16 x i1>
3578 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3582 define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3583 ; X86-LABEL: test_mm512_fmsub_round_ps:
3584 ; X86: # %bb.0: # %entry
3585 ; X86-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
3586 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3589 ; X64-LABEL: test_mm512_fmsub_round_ps:
3590 ; X64: # %bb.0: # %entry
3591 ; X64-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
3592 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3595 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3596 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3600 define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3601 ; X86-LABEL: test_mm512_mask_fmsub_round_ps:
3602 ; X86: # %bb.0: # %entry
3603 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3604 ; X86-NEXT: kmovw %eax, %k1
3605 ; X86-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3608 ; X64-LABEL: test_mm512_mask_fmsub_round_ps:
3609 ; X64: # %bb.0: # %entry
3610 ; X64-NEXT: kmovw %edi, %k1
3611 ; X64-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3614 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3615 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3616 %1 = bitcast i16 %__U to <16 x i1>
3617 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3621 define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3622 ; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
3623 ; X86: # %bb.0: # %entry
3624 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3625 ; X86-NEXT: kmovw %eax, %k1
3626 ; X86-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3629 ; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
3630 ; X64: # %bb.0: # %entry
3631 ; X64-NEXT: kmovw %edi, %k1
3632 ; X64-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3635 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3636 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3637 %1 = bitcast i16 %__U to <16 x i1>
3638 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3642 define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3643 ; X86-LABEL: test_mm512_fnmadd_round_ps:
3644 ; X86: # %bb.0: # %entry
3645 ; X86-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
3646 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3649 ; X64-LABEL: test_mm512_fnmadd_round_ps:
3650 ; X64: # %bb.0: # %entry
3651 ; X64-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
3652 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3655 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3656 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3660 define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3661 ; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
3662 ; X86: # %bb.0: # %entry
3663 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3664 ; X86-NEXT: kmovw %eax, %k1
3665 ; X86-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3666 ; X86-NEXT: vmovaps %zmm2, %zmm0
3669 ; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
3670 ; X64: # %bb.0: # %entry
3671 ; X64-NEXT: kmovw %edi, %k1
3672 ; X64-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3673 ; X64-NEXT: vmovaps %zmm2, %zmm0
3676 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3677 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3678 %1 = bitcast i16 %__U to <16 x i1>
3679 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3683 define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3684 ; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
3685 ; X86: # %bb.0: # %entry
3686 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3687 ; X86-NEXT: kmovw %eax, %k1
3688 ; X86-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3691 ; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
3692 ; X64: # %bb.0: # %entry
3693 ; X64-NEXT: kmovw %edi, %k1
3694 ; X64-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3697 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3698 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3699 %1 = bitcast i16 %__U to <16 x i1>
3700 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3704 define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3705 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
3706 ; CHECK: # %bb.0: # %entry
3707 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3708 ; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4
3709 ; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0
3710 ; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
3711 ; CHECK-NEXT: ret{{[l|q]}}
3713 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3714 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3715 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
3719 define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3720 ; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
3721 ; X86: # %bb.0: # %entry
3722 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3723 ; X86-NEXT: kmovw %eax, %k1
3724 ; X86-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3727 ; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
3728 ; X64: # %bb.0: # %entry
3729 ; X64-NEXT: kmovw %edi, %k1
3730 ; X64-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3733 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3734 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3735 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
3736 %1 = bitcast i16 %__U to <16 x i1>
3737 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3741 define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3742 ; CHECK-LABEL: test_mm512_fmadd_ps:
3743 ; CHECK: # %bb.0: # %entry
3744 ; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3745 ; CHECK-NEXT: ret{{[l|q]}}
3747 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3751 define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3752 ; X86-LABEL: test_mm512_mask_fmadd_ps:
3753 ; X86: # %bb.0: # %entry
3754 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3755 ; X86-NEXT: kmovw %eax, %k1
3756 ; X86-NEXT: vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
3759 ; X64-LABEL: test_mm512_mask_fmadd_ps:
3760 ; X64: # %bb.0: # %entry
3761 ; X64-NEXT: kmovw %edi, %k1
3762 ; X64-NEXT: vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
3765 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3766 %1 = bitcast i16 %__U to <16 x i1>
3767 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3771 define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3772 ; X86-LABEL: test_mm512_mask3_fmadd_ps:
3773 ; X86: # %bb.0: # %entry
3774 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3775 ; X86-NEXT: kmovw %eax, %k1
3776 ; X86-NEXT: vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
3777 ; X86-NEXT: vmovaps %zmm2, %zmm0
3780 ; X64-LABEL: test_mm512_mask3_fmadd_ps:
3781 ; X64: # %bb.0: # %entry
3782 ; X64-NEXT: kmovw %edi, %k1
3783 ; X64-NEXT: vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
3784 ; X64-NEXT: vmovaps %zmm2, %zmm0
3787 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3788 %1 = bitcast i16 %__U to <16 x i1>
3789 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3793 define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3794 ; X86-LABEL: test_mm512_maskz_fmadd_ps:
3795 ; X86: # %bb.0: # %entry
3796 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3797 ; X86-NEXT: kmovw %eax, %k1
3798 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
3801 ; X64-LABEL: test_mm512_maskz_fmadd_ps:
3802 ; X64: # %bb.0: # %entry
3803 ; X64-NEXT: kmovw %edi, %k1
3804 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
3807 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3808 %1 = bitcast i16 %__U to <16 x i1>
3809 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3813 define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3814 ; X86-LABEL: test_mm512_fmsub_ps:
3815 ; X86: # %bb.0: # %entry
3816 ; X86-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
3817 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3820 ; X64-LABEL: test_mm512_fmsub_ps:
3821 ; X64: # %bb.0: # %entry
3822 ; X64-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
3823 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3826 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3827 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3831 define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3832 ; X86-LABEL: test_mm512_mask_fmsub_ps:
3833 ; X86: # %bb.0: # %entry
3834 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3835 ; X86-NEXT: kmovw %eax, %k1
3836 ; X86-NEXT: vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
3839 ; X64-LABEL: test_mm512_mask_fmsub_ps:
3840 ; X64: # %bb.0: # %entry
3841 ; X64-NEXT: kmovw %edi, %k1
3842 ; X64-NEXT: vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
3845 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3846 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3847 %1 = bitcast i16 %__U to <16 x i1>
3848 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3852 define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3853 ; X86-LABEL: test_mm512_maskz_fmsub_ps:
3854 ; X86: # %bb.0: # %entry
3855 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3856 ; X86-NEXT: kmovw %eax, %k1
3857 ; X86-NEXT: vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
3860 ; X64-LABEL: test_mm512_maskz_fmsub_ps:
3861 ; X64: # %bb.0: # %entry
3862 ; X64-NEXT: kmovw %edi, %k1
3863 ; X64-NEXT: vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
3866 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3867 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3868 %1 = bitcast i16 %__U to <16 x i1>
3869 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3873 define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3874 ; X86-LABEL: test_mm512_fnmadd_ps:
3875 ; X86: # %bb.0: # %entry
3876 ; X86-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
3877 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3880 ; X64-LABEL: test_mm512_fnmadd_ps:
3881 ; X64: # %bb.0: # %entry
3882 ; X64-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
3883 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3886 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3887 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3891 define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3892 ; X86-LABEL: test_mm512_mask3_fnmadd_ps:
3893 ; X86: # %bb.0: # %entry
3894 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3895 ; X86-NEXT: kmovw %eax, %k1
3896 ; X86-NEXT: vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
3897 ; X86-NEXT: vmovaps %zmm2, %zmm0
3900 ; X64-LABEL: test_mm512_mask3_fnmadd_ps:
3901 ; X64: # %bb.0: # %entry
3902 ; X64-NEXT: kmovw %edi, %k1
3903 ; X64-NEXT: vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
3904 ; X64-NEXT: vmovaps %zmm2, %zmm0
3907 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3908 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3909 %1 = bitcast i16 %__U to <16 x i1>
3910 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3914 define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3915 ; X86-LABEL: test_mm512_maskz_fnmadd_ps:
3916 ; X86: # %bb.0: # %entry
3917 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3918 ; X86-NEXT: kmovw %eax, %k1
3919 ; X86-NEXT: vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
3922 ; X64-LABEL: test_mm512_maskz_fnmadd_ps:
3923 ; X64: # %bb.0: # %entry
3924 ; X64-NEXT: kmovw %edi, %k1
3925 ; X64-NEXT: vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
3928 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3929 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3930 %1 = bitcast i16 %__U to <16 x i1>
3931 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3935 define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3936 ; CHECK-LABEL: test_mm512_fnmsub_ps:
3937 ; CHECK: # %bb.0: # %entry
3938 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3939 ; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4
3940 ; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0
3941 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
3942 ; CHECK-NEXT: ret{{[l|q]}}
3944 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3945 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3946 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
3950 define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3951 ; X86-LABEL: test_mm512_maskz_fnmsub_ps:
3952 ; X86: # %bb.0: # %entry
3953 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3954 ; X86-NEXT: kmovw %eax, %k1
3955 ; X86-NEXT: vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
3958 ; X64-LABEL: test_mm512_maskz_fnmsub_ps:
3959 ; X64: # %bb.0: # %entry
3960 ; X64-NEXT: kmovw %edi, %k1
3961 ; X64-NEXT: vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
3964 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3965 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3966 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
3967 %1 = bitcast i16 %__U to <16 x i1>
3968 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3972 define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3973 ; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
3974 ; CHECK: # %bb.0: # %entry
3975 ; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3976 ; CHECK-NEXT: ret{{[l|q]}}
3978 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3982 declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
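; Per the VFMADDSUB semantics, the lanes alternate: even-indexed elements
; compute a*b - c and odd-indexed elements a*b + c (vfmsubadd is the
; reverse, which is why the fmsubadd tests below negate %__C and reuse this
; intrinsic). The i32 operand is the same rounding immediate as above.
; Sketch (illustrative name, not an autogenerated test):
define <8 x double> @sketch_fmaddsub_rn(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
entry:
  ; even lanes subtract %c, odd lanes add it, all under {rn-sae}
  %r = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 8)
  ret <8 x double> %r
}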
3984 define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3985 ; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
3986 ; X86: # %bb.0: # %entry
3987 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3988 ; X86-NEXT: kmovw %eax, %k1
3989 ; X86-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3992 ; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
3993 ; X64: # %bb.0: # %entry
3994 ; X64-NEXT: kmovw %edi, %k1
3995 ; X64-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3998 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3999 %1 = bitcast i8 %__U to <8 x i1>
4000 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4004 define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4005 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
4006 ; X86: # %bb.0: # %entry
4007 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4008 ; X86-NEXT: kmovw %eax, %k1
4009 ; X86-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4010 ; X86-NEXT: vmovapd %zmm2, %zmm0
4013 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
4014 ; X64: # %bb.0: # %entry
4015 ; X64-NEXT: kmovw %edi, %k1
4016 ; X64-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4017 ; X64-NEXT: vmovapd %zmm2, %zmm0
4020 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
4021 %1 = bitcast i8 %__U to <8 x i1>
4022 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4026 define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4027 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
4028 ; X86: # %bb.0: # %entry
4029 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4030 ; X86-NEXT: kmovw %eax, %k1
4031 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4034 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
4035 ; X64: # %bb.0: # %entry
4036 ; X64-NEXT: kmovw %edi, %k1
4037 ; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4040 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
4041 %1 = bitcast i8 %__U to <8 x i1>
4042 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
4046 define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4047 ; X86-LABEL: test_mm512_fmsubadd_round_pd:
4048 ; X86: # %bb.0: # %entry
4049 ; X86-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
4050 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
4053 ; X64-LABEL: test_mm512_fmsubadd_round_pd:
4054 ; X64: # %bb.0: # %entry
4055 ; X64-NEXT: vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
4056 ; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
4059 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4060 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4064 define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4065 ; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
4066 ; X86: # %bb.0: # %entry
4067 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4068 ; X86-NEXT: kmovw %eax, %k1
4069 ; X86-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4072 ; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
4073 ; X64: # %bb.0: # %entry
4074 ; X64-NEXT: kmovw %edi, %k1
4075 ; X64-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4078 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4079 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4080 %1 = bitcast i8 %__U to <8 x i1>
4081 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4085 define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4086 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
4087 ; X86: # %bb.0: # %entry
4088 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4089 ; X86-NEXT: kmovw %eax, %k1
4090 ; X86-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4093 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
4094 ; X64: # %bb.0: # %entry
4095 ; X64-NEXT: kmovw %edi, %k1
4096 ; X64-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4099 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4100 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4101 %1 = bitcast i8 %__U to <8 x i1>
4102 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
4106 define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4107 ; CHECK-LABEL: test_mm512_fmaddsub_pd:
4108 ; CHECK: # %bb.0: # %entry
4109 ; CHECK-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4110 ; CHECK-NEXT: ret{{[l|q]}}
4112 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4113 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4114 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4115 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4116 ret <8 x double> %3
4117 }
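; Without a rounding argument, fmaddsub is modeled with two generic fmas and
; a blend: in the test above, %2 computes a*b - c, %0 computes a*b + c, and
; the shufflevector takes even lanes from %2 and odd lanes from %0 (indices
; 8-15 address the second operand). The backend recognizes the pattern as a
; single vfmaddsub213pd. Sketch (illustrative name, not an autogenerated
; test):
define <8 x double> @sketch_fmaddsub_blend(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
entry:
  %add = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c)
  %negc = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
  %sub = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %negc)
  ; even lanes from %sub, odd lanes from %add
  %r = shufflevector <8 x double> %sub, <8 x double> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %r
}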
4119 define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4120 ; X86-LABEL: test_mm512_mask_fmaddsub_pd:
4121 ; X86: # %bb.0: # %entry
4122 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4123 ; X86-NEXT: kmovw %eax, %k1
4124 ; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4127 ; X64-LABEL: test_mm512_mask_fmaddsub_pd:
4128 ; X64: # %bb.0: # %entry
4129 ; X64-NEXT: kmovw %edi, %k1
4130 ; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4131 ; X64-NEXT: retq
4132 entry:
4133 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4134 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4135 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4136 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4137 %4 = bitcast i8 %__U to <8 x i1>
4138 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
4139 ret <8 x double> %5
4140 }
4142 define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4143 ; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
4144 ; X86: # %bb.0: # %entry
4145 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4146 ; X86-NEXT: kmovw %eax, %k1
4147 ; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4148 ; X86-NEXT: vmovapd %zmm2, %zmm0
4149 ; X86-NEXT: retl
4150 ;
4151 ; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
4152 ; X64: # %bb.0: # %entry
4153 ; X64-NEXT: kmovw %edi, %k1
4154 ; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4155 ; X64-NEXT: vmovapd %zmm2, %zmm0
4156 ; X64-NEXT: retq
4157 entry:
4158 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4159 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4160 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4161 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4162 %4 = bitcast i8 %__U to <8 x i1>
4163 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
4164 ret <8 x double> %5
4165 }
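; NOTE: Editorial sketch, not a test and not autogenerated: mask3_* differs
; from mask_* only in the select's false operand, merging into the addend:
;   %r = select <8 x i1> %m, <8 x double> %fma_result, <8 x double> %__C
; which is why the expected code picks the 231 form (result lands in the
; addend register zmm2) followed by a vmovapd into the zmm0 return register.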
4167 define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4168 ; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
4169 ; X86: # %bb.0: # %entry
4170 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4171 ; X86-NEXT: kmovw %eax, %k1
4172 ; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4173 ; X86-NEXT: retl
4174 ;
4175 ; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
4176 ; X64: # %bb.0: # %entry
4177 ; X64-NEXT: kmovw %edi, %k1
4178 ; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4179 ; X64-NEXT: retq
4180 entry:
4181 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4182 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4183 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4184 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4185 %4 = bitcast i8 %__U to <8 x i1>
4186 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
4187 ret <8 x double> %5
4188 }
4190 define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4191 ; CHECK-LABEL: test_mm512_fmsubadd_pd:
4192 ; CHECK: # %bb.0: # %entry
4193 ; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4194 ; CHECK-NEXT: ret{{[l|q]}}
4195 entry:
4196 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4197 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4198 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4199 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4200 ret <8 x double> %2
4201 }
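; NOTE: Editorial sketch, not a test and not autogenerated: fmsubadd is the
; mirror image of fmaddsub -- the same two @llvm.fma calls, but the
; shufflevector operands are swapped, so even lanes add and odd lanes
; subtract (shorthand, not verbatim IR):
;   %r = shufflevector %add, %sub, <0, 9, 2, 11, 4, 13, 6, 15>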
4203 define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4204 ; X86-LABEL: test_mm512_mask_fmsubadd_pd:
4205 ; X86: # %bb.0: # %entry
4206 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4207 ; X86-NEXT: kmovw %eax, %k1
4208 ; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4209 ; X86-NEXT: retl
4210 ;
4211 ; X64-LABEL: test_mm512_mask_fmsubadd_pd:
4212 ; X64: # %bb.0: # %entry
4213 ; X64-NEXT: kmovw %edi, %k1
4214 ; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4215 ; X64-NEXT: retq
4216 entry:
4217 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4218 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4219 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4220 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4221 %3 = bitcast i8 %__U to <8 x i1>
4222 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
4223 ret <8 x double> %4
4224 }
4226 define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4227 ; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
4228 ; X86: # %bb.0: # %entry
4229 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4230 ; X86-NEXT: kmovw %eax, %k1
4231 ; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4232 ; X86-NEXT: retl
4233 ;
4234 ; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
4235 ; X64: # %bb.0: # %entry
4236 ; X64-NEXT: kmovw %edi, %k1
4237 ; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4238 ; X64-NEXT: retq
4239 entry:
4240 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4241 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4242 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4243 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4244 %3 = bitcast i8 %__U to <8 x i1>
4245 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
4246 ret <8 x double> %4
4247 }
4249 define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4250 ; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
4251 ; CHECK: # %bb.0: # %entry
4252 ; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4253 ; CHECK-NEXT: ret{{[l|q]}}
4254 entry:
4255 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4256 ret <16 x float> %0
4257 }
4259 declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
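; NOTE: Editorial note, not autogenerated: the trailing i32 8 passed to the
; @llvm.x86.avx512.* intrinsics in this file is the rounding immediate,
; round-to-nearest-even with suppress-all-exceptions, which llc prints as the
; {rn-sae} operand in the expected assembly; i32 4 (current direction) would
; select the non-static-rounding forms instead.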
4261 define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4262 ; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
4263 ; X86: # %bb.0: # %entry
4264 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4265 ; X86-NEXT: kmovw %eax, %k1
4266 ; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4267 ; X86-NEXT: retl
4268 ;
4269 ; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
4270 ; X64: # %bb.0: # %entry
4271 ; X64-NEXT: kmovw %edi, %k1
4272 ; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4273 ; X64-NEXT: retq
4274 entry:
4275 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4276 %1 = bitcast i16 %__U to <16 x i1>
4277 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4278 ret <16 x float> %2
4279 }
4281 define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4282 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4283 ; X86: # %bb.0: # %entry
4284 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4285 ; X86-NEXT: kmovw %eax, %k1
4286 ; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4287 ; X86-NEXT: vmovaps %zmm2, %zmm0
4288 ; X86-NEXT: retl
4289 ;
4290 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4291 ; X64: # %bb.0: # %entry
4292 ; X64-NEXT: kmovw %edi, %k1
4293 ; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4294 ; X64-NEXT: vmovaps %zmm2, %zmm0
4295 ; X64-NEXT: retq
4296 entry:
4297 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4298 %1 = bitcast i16 %__U to <16 x i1>
4299 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4300 ret <16 x float> %2
4301 }
4303 define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4304 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4305 ; X86: # %bb.0: # %entry
4306 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4307 ; X86-NEXT: kmovw %eax, %k1
4308 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4309 ; X86-NEXT: retl
4310 ;
4311 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4312 ; X64: # %bb.0: # %entry
4313 ; X64-NEXT: kmovw %edi, %k1
4314 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4315 ; X64-NEXT: retq
4316 entry:
4317 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4318 %1 = bitcast i16 %__U to <16 x i1>
4319 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
4320 ret <16 x float> %2
4321 }
4323 define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4324 ; X86-LABEL: test_mm512_fmsubadd_round_ps:
4325 ; X86: # %bb.0: # %entry
4326 ; X86-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
4327 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4328 ; X86-NEXT: retl
4329 ;
4330 ; X64-LABEL: test_mm512_fmsubadd_round_ps:
4331 ; X64: # %bb.0: # %entry
4332 ; X64-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
4333 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4334 ; X64-NEXT: retq
4335 entry:
4336 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4337 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4338 ret <16 x float> %0
4339 }
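; NOTE: Editorial note, not autogenerated: in this unmasked rounding case the
; fsub-based negation of %__C is apparently not folded into the intrinsic, so
; the compiler flips the sign bits itself (vpxord against a broadcast -0.0
; constant) and then emits vfmaddsub; the i386 and x86-64 runs differ only in
; how that constant-pool entry is addressed (absolute vs. RIP-relative).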
4341 define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4342 ; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
4343 ; X86: # %bb.0: # %entry
4344 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4345 ; X86-NEXT: kmovw %eax, %k1
4346 ; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4347 ; X86-NEXT: retl
4348 ;
4349 ; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
4350 ; X64: # %bb.0: # %entry
4351 ; X64-NEXT: kmovw %edi, %k1
4352 ; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4353 ; X64-NEXT: retq
4354 entry:
4355 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4356 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4357 %1 = bitcast i16 %__U to <16 x i1>
4358 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4359 ret <16 x float> %2
4360 }
4362 define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4363 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4364 ; X86: # %bb.0: # %entry
4365 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4366 ; X86-NEXT: kmovw %eax, %k1
4367 ; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4368 ; X86-NEXT: retl
4369 ;
4370 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4371 ; X64: # %bb.0: # %entry
4372 ; X64-NEXT: kmovw %edi, %k1
4373 ; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4374 ; X64-NEXT: retq
4375 entry:
4376 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4377 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4378 %1 = bitcast i16 %__U to <16 x i1>
4379 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
4380 ret <16 x float> %2
4381 }
4383 define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4384 ; CHECK-LABEL: test_mm512_fmaddsub_ps:
4385 ; CHECK: # %bb.0: # %entry
4386 ; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4387 ; CHECK-NEXT: ret{{[l|q]}}
4388 entry:
4389 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4390 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4391 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4392 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4393 ret <16 x float> %3
4394 }
4396 define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4397 ; X86-LABEL: test_mm512_mask_fmaddsub_ps:
4398 ; X86: # %bb.0: # %entry
4399 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4400 ; X86-NEXT: kmovw %eax, %k1
4401 ; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4402 ; X86-NEXT: retl
4403 ;
4404 ; X64-LABEL: test_mm512_mask_fmaddsub_ps:
4405 ; X64: # %bb.0: # %entry
4406 ; X64-NEXT: kmovw %edi, %k1
4407 ; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4408 ; X64-NEXT: retq
4409 entry:
4410 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4411 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4412 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4413 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4414 %4 = bitcast i16 %__U to <16 x i1>
4415 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
4416 ret <16 x float> %5
4417 }
4419 define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4420 ; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
4421 ; X86: # %bb.0: # %entry
4422 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4423 ; X86-NEXT: kmovw %eax, %k1
4424 ; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4425 ; X86-NEXT: vmovaps %zmm2, %zmm0
4426 ; X86-NEXT: retl
4427 ;
4428 ; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
4429 ; X64: # %bb.0: # %entry
4430 ; X64-NEXT: kmovw %edi, %k1
4431 ; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4432 ; X64-NEXT: vmovaps %zmm2, %zmm0
4433 ; X64-NEXT: retq
4434 entry:
4435 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4436 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4437 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4438 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4439 %4 = bitcast i16 %__U to <16 x i1>
4440 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
4441 ret <16 x float> %5
4442 }
4444 define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4445 ; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
4446 ; X86: # %bb.0: # %entry
4447 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4448 ; X86-NEXT: kmovw %eax, %k1
4449 ; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4450 ; X86-NEXT: retl
4451 ;
4452 ; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
4453 ; X64: # %bb.0: # %entry
4454 ; X64-NEXT: kmovw %edi, %k1
4455 ; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4456 ; X64-NEXT: retq
4457 entry:
4458 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4459 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4460 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4461 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4462 %4 = bitcast i16 %__U to <16 x i1>
4463 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
4464 ret <16 x float> %5
4465 }
4467 define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4468 ; CHECK-LABEL: test_mm512_fmsubadd_ps:
4469 ; CHECK: # %bb.0: # %entry
4470 ; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4471 ; CHECK-NEXT: ret{{[l|q]}}
4472 entry:
4473 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4474 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4475 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4476 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4477 ret <16 x float> %2
4478 }
4480 define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4481 ; X86-LABEL: test_mm512_mask_fmsubadd_ps:
4482 ; X86: # %bb.0: # %entry
4483 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4484 ; X86-NEXT: kmovw %eax, %k1
4485 ; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4486 ; X86-NEXT: retl
4487 ;
4488 ; X64-LABEL: test_mm512_mask_fmsubadd_ps:
4489 ; X64: # %bb.0: # %entry
4490 ; X64-NEXT: kmovw %edi, %k1
4491 ; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4492 ; X64-NEXT: retq
4493 entry:
4494 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4495 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4496 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4497 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4498 %3 = bitcast i16 %__U to <16 x i1>
4499 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
4500 ret <16 x float> %4
4501 }
4503 define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4504 ; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
4505 ; X86: # %bb.0: # %entry
4506 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4507 ; X86-NEXT: kmovw %eax, %k1
4508 ; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4509 ; X86-NEXT: retl
4510 ;
4511 ; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
4512 ; X64: # %bb.0: # %entry
4513 ; X64-NEXT: kmovw %edi, %k1
4514 ; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4515 ; X64-NEXT: retq
4516 entry:
4517 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4518 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4519 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4520 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4521 %3 = bitcast i16 %__U to <16 x i1>
4522 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
4523 ret <16 x float> %4
4524 }
4526 define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4527 ; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
4528 ; X86: # %bb.0: # %entry
4529 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4530 ; X86-NEXT: kmovw %eax, %k1
4531 ; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4532 ; X86-NEXT: vmovapd %zmm2, %zmm0
4533 ; X86-NEXT: retl
4534 ;
4535 ; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
4536 ; X64: # %bb.0: # %entry
4537 ; X64-NEXT: kmovw %edi, %k1
4538 ; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4539 ; X64-NEXT: vmovapd %zmm2, %zmm0
4540 ; X64-NEXT: retq
4541 entry:
4542 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4543 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4544 %1 = bitcast i8 %__U to <8 x i1>
4545 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4546 ret <8 x double> %2
4547 }
4549 define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4550 ; X86-LABEL: test_mm512_mask3_fmsub_pd:
4551 ; X86: # %bb.0: # %entry
4552 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4553 ; X86-NEXT: kmovw %eax, %k1
4554 ; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4555 ; X86-NEXT: vmovapd %zmm2, %zmm0
4556 ; X86-NEXT: retl
4557 ;
4558 ; X64-LABEL: test_mm512_mask3_fmsub_pd:
4559 ; X64: # %bb.0: # %entry
4560 ; X64-NEXT: kmovw %edi, %k1
4561 ; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4562 ; X64-NEXT: vmovapd %zmm2, %zmm0
4563 ; X64-NEXT: retq
4564 entry:
4565 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4566 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4567 %1 = bitcast i8 %__U to <8 x i1>
4568 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4569 ret <8 x double> %2
4570 }
4572 define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4573 ; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
4574 ; X86: # %bb.0: # %entry
4575 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4576 ; X86-NEXT: kmovw %eax, %k1
4577 ; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4578 ; X86-NEXT: vmovaps %zmm2, %zmm0
4579 ; X86-NEXT: retl
4580 ;
4581 ; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
4582 ; X64: # %bb.0: # %entry
4583 ; X64-NEXT: kmovw %edi, %k1
4584 ; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4585 ; X64-NEXT: vmovaps %zmm2, %zmm0
4586 ; X64-NEXT: retq
4587 entry:
4588 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4589 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4590 %1 = bitcast i16 %__U to <16 x i1>
4591 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4592 ret <16 x float> %2
4593 }
4595 define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4596 ; X86-LABEL: test_mm512_mask3_fmsub_ps:
4597 ; X86: # %bb.0: # %entry
4598 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4599 ; X86-NEXT: kmovw %eax, %k1
4600 ; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4601 ; X86-NEXT: vmovaps %zmm2, %zmm0
4602 ; X86-NEXT: retl
4603 ;
4604 ; X64-LABEL: test_mm512_mask3_fmsub_ps:
4605 ; X64: # %bb.0: # %entry
4606 ; X64-NEXT: kmovw %edi, %k1
4607 ; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4608 ; X64-NEXT: vmovaps %zmm2, %zmm0
4609 ; X64-NEXT: retq
4610 entry:
4611 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4612 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4613 %1 = bitcast i16 %__U to <16 x i1>
4614 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4615 ret <16 x float> %2
4616 }
4618 define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4619 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4620 ; X86: # %bb.0: # %entry
4621 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4622 ; X86-NEXT: kmovw %eax, %k1
4623 ; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4624 ; X86-NEXT: vmovapd %zmm2, %zmm0
4625 ; X86-NEXT: retl
4626 ;
4627 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4628 ; X64: # %bb.0: # %entry
4629 ; X64-NEXT: kmovw %edi, %k1
4630 ; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4631 ; X64-NEXT: vmovapd %zmm2, %zmm0
4632 ; X64-NEXT: retq
4633 entry:
4634 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4635 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4636 %1 = bitcast i8 %__U to <8 x i1>
4637 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4638 ret <8 x double> %2
4639 }
4641 define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4642 ; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
4643 ; X86: # %bb.0: # %entry
4644 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4645 ; X86-NEXT: kmovw %eax, %k1
4646 ; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4647 ; X86-NEXT: vmovapd %zmm2, %zmm0
4648 ; X86-NEXT: retl
4649 ;
4650 ; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
4651 ; X64: # %bb.0: # %entry
4652 ; X64-NEXT: kmovw %edi, %k1
4653 ; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4654 ; X64-NEXT: vmovapd %zmm2, %zmm0
4655 ; X64-NEXT: retq
4656 entry:
4657 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4658 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4659 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4660 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4661 %3 = bitcast i8 %__U to <8 x i1>
4662 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
4663 ret <8 x double> %4
4664 }
4666 define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4667 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4668 ; X86: # %bb.0: # %entry
4669 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4670 ; X86-NEXT: kmovw %eax, %k1
4671 ; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4672 ; X86-NEXT: vmovaps %zmm2, %zmm0
4673 ; X86-NEXT: retl
4674 ;
4675 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4676 ; X64: # %bb.0: # %entry
4677 ; X64-NEXT: kmovw %edi, %k1
4678 ; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4679 ; X64-NEXT: vmovaps %zmm2, %zmm0
4680 ; X64-NEXT: retq
4681 entry:
4682 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4683 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4684 %1 = bitcast i16 %__U to <16 x i1>
4685 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4686 ret <16 x float> %2
4687 }
4689 define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4690 ; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
4691 ; X86: # %bb.0: # %entry
4692 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4693 ; X86-NEXT: kmovw %eax, %k1
4694 ; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4695 ; X86-NEXT: vmovaps %zmm2, %zmm0
4696 ; X86-NEXT: retl
4697 ;
4698 ; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
4699 ; X64: # %bb.0: # %entry
4700 ; X64-NEXT: kmovw %edi, %k1
4701 ; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4702 ; X64-NEXT: vmovaps %zmm2, %zmm0
4703 ; X64-NEXT: retq
4704 entry:
4705 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4706 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4707 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4708 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4709 %3 = bitcast i16 %__U to <16 x i1>
4710 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
4711 ret <16 x float> %4
4712 }
4714 define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4715 ; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
4716 ; X86: # %bb.0: # %entry
4717 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4718 ; X86-NEXT: kmovw %eax, %k1
4719 ; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4720 ; X86-NEXT: retl
4721 ;
4722 ; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
4723 ; X64: # %bb.0: # %entry
4724 ; X64-NEXT: kmovw %edi, %k1
4725 ; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4726 ; X64-NEXT: retq
4727 entry:
4728 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4729 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
4730 %1 = bitcast i8 %__U to <8 x i1>
4731 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4732 ret <8 x double> %2
4733 }
4735 define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4736 ; X86-LABEL: test_mm512_mask_fnmadd_pd:
4737 ; X86: # %bb.0: # %entry
4738 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4739 ; X86-NEXT: kmovw %eax, %k1
4740 ; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4741 ; X86-NEXT: retl
4742 ;
4743 ; X64-LABEL: test_mm512_mask_fnmadd_pd:
4744 ; X64: # %bb.0: # %entry
4745 ; X64-NEXT: kmovw %edi, %k1
4746 ; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4747 ; X64-NEXT: retq
4748 entry:
4749 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4750 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
4751 %1 = bitcast i8 %__U to <8 x i1>
4752 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4753 ret <8 x double> %2
4754 }
4756 define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4757 ; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
4758 ; X86: # %bb.0: # %entry
4759 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4760 ; X86-NEXT: kmovw %eax, %k1
4761 ; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4762 ; X86-NEXT: retl
4763 ;
4764 ; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
4765 ; X64: # %bb.0: # %entry
4766 ; X64-NEXT: kmovw %edi, %k1
4767 ; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4768 ; X64-NEXT: retq
4769 entry:
4770 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4771 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
4772 %1 = bitcast i16 %__U to <16 x i1>
4773 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4774 ret <16 x float> %2
4775 }
4777 define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4778 ; X86-LABEL: test_mm512_mask_fnmadd_ps:
4779 ; X86: # %bb.0: # %entry
4780 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4781 ; X86-NEXT: kmovw %eax, %k1
4782 ; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4783 ; X86-NEXT: retl
4784 ;
4785 ; X64-LABEL: test_mm512_mask_fnmadd_ps:
4786 ; X64: # %bb.0: # %entry
4787 ; X64-NEXT: kmovw %edi, %k1
4788 ; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4789 ; X64-NEXT: retq
4790 entry:
4791 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4792 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
4793 %1 = bitcast i16 %__U to <16 x i1>
4794 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4795 ret <16 x float> %2
4796 }
4798 define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4799 ; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
4800 ; X86: # %bb.0: # %entry
4801 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4802 ; X86-NEXT: kmovw %eax, %k1
4803 ; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4804 ; X86-NEXT: retl
4805 ;
4806 ; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
4807 ; X64: # %bb.0: # %entry
4808 ; X64-NEXT: kmovw %edi, %k1
4809 ; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4810 ; X64-NEXT: retq
4811 entry:
4812 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4813 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4814 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4815 %1 = bitcast i8 %__U to <8 x i1>
4816 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4817 ret <8 x double> %2
4818 }
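; NOTE: Editorial sketch, not a test and not autogenerated: the fnmsub forms
; negate both the second multiplicand and the addend via fsub from -0.0
; before the FMA intrinsic (shorthand, not verbatim IR):
;   %nb = fsub <-0.0, ...>, %__B
;   %nc = fsub <-0.0, ...>, %__C
;   %r  = fma(%__A, %nb, %nc)   ; computes -(A*B) - C
; and both negations are expected to fold into a single vfnmsub instruction,
; while fnmadd negates only one multiplicand, giving -(A*B) + C.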
4820 define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4821 ; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
4822 ; X86: # %bb.0: # %entry
4823 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4824 ; X86-NEXT: kmovw %eax, %k1
4825 ; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4826 ; X86-NEXT: vmovapd %zmm2, %zmm0
4827 ; X86-NEXT: retl
4828 ;
4829 ; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
4830 ; X64: # %bb.0: # %entry
4831 ; X64-NEXT: kmovw %edi, %k1
4832 ; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4833 ; X64-NEXT: vmovapd %zmm2, %zmm0
4834 ; X64-NEXT: retq
4835 entry:
4836 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4837 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4838 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4839 %1 = bitcast i8 %__U to <8 x i1>
4840 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4841 ret <8 x double> %2
4842 }
4844 define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4845 ; X86-LABEL: test_mm512_mask_fnmsub_pd:
4846 ; X86: # %bb.0: # %entry
4847 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4848 ; X86-NEXT: kmovw %eax, %k1
4849 ; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4850 ; X86-NEXT: retl
4851 ;
4852 ; X64-LABEL: test_mm512_mask_fnmsub_pd:
4853 ; X64: # %bb.0: # %entry
4854 ; X64-NEXT: kmovw %edi, %k1
4855 ; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4856 ; X64-NEXT: retq
4857 entry:
4858 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4859 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4860 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4861 %1 = bitcast i8 %__U to <8 x i1>
4862 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4863 ret <8 x double> %2
4864 }
4866 define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4867 ; X86-LABEL: test_mm512_mask3_fnmsub_pd:
4868 ; X86: # %bb.0: # %entry
4869 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4870 ; X86-NEXT: kmovw %eax, %k1
4871 ; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4872 ; X86-NEXT: vmovapd %zmm2, %zmm0
4873 ; X86-NEXT: retl
4874 ;
4875 ; X64-LABEL: test_mm512_mask3_fnmsub_pd:
4876 ; X64: # %bb.0: # %entry
4877 ; X64-NEXT: kmovw %edi, %k1
4878 ; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4879 ; X64-NEXT: vmovapd %zmm2, %zmm0
4880 ; X64-NEXT: retq
4881 entry:
4882 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4883 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4884 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4885 %1 = bitcast i8 %__U to <8 x i1>
4886 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4887 ret <8 x double> %2
4888 }
4890 define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4891 ; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
4892 ; X86: # %bb.0: # %entry
4893 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4894 ; X86-NEXT: kmovw %eax, %k1
4895 ; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4896 ; X86-NEXT: retl
4897 ;
4898 ; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
4899 ; X64: # %bb.0: # %entry
4900 ; X64-NEXT: kmovw %edi, %k1
4901 ; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4902 ; X64-NEXT: retq
4903 entry:
4904 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4905 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4906 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4907 %1 = bitcast i16 %__U to <16 x i1>
4908 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4909 ret <16 x float> %2
4910 }
4912 define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4913 ; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
4914 ; X86: # %bb.0: # %entry
4915 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4916 ; X86-NEXT: kmovw %eax, %k1
4917 ; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4918 ; X86-NEXT: vmovaps %zmm2, %zmm0
4919 ; X86-NEXT: retl
4920 ;
4921 ; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
4922 ; X64: # %bb.0: # %entry
4923 ; X64-NEXT: kmovw %edi, %k1
4924 ; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4925 ; X64-NEXT: vmovaps %zmm2, %zmm0
4926 ; X64-NEXT: retq
4927 entry:
4928 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4929 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4930 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4931 %1 = bitcast i16 %__U to <16 x i1>
4932 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4933 ret <16 x float> %2
4934 }
4936 define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4937 ; X86-LABEL: test_mm512_mask_fnmsub_ps:
4938 ; X86: # %bb.0: # %entry
4939 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4940 ; X86-NEXT: kmovw %eax, %k1
4941 ; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4942 ; X86-NEXT: retl
4943 ;
4944 ; X64-LABEL: test_mm512_mask_fnmsub_ps:
4945 ; X64: # %bb.0: # %entry
4946 ; X64-NEXT: kmovw %edi, %k1
4947 ; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4948 ; X64-NEXT: retq
4949 entry:
4950 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4951 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4952 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4953 %1 = bitcast i16 %__U to <16 x i1>
4954 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4955 ret <16 x float> %2
4956 }
4958 define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4959 ; X86-LABEL: test_mm512_mask3_fnmsub_ps:
4960 ; X86: # %bb.0: # %entry
4961 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4962 ; X86-NEXT: kmovw %eax, %k1
4963 ; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4964 ; X86-NEXT: vmovaps %zmm2, %zmm0
4965 ; X86-NEXT: retl
4966 ;
4967 ; X64-LABEL: test_mm512_mask3_fnmsub_ps:
4968 ; X64: # %bb.0: # %entry
4969 ; X64-NEXT: kmovw %edi, %k1
4970 ; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4971 ; X64-NEXT: vmovaps %zmm2, %zmm0
4972 ; X64-NEXT: retq
4973 entry:
4974 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4975 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4976 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4977 %1 = bitcast i16 %__U to <16 x i1>
4978 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4979 ret <16 x float> %2
4980 }
4982 define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
4983 ; X86-LABEL: test_mm_mask_fmadd_ss:
4984 ; X86: # %bb.0: # %entry
4985 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4986 ; X86-NEXT: kmovw %eax, %k1
4987 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
4988 ; X86-NEXT: retl
4989 ;
4990 ; X64-LABEL: test_mm_mask_fmadd_ss:
4991 ; X64: # %bb.0: # %entry
4992 ; X64-NEXT: kmovw %edi, %k1
4993 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
4994 ; X64-NEXT: retq
4995 entry:
4996 %0 = extractelement <4 x float> %__W, i64 0
4997 %1 = extractelement <4 x float> %__A, i64 0
4998 %2 = extractelement <4 x float> %__B, i64 0
4999 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5000 %4 = and i8 %__U, 1
5001 %tobool.i = icmp eq i8 %4, 0
5002 %vecext1.i = extractelement <4 x float> %__W, i32 0
5003 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5004 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5005 ret <4 x float> %vecins.i
5006 }
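; NOTE: Editorial sketch, not a test and not autogenerated: the scalar _ss
; tests in this stretch work on lane 0 only -- extractelement feeds
; @llvm.fma.f32, the low mask bit is isolated with "and i8 %__U, 1" and
; tested against zero, and either the FMA result or the passthrough lane is
; inserted back (illustrative names, not verbatim from this file):
;   %bit  = and i8 %__U, 1
;   %z    = icmp eq i8 %bit, 0
;   %lane = select i1 %z, float %passthru, float %fma_result
;   %out  = insertelement <4 x float> %__W, float %lane, i32 0
; Lanes 1-3 of the destination vector pass through unchanged.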
5008 define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5009 ; X86-LABEL: test_mm_mask_fmadd_round_ss:
5010 ; X86: # %bb.0: # %entry
5011 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5012 ; X86-NEXT: kmovw %eax, %k1
5013 ; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5014 ; X86-NEXT: retl
5015 ;
5016 ; X64-LABEL: test_mm_mask_fmadd_round_ss:
5017 ; X64: # %bb.0: # %entry
5018 ; X64-NEXT: kmovw %edi, %k1
5019 ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5020 ; X64-NEXT: retq
5021 entry:
5022 %0 = extractelement <4 x float> %__W, i64 0
5023 %1 = extractelement <4 x float> %__A, i64 0
5024 %2 = extractelement <4 x float> %__B, i64 0
5025 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5026 %4 = bitcast i8 %__U to <8 x i1>
5027 %5 = extractelement <8 x i1> %4, i64 0
5028 %6 = select i1 %5, float %3, float %0
5029 %7 = insertelement <4 x float> %__W, float %6, i64 0
5030 ret <4 x float> %7
5031 }
5033 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1
5035 define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5036 ; X86-LABEL: test_mm_maskz_fmadd_ss:
5037 ; X86: # %bb.0: # %entry
5038 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5039 ; X86-NEXT: kmovw %eax, %k1
5040 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5041 ; X86-NEXT: retl
5042 ;
5043 ; X64-LABEL: test_mm_maskz_fmadd_ss:
5044 ; X64: # %bb.0: # %entry
5045 ; X64-NEXT: kmovw %edi, %k1
5046 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5047 ; X64-NEXT: retq
5048 entry:
5049 %0 = extractelement <4 x float> %__A, i64 0
5050 %1 = extractelement <4 x float> %__B, i64 0
5051 %2 = extractelement <4 x float> %__C, i64 0
5052 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5053 %4 = and i8 %__U, 1
5054 %tobool.i = icmp eq i8 %4, 0
5055 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5056 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5057 ret <4 x float> %vecins.i
5058 }
5060 define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5061 ; X86-LABEL: test_mm_maskz_fmadd_round_ss:
5062 ; X86: # %bb.0: # %entry
5063 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5064 ; X86-NEXT: kmovw %eax, %k1
5065 ; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5066 ; X86-NEXT: retl
5067 ;
5068 ; X64-LABEL: test_mm_maskz_fmadd_round_ss:
5069 ; X64: # %bb.0: # %entry
5070 ; X64-NEXT: kmovw %edi, %k1
5071 ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5072 ; X64-NEXT: retq
5073 entry:
5074 %0 = extractelement <4 x float> %__A, i64 0
5075 %1 = extractelement <4 x float> %__B, i64 0
5076 %2 = extractelement <4 x float> %__C, i64 0
5077 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5078 %4 = bitcast i8 %__U to <8 x i1>
5079 %5 = extractelement <8 x i1> %4, i64 0
5080 %6 = select i1 %5, float %3, float 0.000000e+00
5081 %7 = insertelement <4 x float> %__A, float %6, i64 0
5082 ret <4 x float> %7
5083 }
5085 define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5086 ; X86-LABEL: test_mm_mask3_fmadd_ss:
5087 ; X86: # %bb.0: # %entry
5088 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5089 ; X86-NEXT: kmovw %eax, %k1
5090 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5091 ; X86-NEXT: vmovaps %xmm2, %xmm0
5092 ; X86-NEXT: retl
5093 ;
5094 ; X64-LABEL: test_mm_mask3_fmadd_ss:
5095 ; X64: # %bb.0: # %entry
5096 ; X64-NEXT: kmovw %edi, %k1
5097 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5098 ; X64-NEXT: vmovaps %xmm2, %xmm0
5099 ; X64-NEXT: retq
5100 entry:
5101 %0 = extractelement <4 x float> %__W, i64 0
5102 %1 = extractelement <4 x float> %__X, i64 0
5103 %2 = extractelement <4 x float> %__Y, i64 0
5104 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5105 %4 = and i8 %__U, 1
5106 %tobool.i = icmp eq i8 %4, 0
5107 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5108 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5109 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5110 ret <4 x float> %vecins.i
5111 }
5113 define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5114 ; X86-LABEL: test_mm_mask3_fmadd_round_ss:
5115 ; X86: # %bb.0: # %entry
5116 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5117 ; X86-NEXT: kmovw %eax, %k1
5118 ; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5119 ; X86-NEXT: vmovaps %xmm2, %xmm0
5120 ; X86-NEXT: retl
5121 ;
5122 ; X64-LABEL: test_mm_mask3_fmadd_round_ss:
5123 ; X64: # %bb.0: # %entry
5124 ; X64-NEXT: kmovw %edi, %k1
5125 ; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5126 ; X64-NEXT: vmovaps %xmm2, %xmm0
5127 ; X64-NEXT: retq
5128 entry:
5129 %0 = extractelement <4 x float> %__W, i64 0
5130 %1 = extractelement <4 x float> %__X, i64 0
5131 %2 = extractelement <4 x float> %__Y, i64 0
5132 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5133 %4 = bitcast i8 %__U to <8 x i1>
5134 %5 = extractelement <8 x i1> %4, i64 0
5135 %6 = select i1 %5, float %3, float %2
5136 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5137 ret <4 x float> %7
5138 }
5140 define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5141 ; X86-LABEL: test_mm_mask_fmsub_ss:
5142 ; X86: # %bb.0: # %entry
5143 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5144 ; X86-NEXT: kmovw %eax, %k1
5145 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5146 ; X86-NEXT: retl
5147 ;
5148 ; X64-LABEL: test_mm_mask_fmsub_ss:
5149 ; X64: # %bb.0: # %entry
5150 ; X64-NEXT: kmovw %edi, %k1
5151 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5152 ; X64-NEXT: retq
5153 entry:
5154 %0 = extractelement <4 x float> %__W, i64 0
5155 %1 = extractelement <4 x float> %__A, i64 0
5156 %.rhs.i = extractelement <4 x float> %__B, i64 0
5157 %2 = fsub float -0.000000e+00, %.rhs.i
5158 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5159 %4 = and i8 %__U, 1
5160 %tobool.i = icmp eq i8 %4, 0
5161 %vecext1.i = extractelement <4 x float> %__W, i32 0
5162 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5163 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5164 ret <4 x float> %vecins.i
5165 }
5167 define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5168 ; X86-LABEL: test_mm_mask_fmsub_round_ss:
5169 ; X86: # %bb.0: # %entry
5170 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5171 ; X86-NEXT: kmovw %eax, %k1
5172 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5173 ; X86-NEXT: retl
5174 ;
5175 ; X64-LABEL: test_mm_mask_fmsub_round_ss:
5176 ; X64: # %bb.0: # %entry
5177 ; X64-NEXT: kmovw %edi, %k1
5178 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5179 ; X64-NEXT: retq
5180 entry:
5181 %0 = extractelement <4 x float> %__W, i64 0
5182 %1 = extractelement <4 x float> %__A, i64 0
5183 %.rhs = extractelement <4 x float> %__B, i64 0
5184 %2 = fsub float -0.000000e+00, %.rhs
5185 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5186 %4 = bitcast i8 %__U to <8 x i1>
5187 %5 = extractelement <8 x i1> %4, i64 0
5188 %6 = select i1 %5, float %3, float %0
5189 %7 = insertelement <4 x float> %__W, float %6, i64 0
5190 ret <4 x float> %7
5191 }
5193 define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5194 ; X86-LABEL: test_mm_maskz_fmsub_ss:
5195 ; X86: # %bb.0: # %entry
5196 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5197 ; X86-NEXT: kmovw %eax, %k1
5198 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5199 ; X86-NEXT: retl
5200 ;
5201 ; X64-LABEL: test_mm_maskz_fmsub_ss:
5202 ; X64: # %bb.0: # %entry
5203 ; X64-NEXT: kmovw %edi, %k1
5204 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5205 ; X64-NEXT: retq
5206 entry:
5207 %0 = extractelement <4 x float> %__A, i64 0
5208 %1 = extractelement <4 x float> %__B, i64 0
5209 %.rhs.i = extractelement <4 x float> %__C, i64 0
5210 %2 = fsub float -0.000000e+00, %.rhs.i
5211 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5212 %4 = and i8 %__U, 1
5213 %tobool.i = icmp eq i8 %4, 0
5214 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5215 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5216 ret <4 x float> %vecins.i
5217 }
5219 define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5220 ; X86-LABEL: test_mm_maskz_fmsub_round_ss:
5221 ; X86: # %bb.0: # %entry
5222 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5223 ; X86-NEXT: kmovw %eax, %k1
5224 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5225 ; X86-NEXT: retl
5226 ;
5227 ; X64-LABEL: test_mm_maskz_fmsub_round_ss:
5228 ; X64: # %bb.0: # %entry
5229 ; X64-NEXT: kmovw %edi, %k1
5230 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5231 ; X64-NEXT: retq
5232 entry:
5233 %0 = extractelement <4 x float> %__A, i64 0
5234 %1 = extractelement <4 x float> %__B, i64 0
5235 %.rhs = extractelement <4 x float> %__C, i64 0
5236 %2 = fsub float -0.000000e+00, %.rhs
5237 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5238 %4 = bitcast i8 %__U to <8 x i1>
5239 %5 = extractelement <8 x i1> %4, i64 0
5240 %6 = select i1 %5, float %3, float 0.000000e+00
5241 %7 = insertelement <4 x float> %__A, float %6, i64 0
5242 ret <4 x float> %7
5243 }
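; The mask3 variants merge the result into the addend vector (%__Y), which is
; why codegen picks the 231 form (destination = addend register) and then
; copies xmm2 back into xmm0 for the return value.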
5245 define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5246 ; X86-LABEL: test_mm_mask3_fmsub_ss:
5247 ; X86: # %bb.0: # %entry
5248 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5249 ; X86-NEXT: kmovw %eax, %k1
5250 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5251 ; X86-NEXT: vmovaps %xmm2, %xmm0
5252 ; X86-NEXT: retl
5253 ;
5254 ; X64-LABEL: test_mm_mask3_fmsub_ss:
5255 ; X64: # %bb.0: # %entry
5256 ; X64-NEXT: kmovw %edi, %k1
5257 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5258 ; X64-NEXT: vmovaps %xmm2, %xmm0
5259 ; X64-NEXT: retq
5260 entry:
5261 %0 = extractelement <4 x float> %__W, i64 0
5262 %1 = extractelement <4 x float> %__X, i64 0
5263 %.rhs.i = extractelement <4 x float> %__Y, i64 0
5264 %2 = fsub float -0.000000e+00, %.rhs.i
5265 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5266 %4 = and i8 %__U, 1
5267 %tobool.i = icmp eq i8 %4, 0
5268 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5269 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5270 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5271 ret <4 x float> %vecins.i
5272 }
5274 define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5275 ; X86-LABEL: test_mm_mask3_fmsub_round_ss:
5276 ; X86: # %bb.0: # %entry
5277 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5278 ; X86-NEXT: kmovw %eax, %k1
5279 ; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5280 ; X86-NEXT: vmovaps %xmm2, %xmm0
5281 ; X86-NEXT: retl
5282 ;
5283 ; X64-LABEL: test_mm_mask3_fmsub_round_ss:
5284 ; X64: # %bb.0: # %entry
5285 ; X64-NEXT: kmovw %edi, %k1
5286 ; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5287 ; X64-NEXT: vmovaps %xmm2, %xmm0
5288 ; X64-NEXT: retq
5289 entry:
5290 %0 = extractelement <4 x float> %__W, i64 0
5291 %1 = extractelement <4 x float> %__X, i64 0
5292 %.rhs = extractelement <4 x float> %__Y, i64 0
5293 %2 = fsub float -0.000000e+00, %.rhs
5294 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5295 %4 = bitcast i8 %__U to <8 x i1>
5296 %5 = extractelement <8 x i1> %4, i64 0
5297 %6 = select i1 %5, float %3, float %.rhs
5298 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5299 ret <4 x float> %7
5300 }
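; fnmadd computes -(a * b) + c: one multiplicand is negated in IR and the
; negation is folded into the vfnmadd encodings.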
5302 define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5303 ; X86-LABEL: test_mm_mask_fnmadd_ss:
5304 ; X86: # %bb.0: # %entry
5305 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5306 ; X86-NEXT: kmovw %eax, %k1
5307 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5308 ; X86-NEXT: retl
5309 ;
5310 ; X64-LABEL: test_mm_mask_fnmadd_ss:
5311 ; X64: # %bb.0: # %entry
5312 ; X64-NEXT: kmovw %edi, %k1
5313 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5314 ; X64-NEXT: retq
5315 entry:
5316 %0 = extractelement <4 x float> %__W, i64 0
5317 %.rhs.i = extractelement <4 x float> %__A, i64 0
5318 %1 = fsub float -0.000000e+00, %.rhs.i
5319 %2 = extractelement <4 x float> %__B, i64 0
5320 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5321 %4 = and i8 %__U, 1
5322 %tobool.i = icmp eq i8 %4, 0
5323 %vecext1.i = extractelement <4 x float> %__W, i32 0
5324 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5325 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5326 ret <4 x float> %vecins.i
5327 }
5329 define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5330 ; X86-LABEL: test_mm_mask_fnmadd_round_ss:
5331 ; X86: # %bb.0: # %entry
5332 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5333 ; X86-NEXT: kmovw %eax, %k1
5334 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5335 ; X86-NEXT: retl
5336 ;
5337 ; X64-LABEL: test_mm_mask_fnmadd_round_ss:
5338 ; X64: # %bb.0: # %entry
5339 ; X64-NEXT: kmovw %edi, %k1
5340 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5341 ; X64-NEXT: retq
5342 entry:
5343 %0 = extractelement <4 x float> %__W, i64 0
5344 %.rhs = extractelement <4 x float> %__A, i64 0
5345 %1 = fsub float -0.000000e+00, %.rhs
5346 %2 = extractelement <4 x float> %__B, i64 0
5347 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5348 %4 = bitcast i8 %__U to <8 x i1>
5349 %5 = extractelement <8 x i1> %4, i64 0
5350 %6 = select i1 %5, float %3, float %0
5351 %7 = insertelement <4 x float> %__W, float %6, i64 0
5352 ret <4 x float> %7
5353 }
5355 define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5356 ; X86-LABEL: test_mm_maskz_fnmadd_ss:
5357 ; X86: # %bb.0: # %entry
5358 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5359 ; X86-NEXT: kmovw %eax, %k1
5360 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
5361 ; X86-NEXT: retl
5362 ;
5363 ; X64-LABEL: test_mm_maskz_fnmadd_ss:
5364 ; X64: # %bb.0: # %entry
5365 ; X64-NEXT: kmovw %edi, %k1
5366 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
5367 ; X64-NEXT: retq
5368 entry:
5369 %0 = extractelement <4 x float> %__A, i64 0
5370 %.rhs.i = extractelement <4 x float> %__B, i64 0
5371 %1 = fsub float -0.000000e+00, %.rhs.i
5372 %2 = extractelement <4 x float> %__C, i64 0
5373 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5374 %4 = and i8 %__U, 1
5375 %tobool.i = icmp eq i8 %4, 0
5376 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5377 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5378 ret <4 x float> %vecins.i
5379 }
5381 define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5382 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
5383 ; X86: # %bb.0: # %entry
5384 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5385 ; X86-NEXT: kmovw %eax, %k1
5386 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5387 ; X86-NEXT: retl
5388 ;
5389 ; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
5390 ; X64: # %bb.0: # %entry
5391 ; X64-NEXT: kmovw %edi, %k1
5392 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5393 ; X64-NEXT: retq
5394 entry:
5395 %0 = extractelement <4 x float> %__A, i64 0
5396 %.rhs = extractelement <4 x float> %__B, i64 0
5397 %1 = fsub float -0.000000e+00, %.rhs
5398 %2 = extractelement <4 x float> %__C, i64 0
5399 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5400 %4 = bitcast i8 %__U to <8 x i1>
5401 %5 = extractelement <8 x i1> %4, i64 0
5402 %6 = select i1 %5, float %3, float 0.000000e+00
5403 %7 = insertelement <4 x float> %__A, float %6, i64 0
5404 ret <4 x float> %7
5405 }
5407 define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5408 ; X86-LABEL: test_mm_mask3_fnmadd_ss:
5409 ; X86: # %bb.0: # %entry
5410 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5411 ; X86-NEXT: kmovw %eax, %k1
5412 ; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
5413 ; X86-NEXT: vmovaps %xmm2, %xmm0
5414 ; X86-NEXT: retl
5415 ;
5416 ; X64-LABEL: test_mm_mask3_fnmadd_ss:
5417 ; X64: # %bb.0: # %entry
5418 ; X64-NEXT: kmovw %edi, %k1
5419 ; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
5420 ; X64-NEXT: vmovaps %xmm2, %xmm0
5421 ; X64-NEXT: retq
5422 entry:
5423 %0 = extractelement <4 x float> %__W, i64 0
5424 %.rhs.i = extractelement <4 x float> %__X, i64 0
5425 %1 = fsub float -0.000000e+00, %.rhs.i
5426 %2 = extractelement <4 x float> %__Y, i64 0
5427 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5428 %4 = and i8 %__U, 1
5429 %tobool.i = icmp eq i8 %4, 0
5430 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5431 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5432 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5433 ret <4 x float> %vecins.i
5434 }
5436 define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5437 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
5438 ; X86: # %bb.0: # %entry
5439 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5440 ; X86-NEXT: kmovw %eax, %k1
5441 ; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5442 ; X86-NEXT: vmovaps %xmm2, %xmm0
5443 ; X86-NEXT: retl
5444 ;
5445 ; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
5446 ; X64: # %bb.0: # %entry
5447 ; X64-NEXT: kmovw %edi, %k1
5448 ; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5449 ; X64-NEXT: vmovaps %xmm2, %xmm0
5450 ; X64-NEXT: retq
5451 entry:
5452 %0 = extractelement <4 x float> %__W, i64 0
5453 %.rhs = extractelement <4 x float> %__X, i64 0
5454 %1 = fsub float -0.000000e+00, %.rhs
5455 %2 = extractelement <4 x float> %__Y, i64 0
5456 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5457 %4 = bitcast i8 %__U to <8 x i1>
5458 %5 = extractelement <8 x i1> %4, i64 0
5459 %6 = select i1 %5, float %3, float %2
5460 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5461 ret <4 x float> %7
5462 }
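; fnmsub computes -(a * b) - c: both a multiplicand and the addend are
; negated in IR, lowering to the vfnmsub encodings.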
5464 define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5465 ; X86-LABEL: test_mm_mask_fnmsub_ss:
5466 ; X86: # %bb.0: # %entry
5467 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5468 ; X86-NEXT: kmovw %eax, %k1
5469 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
5470 ; X86-NEXT: retl
5471 ;
5472 ; X64-LABEL: test_mm_mask_fnmsub_ss:
5473 ; X64: # %bb.0: # %entry
5474 ; X64-NEXT: kmovw %edi, %k1
5475 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
5476 ; X64-NEXT: retq
5477 entry:
5478 %0 = extractelement <4 x float> %__W, i64 0
5479 %.rhs.i = extractelement <4 x float> %__A, i64 0
5480 %1 = fsub float -0.000000e+00, %.rhs.i
5481 %.rhs7.i = extractelement <4 x float> %__B, i64 0
5482 %2 = fsub float -0.000000e+00, %.rhs7.i
5483 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5484 %4 = and i8 %__U, 1
5485 %tobool.i = icmp eq i8 %4, 0
5486 %vecext2.i = extractelement <4 x float> %__W, i32 0
5487 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5488 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5489 ret <4 x float> %vecins.i
5490 }
5492 define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5493 ; X86-LABEL: test_mm_mask_fnmsub_round_ss:
5494 ; X86: # %bb.0: # %entry
5495 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5496 ; X86-NEXT: kmovw %eax, %k1
5497 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5498 ; X86-NEXT: retl
5499 ;
5500 ; X64-LABEL: test_mm_mask_fnmsub_round_ss:
5501 ; X64: # %bb.0: # %entry
5502 ; X64-NEXT: kmovw %edi, %k1
5503 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5504 ; X64-NEXT: retq
5505 entry:
5506 %0 = extractelement <4 x float> %__W, i64 0
5507 %.rhs = extractelement <4 x float> %__A, i64 0
5508 %1 = fsub float -0.000000e+00, %.rhs
5509 %.rhs2 = extractelement <4 x float> %__B, i64 0
5510 %2 = fsub float -0.000000e+00, %.rhs2
5511 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5512 %4 = bitcast i8 %__U to <8 x i1>
5513 %5 = extractelement <8 x i1> %4, i64 0
5514 %6 = select i1 %5, float %3, float %0
5515 %7 = insertelement <4 x float> %__W, float %6, i64 0
5516 ret <4 x float> %7
5517 }
5519 define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5520 ; X86-LABEL: test_mm_maskz_fnmsub_ss:
5521 ; X86: # %bb.0: # %entry
5522 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5523 ; X86-NEXT: kmovw %eax, %k1
5524 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
5525 ; X86-NEXT: retl
5526 ;
5527 ; X64-LABEL: test_mm_maskz_fnmsub_ss:
5528 ; X64: # %bb.0: # %entry
5529 ; X64-NEXT: kmovw %edi, %k1
5530 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
5531 ; X64-NEXT: retq
5532 entry:
5533 %0 = extractelement <4 x float> %__A, i64 0
5534 %.rhs.i = extractelement <4 x float> %__B, i64 0
5535 %1 = fsub float -0.000000e+00, %.rhs.i
5536 %.rhs5.i = extractelement <4 x float> %__C, i64 0
5537 %2 = fsub float -0.000000e+00, %.rhs5.i
5538 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5539 %4 = and i8 %__U, 1
5540 %tobool.i = icmp eq i8 %4, 0
5541 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5542 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5543 ret <4 x float> %vecins.i
5544 }
5546 define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5547 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
5548 ; X86: # %bb.0: # %entry
5549 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5550 ; X86-NEXT: kmovw %eax, %k1
5551 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5552 ; X86-NEXT: retl
5553 ;
5554 ; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
5555 ; X64: # %bb.0: # %entry
5556 ; X64-NEXT: kmovw %edi, %k1
5557 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5558 ; X64-NEXT: retq
5559 entry:
5560 %0 = extractelement <4 x float> %__A, i64 0
5561 %.rhs = extractelement <4 x float> %__B, i64 0
5562 %1 = fsub float -0.000000e+00, %.rhs
5563 %.rhs2 = extractelement <4 x float> %__C, i64 0
5564 %2 = fsub float -0.000000e+00, %.rhs2
5565 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5566 %4 = bitcast i8 %__U to <8 x i1>
5567 %5 = extractelement <8 x i1> %4, i64 0
5568 %6 = select i1 %5, float %3, float 0.000000e+00
5569 %7 = insertelement <4 x float> %__A, float %6, i64 0
5570 ret <4 x float> %7
5571 }
5573 define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5574 ; X86-LABEL: test_mm_mask3_fnmsub_ss:
5575 ; X86: # %bb.0: # %entry
5576 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5577 ; X86-NEXT: kmovw %eax, %k1
5578 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5579 ; X86-NEXT: vmovaps %xmm2, %xmm0
5580 ; X86-NEXT: retl
5581 ;
5582 ; X64-LABEL: test_mm_mask3_fnmsub_ss:
5583 ; X64: # %bb.0: # %entry
5584 ; X64-NEXT: kmovw %edi, %k1
5585 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5586 ; X64-NEXT: vmovaps %xmm2, %xmm0
5587 ; X64-NEXT: retq
5588 entry:
5589 %0 = extractelement <4 x float> %__W, i64 0
5590 %.rhs.i = extractelement <4 x float> %__X, i64 0
5591 %1 = fsub float -0.000000e+00, %.rhs.i
5592 %.rhs7.i = extractelement <4 x float> %__Y, i64 0
5593 %2 = fsub float -0.000000e+00, %.rhs7.i
5594 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5595 %4 = and i8 %__U, 1
5596 %tobool.i = icmp eq i8 %4, 0
5597 %vecext2.i = extractelement <4 x float> %__Y, i32 0
5598 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5599 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5600 ret <4 x float> %vecins.i
5601 }
5603 define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5604 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
5605 ; X86: # %bb.0: # %entry
5606 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5607 ; X86-NEXT: kmovw %eax, %k1
5608 ; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5609 ; X86-NEXT: vmovaps %xmm2, %xmm0
5610 ; X86-NEXT: retl
5611 ;
5612 ; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
5613 ; X64: # %bb.0: # %entry
5614 ; X64-NEXT: kmovw %edi, %k1
5615 ; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5616 ; X64-NEXT: vmovaps %xmm2, %xmm0
5617 ; X64-NEXT: retq
5618 entry:
5619 %0 = extractelement <4 x float> %__W, i64 0
5620 %.rhs = extractelement <4 x float> %__X, i64 0
5621 %1 = fsub float -0.000000e+00, %.rhs
5622 %.rhs1 = extractelement <4 x float> %__Y, i64 0
5623 %2 = fsub float -0.000000e+00, %.rhs1
5624 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5625 %4 = bitcast i8 %__U to <8 x i1>
5626 %5 = extractelement <8 x i1> %4, i64 0
5627 %6 = select i1 %5, float %3, float %.rhs1
5628 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5629 ret <4 x float> %7
5630 }
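; The <2 x double> tests below mirror the ss variants lane-for-lane, using
; llvm.fma.f64 and llvm.x86.avx512.vfmadd.f64 with the sd instruction forms.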
5632 define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5633 ; X86-LABEL: test_mm_mask_fmadd_sd:
5634 ; X86: # %bb.0: # %entry
5635 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5636 ; X86-NEXT: kmovw %eax, %k1
5637 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
5638 ; X86-NEXT: retl
5639 ;
5640 ; X64-LABEL: test_mm_mask_fmadd_sd:
5641 ; X64: # %bb.0: # %entry
5642 ; X64-NEXT: kmovw %edi, %k1
5643 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
5644 ; X64-NEXT: retq
5645 entry:
5646 %0 = extractelement <2 x double> %__W, i64 0
5647 %1 = extractelement <2 x double> %__A, i64 0
5648 %2 = extractelement <2 x double> %__B, i64 0
5649 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5650 %4 = and i8 %__U, 1
5651 %tobool.i = icmp eq i8 %4, 0
5652 %vecext1.i = extractelement <2 x double> %__W, i32 0
5653 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5654 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5655 ret <2 x double> %vecins.i
5656 }
5658 define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5659 ; X86-LABEL: test_mm_mask_fmadd_round_sd:
5660 ; X86: # %bb.0: # %entry
5661 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5662 ; X86-NEXT: kmovw %eax, %k1
5663 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5664 ; X86-NEXT: retl
5665 ;
5666 ; X64-LABEL: test_mm_mask_fmadd_round_sd:
5667 ; X64: # %bb.0: # %entry
5668 ; X64-NEXT: kmovw %edi, %k1
5669 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5670 ; X64-NEXT: retq
5671 entry:
5672 %0 = extractelement <2 x double> %__W, i64 0
5673 %1 = extractelement <2 x double> %__A, i64 0
5674 %2 = extractelement <2 x double> %__B, i64 0
5675 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5676 %4 = bitcast i8 %__U to <8 x i1>
5677 %5 = extractelement <8 x i1> %4, i64 0
5678 %6 = select i1 %5, double %3, double %0
5679 %7 = insertelement <2 x double> %__W, double %6, i64 0
5680 ret <2 x double> %7
5681 }
5683 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1
5685 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5686 ; X86-LABEL: test_mm_maskz_fmadd_sd:
5687 ; X86: # %bb.0: # %entry
5688 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5689 ; X86-NEXT: kmovw %eax, %k1
5690 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5691 ; X86-NEXT: retl
5692 ;
5693 ; X64-LABEL: test_mm_maskz_fmadd_sd:
5694 ; X64: # %bb.0: # %entry
5695 ; X64-NEXT: kmovw %edi, %k1
5696 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5697 ; X64-NEXT: retq
5698 entry:
5699 %0 = extractelement <2 x double> %__A, i64 0
5700 %1 = extractelement <2 x double> %__B, i64 0
5701 %2 = extractelement <2 x double> %__C, i64 0
5702 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5703 %4 = and i8 %__U, 1
5704 %tobool.i = icmp eq i8 %4, 0
5705 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5706 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5707 ret <2 x double> %vecins.i
5708 }
5710 define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5711 ; X86-LABEL: test_mm_maskz_fmadd_round_sd:
5712 ; X86: # %bb.0: # %entry
5713 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5714 ; X86-NEXT: kmovw %eax, %k1
5715 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5716 ; X86-NEXT: retl
5717 ;
5718 ; X64-LABEL: test_mm_maskz_fmadd_round_sd:
5719 ; X64: # %bb.0: # %entry
5720 ; X64-NEXT: kmovw %edi, %k1
5721 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5722 ; X64-NEXT: retq
5723 entry:
5724 %0 = extractelement <2 x double> %__A, i64 0
5725 %1 = extractelement <2 x double> %__B, i64 0
5726 %2 = extractelement <2 x double> %__C, i64 0
5727 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5728 %4 = bitcast i8 %__U to <8 x i1>
5729 %5 = extractelement <8 x i1> %4, i64 0
5730 %6 = select i1 %5, double %3, double 0.000000e+00
5731 %7 = insertelement <2 x double> %__A, double %6, i64 0
5732 ret <2 x double> %7
5733 }
5735 define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5736 ; X86-LABEL: test_mm_mask3_fmadd_sd:
5737 ; X86: # %bb.0: # %entry
5738 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5739 ; X86-NEXT: kmovw %eax, %k1
5740 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5741 ; X86-NEXT: vmovapd %xmm2, %xmm0
5742 ; X86-NEXT: retl
5743 ;
5744 ; X64-LABEL: test_mm_mask3_fmadd_sd:
5745 ; X64: # %bb.0: # %entry
5746 ; X64-NEXT: kmovw %edi, %k1
5747 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5748 ; X64-NEXT: vmovapd %xmm2, %xmm0
5749 ; X64-NEXT: retq
5750 entry:
5751 %0 = extractelement <2 x double> %__W, i64 0
5752 %1 = extractelement <2 x double> %__X, i64 0
5753 %2 = extractelement <2 x double> %__Y, i64 0
5754 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5755 %4 = and i8 %__U, 1
5756 %tobool.i = icmp eq i8 %4, 0
5757 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5758 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5759 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5760 ret <2 x double> %vecins.i
5761 }
5763 define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5764 ; X86-LABEL: test_mm_mask3_fmadd_round_sd:
5765 ; X86: # %bb.0: # %entry
5766 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5767 ; X86-NEXT: kmovw %eax, %k1
5768 ; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5769 ; X86-NEXT: vmovapd %xmm2, %xmm0
5770 ; X86-NEXT: retl
5771 ;
5772 ; X64-LABEL: test_mm_mask3_fmadd_round_sd:
5773 ; X64: # %bb.0: # %entry
5774 ; X64-NEXT: kmovw %edi, %k1
5775 ; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5776 ; X64-NEXT: vmovapd %xmm2, %xmm0
5777 ; X64-NEXT: retq
5778 entry:
5779 %0 = extractelement <2 x double> %__W, i64 0
5780 %1 = extractelement <2 x double> %__X, i64 0
5781 %2 = extractelement <2 x double> %__Y, i64 0
5782 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5783 %4 = bitcast i8 %__U to <8 x i1>
5784 %5 = extractelement <8 x i1> %4, i64 0
5785 %6 = select i1 %5, double %3, double %2
5786 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5787 ret <2 x double> %7
5788 }
5790 define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5791 ; X86-LABEL: test_mm_mask_fmsub_sd:
5792 ; X86: # %bb.0: # %entry
5793 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5794 ; X86-NEXT: kmovw %eax, %k1
5795 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5796 ; X86-NEXT: retl
5797 ;
5798 ; X64-LABEL: test_mm_mask_fmsub_sd:
5799 ; X64: # %bb.0: # %entry
5800 ; X64-NEXT: kmovw %edi, %k1
5801 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5802 ; X64-NEXT: retq
5803 entry:
5804 %0 = extractelement <2 x double> %__W, i64 0
5805 %1 = extractelement <2 x double> %__A, i64 0
5806 %.rhs.i = extractelement <2 x double> %__B, i64 0
5807 %2 = fsub double -0.000000e+00, %.rhs.i
5808 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5809 %4 = and i8 %__U, 1
5810 %tobool.i = icmp eq i8 %4, 0
5811 %vecext1.i = extractelement <2 x double> %__W, i32 0
5812 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5813 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5814 ret <2 x double> %vecins.i
5815 }
5817 define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5818 ; X86-LABEL: test_mm_mask_fmsub_round_sd:
5819 ; X86: # %bb.0: # %entry
5820 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5821 ; X86-NEXT: kmovw %eax, %k1
5822 ; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5823 ; X86-NEXT: retl
5824 ;
5825 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
5826 ; X64: # %bb.0: # %entry
5827 ; X64-NEXT: kmovw %edi, %k1
5828 ; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5829 ; X64-NEXT: retq
5830 entry:
5831 %0 = extractelement <2 x double> %__W, i64 0
5832 %1 = extractelement <2 x double> %__A, i64 0
5833 %.rhs = extractelement <2 x double> %__B, i64 0
5834 %2 = fsub double -0.000000e+00, %.rhs
5835 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5836 %4 = bitcast i8 %__U to <8 x i1>
5837 %5 = extractelement <8 x i1> %4, i64 0
5838 %6 = select i1 %5, double %3, double %0
5839 %7 = insertelement <2 x double> %__W, double %6, i64 0
5840 ret <2 x double> %7
5841 }
5843 define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5844 ; X86-LABEL: test_mm_maskz_fmsub_sd:
5845 ; X86: # %bb.0: # %entry
5846 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5847 ; X86-NEXT: kmovw %eax, %k1
5848 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5849 ; X86-NEXT: retl
5850 ;
5851 ; X64-LABEL: test_mm_maskz_fmsub_sd:
5852 ; X64: # %bb.0: # %entry
5853 ; X64-NEXT: kmovw %edi, %k1
5854 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5855 ; X64-NEXT: retq
5856 entry:
5857 %0 = extractelement <2 x double> %__A, i64 0
5858 %1 = extractelement <2 x double> %__B, i64 0
5859 %.rhs.i = extractelement <2 x double> %__C, i64 0
5860 %2 = fsub double -0.000000e+00, %.rhs.i
5861 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5862 %4 = and i8 %__U, 1
5863 %tobool.i = icmp eq i8 %4, 0
5864 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5865 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5866 ret <2 x double> %vecins.i
5867 }
5869 define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5870 ; X86-LABEL: test_mm_maskz_fmsub_round_sd:
5871 ; X86: # %bb.0: # %entry
5872 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5873 ; X86-NEXT: kmovw %eax, %k1
5874 ; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5875 ; X86-NEXT: retl
5876 ;
5877 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
5878 ; X64: # %bb.0: # %entry
5879 ; X64-NEXT: kmovw %edi, %k1
5880 ; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5881 ; X64-NEXT: retq
5882 entry:
5883 %0 = extractelement <2 x double> %__A, i64 0
5884 %1 = extractelement <2 x double> %__B, i64 0
5885 %.rhs = extractelement <2 x double> %__C, i64 0
5886 %2 = fsub double -0.000000e+00, %.rhs
5887 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5888 %4 = bitcast i8 %__U to <8 x i1>
5889 %5 = extractelement <8 x i1> %4, i64 0
5890 %6 = select i1 %5, double %3, double 0.000000e+00
5891 %7 = insertelement <2 x double> %__A, double %6, i64 0
5892 ret <2 x double> %7
5893 }
5895 define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5896 ; X86-LABEL: test_mm_mask3_fmsub_sd:
5897 ; X86: # %bb.0: # %entry
5898 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5899 ; X86-NEXT: kmovw %eax, %k1
5900 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5901 ; X86-NEXT: vmovapd %xmm2, %xmm0
5902 ; X86-NEXT: retl
5903 ;
5904 ; X64-LABEL: test_mm_mask3_fmsub_sd:
5905 ; X64: # %bb.0: # %entry
5906 ; X64-NEXT: kmovw %edi, %k1
5907 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5908 ; X64-NEXT: vmovapd %xmm2, %xmm0
5909 ; X64-NEXT: retq
5910 entry:
5911 %0 = extractelement <2 x double> %__W, i64 0
5912 %1 = extractelement <2 x double> %__X, i64 0
5913 %.rhs.i = extractelement <2 x double> %__Y, i64 0
5914 %2 = fsub double -0.000000e+00, %.rhs.i
5915 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5916 %4 = and i8 %__U, 1
5917 %tobool.i = icmp eq i8 %4, 0
5918 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5919 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5920 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5921 ret <2 x double> %vecins.i
5922 }
5924 define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5925 ; X86-LABEL: test_mm_mask3_fmsub_round_sd:
5926 ; X86: # %bb.0: # %entry
5927 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5928 ; X86-NEXT: kmovw %eax, %k1
5929 ; X86-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5930 ; X86-NEXT: vmovapd %xmm2, %xmm0
5931 ; X86-NEXT: retl
5932 ;
5933 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
5934 ; X64: # %bb.0: # %entry
5935 ; X64-NEXT: kmovw %edi, %k1
5936 ; X64-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5937 ; X64-NEXT: vmovapd %xmm2, %xmm0
5938 ; X64-NEXT: retq
5939 entry:
5940 %0 = extractelement <2 x double> %__W, i64 0
5941 %1 = extractelement <2 x double> %__X, i64 0
5942 %.rhs = extractelement <2 x double> %__Y, i64 0
5943 %2 = fsub double -0.000000e+00, %.rhs
5944 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5945 %4 = bitcast i8 %__U to <8 x i1>
5946 %5 = extractelement <8 x i1> %4, i64 0
5947 %6 = select i1 %5, double %3, double %.rhs
5948 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5949 ret <2 x double> %7
5950 }
5952 define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5953 ; X86-LABEL: test_mm_mask_fnmadd_sd:
5954 ; X86: # %bb.0: # %entry
5955 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5956 ; X86-NEXT: kmovw %eax, %k1
5957 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5958 ; X86-NEXT: retl
5959 ;
5960 ; X64-LABEL: test_mm_mask_fnmadd_sd:
5961 ; X64: # %bb.0: # %entry
5962 ; X64-NEXT: kmovw %edi, %k1
5963 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5964 ; X64-NEXT: retq
5965 entry:
5966 %0 = extractelement <2 x double> %__W, i64 0
5967 %.rhs.i = extractelement <2 x double> %__A, i64 0
5968 %1 = fsub double -0.000000e+00, %.rhs.i
5969 %2 = extractelement <2 x double> %__B, i64 0
5970 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5971 %4 = and i8 %__U, 1
5972 %tobool.i = icmp eq i8 %4, 0
5973 %vecext1.i = extractelement <2 x double> %__W, i32 0
5974 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5975 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5976 ret <2 x double> %vecins.i
5977 }
5979 define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5980 ; X86-LABEL: test_mm_mask_fnmadd_round_sd:
5981 ; X86: # %bb.0: # %entry
5982 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5983 ; X86-NEXT: kmovw %eax, %k1
5984 ; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5985 ; X86-NEXT: retl
5986 ;
5987 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
5988 ; X64: # %bb.0: # %entry
5989 ; X64-NEXT: kmovw %edi, %k1
5990 ; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5991 ; X64-NEXT: retq
5992 entry:
5993 %0 = extractelement <2 x double> %__W, i64 0
5994 %.rhs = extractelement <2 x double> %__A, i64 0
5995 %1 = fsub double -0.000000e+00, %.rhs
5996 %2 = extractelement <2 x double> %__B, i64 0
5997 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5998 %4 = bitcast i8 %__U to <8 x i1>
5999 %5 = extractelement <8 x i1> %4, i64 0
6000 %6 = select i1 %5, double %3, double %0
6001 %7 = insertelement <2 x double> %__W, double %6, i64 0
6002 ret <2 x double> %7
6003 }
6005 define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6006 ; X86-LABEL: test_mm_maskz_fnmadd_sd:
6007 ; X86: # %bb.0: # %entry
6008 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6009 ; X86-NEXT: kmovw %eax, %k1
6010 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
6011 ; X86-NEXT: retl
6012 ;
6013 ; X64-LABEL: test_mm_maskz_fnmadd_sd:
6014 ; X64: # %bb.0: # %entry
6015 ; X64-NEXT: kmovw %edi, %k1
6016 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
6017 ; X64-NEXT: retq
6018 entry:
6019 %0 = extractelement <2 x double> %__A, i64 0
6020 %.rhs.i = extractelement <2 x double> %__B, i64 0
6021 %1 = fsub double -0.000000e+00, %.rhs.i
6022 %2 = extractelement <2 x double> %__C, i64 0
6023 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6024 %4 = and i8 %__U, 1
6025 %tobool.i = icmp eq i8 %4, 0
6026 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
6027 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
6028 ret <2 x double> %vecins.i
6029 }
6031 define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6032 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
6033 ; X86: # %bb.0: # %entry
6034 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6035 ; X86-NEXT: kmovw %eax, %k1
6036 ; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6037 ; X86-NEXT: retl
6038 ;
6039 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
6040 ; X64: # %bb.0: # %entry
6041 ; X64-NEXT: kmovw %edi, %k1
6042 ; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6043 ; X64-NEXT: retq
6044 entry:
6045 %0 = extractelement <2 x double> %__A, i64 0
6046 %.rhs = extractelement <2 x double> %__B, i64 0
6047 %1 = fsub double -0.000000e+00, %.rhs
6048 %2 = extractelement <2 x double> %__C, i64 0
6049 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6050 %4 = bitcast i8 %__U to <8 x i1>
6051 %5 = extractelement <8 x i1> %4, i64 0
6052 %6 = select i1 %5, double %3, double 0.000000e+00
6053 %7 = insertelement <2 x double> %__A, double %6, i64 0
6054 ret <2 x double> %7
6055 }
6057 define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6058 ; X86-LABEL: test_mm_mask3_fnmadd_sd:
6059 ; X86: # %bb.0: # %entry
6060 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6061 ; X86-NEXT: kmovw %eax, %k1
6062 ; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
6063 ; X86-NEXT: vmovapd %xmm2, %xmm0
6064 ; X86-NEXT: retl
6065 ;
6066 ; X64-LABEL: test_mm_mask3_fnmadd_sd:
6067 ; X64: # %bb.0: # %entry
6068 ; X64-NEXT: kmovw %edi, %k1
6069 ; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
6070 ; X64-NEXT: vmovapd %xmm2, %xmm0
6071 ; X64-NEXT: retq
6072 entry:
6073 %0 = extractelement <2 x double> %__W, i64 0
6074 %.rhs.i = extractelement <2 x double> %__X, i64 0
6075 %1 = fsub double -0.000000e+00, %.rhs.i
6076 %2 = extractelement <2 x double> %__Y, i64 0
6077 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6078 %4 = and i8 %__U, 1
6079 %tobool.i = icmp eq i8 %4, 0
6080 %vecext1.i = extractelement <2 x double> %__Y, i32 0
6081 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
6082 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6083 ret <2 x double> %vecins.i
6084 }
6086 define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6087 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
6088 ; X86: # %bb.0: # %entry
6089 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6090 ; X86-NEXT: kmovw %eax, %k1
6091 ; X86-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6092 ; X86-NEXT: vmovapd %xmm2, %xmm0
6093 ; X86-NEXT: retl
6094 ;
6095 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
6096 ; X64: # %bb.0: # %entry
6097 ; X64-NEXT: kmovw %edi, %k1
6098 ; X64-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6099 ; X64-NEXT: vmovapd %xmm2, %xmm0
6100 ; X64-NEXT: retq
6101 entry:
6102 %0 = extractelement <2 x double> %__W, i64 0
6103 %.rhs = extractelement <2 x double> %__X, i64 0
6104 %1 = fsub double -0.000000e+00, %.rhs
6105 %2 = extractelement <2 x double> %__Y, i64 0
6106 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6107 %4 = bitcast i8 %__U to <8 x i1>
6108 %5 = extractelement <8 x i1> %4, i64 0
6109 %6 = select i1 %5, double %3, double %2
6110 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6111 ret <2 x double> %7
6112 }
6114 define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6115 ; X86-LABEL: test_mm_mask_fnmsub_sd:
6116 ; X86: # %bb.0: # %entry
6117 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6118 ; X86-NEXT: kmovw %eax, %k1
6119 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
6120 ; X86-NEXT: retl
6121 ;
6122 ; X64-LABEL: test_mm_mask_fnmsub_sd:
6123 ; X64: # %bb.0: # %entry
6124 ; X64-NEXT: kmovw %edi, %k1
6125 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
6126 ; X64-NEXT: retq
6127 entry:
6128 %0 = extractelement <2 x double> %__W, i64 0
6129 %.rhs.i = extractelement <2 x double> %__A, i64 0
6130 %1 = fsub double -0.000000e+00, %.rhs.i
6131 %.rhs7.i = extractelement <2 x double> %__B, i64 0
6132 %2 = fsub double -0.000000e+00, %.rhs7.i
6133 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6134 %4 = and i8 %__U, 1
6135 %tobool.i = icmp eq i8 %4, 0
6136 %vecext2.i = extractelement <2 x double> %__W, i32 0
6137 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6138 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
6139 ret <2 x double> %vecins.i
6140 }
6142 define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6143 ; X86-LABEL: test_mm_mask_fnmsub_round_sd:
6144 ; X86: # %bb.0: # %entry
6145 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6146 ; X86-NEXT: kmovw %eax, %k1
6147 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6148 ; X86-NEXT: retl
6149 ;
6150 ; X64-LABEL: test_mm_mask_fnmsub_round_sd:
6151 ; X64: # %bb.0: # %entry
6152 ; X64-NEXT: kmovw %edi, %k1
6153 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6154 ; X64-NEXT: retq
6155 entry:
6156 %0 = extractelement <2 x double> %__W, i64 0
6157 %.rhs = extractelement <2 x double> %__A, i64 0
6158 %1 = fsub double -0.000000e+00, %.rhs
6159 %.rhs2 = extractelement <2 x double> %__B, i64 0
6160 %2 = fsub double -0.000000e+00, %.rhs2
6161 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6162 %4 = bitcast i8 %__U to <8 x i1>
6163 %5 = extractelement <8 x i1> %4, i64 0
6164 %6 = select i1 %5, double %3, double %0
6165 %7 = insertelement <2 x double> %__W, double %6, i64 0
6166 ret <2 x double> %7
6167 }
6169 define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6170 ; X86-LABEL: test_mm_maskz_fnmsub_sd:
6171 ; X86: # %bb.0: # %entry
6172 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6173 ; X86-NEXT: kmovw %eax, %k1
6174 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
6175 ; X86-NEXT: retl
6176 ;
6177 ; X64-LABEL: test_mm_maskz_fnmsub_sd:
6178 ; X64: # %bb.0: # %entry
6179 ; X64-NEXT: kmovw %edi, %k1
6180 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
6181 ; X64-NEXT: retq
6182 entry:
6183 %0 = extractelement <2 x double> %__A, i64 0
6184 %.rhs.i = extractelement <2 x double> %__B, i64 0
6185 %1 = fsub double -0.000000e+00, %.rhs.i
6186 %.rhs5.i = extractelement <2 x double> %__C, i64 0
6187 %2 = fsub double -0.000000e+00, %.rhs5.i
6188 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6189 %4 = and i8 %__U, 1
6190 %tobool.i = icmp eq i8 %4, 0
6191 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
6192 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
6193 ret <2 x double> %vecins.i
6194 }
6196 define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6197 ; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
6198 ; X86: # %bb.0: # %entry
6199 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6200 ; X86-NEXT: kmovw %eax, %k1
6201 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6202 ; X86-NEXT: retl
6203 ;
6204 ; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
6205 ; X64: # %bb.0: # %entry
6206 ; X64-NEXT: kmovw %edi, %k1
6207 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6208 ; X64-NEXT: retq
6209 entry:
6210 %0 = extractelement <2 x double> %__A, i64 0
6211 %.rhs = extractelement <2 x double> %__B, i64 0
6212 %1 = fsub double -0.000000e+00, %.rhs
6213 %.rhs2 = extractelement <2 x double> %__C, i64 0
6214 %2 = fsub double -0.000000e+00, %.rhs2
6215 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6216 %4 = bitcast i8 %__U to <8 x i1>
6217 %5 = extractelement <8 x i1> %4, i64 0
6218 %6 = select i1 %5, double %3, double 0.000000e+00
6219 %7 = insertelement <2 x double> %__A, double %6, i64 0
6220 ret <2 x double> %7
6221 }
6223 define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6224 ; X86-LABEL: test_mm_mask3_fnmsub_sd:
6225 ; X86: # %bb.0: # %entry
6226 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6227 ; X86-NEXT: kmovw %eax, %k1
6228 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
6229 ; X86-NEXT: vmovapd %xmm2, %xmm0
6230 ; X86-NEXT: retl
6231 ;
6232 ; X64-LABEL: test_mm_mask3_fnmsub_sd:
6233 ; X64: # %bb.0: # %entry
6234 ; X64-NEXT: kmovw %edi, %k1
6235 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
6236 ; X64-NEXT: vmovapd %xmm2, %xmm0
6237 ; X64-NEXT: retq
6238 entry:
6239 %0 = extractelement <2 x double> %__W, i64 0
6240 %.rhs.i = extractelement <2 x double> %__X, i64 0
6241 %1 = fsub double -0.000000e+00, %.rhs.i
6242 %.rhs7.i = extractelement <2 x double> %__Y, i64 0
6243 %2 = fsub double -0.000000e+00, %.rhs7.i
6244 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6245 %4 = and i8 %__U, 1
6246 %tobool.i = icmp eq i8 %4, 0
6247 %vecext2.i = extractelement <2 x double> %__Y, i32 0
6248 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6249 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6250 ret <2 x double> %vecins.i
6251 }
6253 define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6254 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
6255 ; X86: # %bb.0: # %entry
6256 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6257 ; X86-NEXT: kmovw %eax, %k1
6258 ; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6259 ; X86-NEXT: vmovapd %xmm2, %xmm0
6260 ; X86-NEXT: retl
6261 ;
6262 ; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
6263 ; X64: # %bb.0: # %entry
6264 ; X64-NEXT: kmovw %edi, %k1
6265 ; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6266 ; X64-NEXT: vmovapd %xmm2, %xmm0
6267 ; X64-NEXT: retq
6268 entry:
6269 %0 = extractelement <2 x double> %__W, i64 0
6270 %.rhs = extractelement <2 x double> %__X, i64 0
6271 %1 = fsub double -0.000000e+00, %.rhs
6272 %.rhs1 = extractelement <2 x double> %__Y, i64 0
6273 %2 = fsub double -0.000000e+00, %.rhs1
6274 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6275 %4 = bitcast i8 %__U to <8 x i1>
6276 %5 = extractelement <8 x i1> %4, i64 0
6277 %6 = select i1 %5, double %3, double %.rhs1
6278 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6279 ret <2 x double> %7
6280 }
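; Expand-load tests: llvm.masked.expandload.* loads consecutive elements from
; memory into the destination lanes whose mask bit is set; the remaining
; lanes keep the passthrough value (or zero for the maskz variants).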
6282 define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6283 ; X86-LABEL: test_mm512_mask_expandloadu_epi64:
6284 ; X86: # %bb.0: # %entry
6285 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6286 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6287 ; X86-NEXT: kmovw %ecx, %k1
6288 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
6289 ; X86-NEXT: retl
6290 ;
6291 ; X64-LABEL: test_mm512_mask_expandloadu_epi64:
6292 ; X64: # %bb.0: # %entry
6293 ; X64-NEXT: kmovw %edi, %k1
6294 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
6295 ; X64-NEXT: retq
6296 entry:
6297 %0 = bitcast i8* %__P to i64*
6298 %1 = bitcast i8 %__U to <8 x i1>
6299 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
6300 ret <8 x i64> %2
6301 }
6303 define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6304 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
6305 ; X86: # %bb.0: # %entry
6306 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6307 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6308 ; X86-NEXT: kmovw %ecx, %k1
6309 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
6310 ; X86-NEXT: retl
6311 ;
6312 ; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
6313 ; X64: # %bb.0: # %entry
6314 ; X64-NEXT: kmovw %edi, %k1
6315 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
6316 ; X64-NEXT: retq
6317 entry:
6318 %0 = bitcast i8* %__P to i64*
6319 %1 = bitcast i8 %__U to <8 x i1>
6320 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
6321 ret <8 x i64> %2
6322 }
6324 define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
6325 ; X86-LABEL: test_mm512_mask_expandloadu_pd:
6326 ; X86: # %bb.0: # %entry
6327 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6328 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6329 ; X86-NEXT: kmovw %ecx, %k1
6330 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
6331 ; X86-NEXT: retl
6332 ;
6333 ; X64-LABEL: test_mm512_mask_expandloadu_pd:
6334 ; X64: # %bb.0: # %entry
6335 ; X64-NEXT: kmovw %edi, %k1
6336 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
6337 ; X64-NEXT: retq
6338 entry:
6339 %0 = bitcast i8* %__P to double*
6340 %1 = bitcast i8 %__U to <8 x i1>
6341 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
6342 ret <8 x double> %2
6343 }
6345 define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
6346 ; X86-LABEL: test_mm512_maskz_expandloadu_pd:
6347 ; X86: # %bb.0: # %entry
6348 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6349 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6350 ; X86-NEXT: kmovw %ecx, %k1
6351 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
6352 ; X86-NEXT: retl
6353 ;
6354 ; X64-LABEL: test_mm512_maskz_expandloadu_pd:
6355 ; X64: # %bb.0: # %entry
6356 ; X64-NEXT: kmovw %edi, %k1
6357 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
6358 ; X64-NEXT: retq
6359 entry:
6360 %0 = bitcast i8* %__P to double*
6361 %1 = bitcast i8 %__U to <8 x i1>
6362 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
6363 ret <8 x double> %2
6364 }
6366 define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
6367 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
6368 ; X86: # %bb.0: # %entry
6369 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6370 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6371 ; X86-NEXT: kmovw %ecx, %k1
6372 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
6373 ; X86-NEXT: retl
6374 ;
6375 ; X64-LABEL: test_mm512_mask_expandloadu_epi32:
6376 ; X64: # %bb.0: # %entry
6377 ; X64-NEXT: kmovw %edi, %k1
6378 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
6379 ; X64-NEXT: retq
6380 entry:
6381 %0 = bitcast <8 x i64> %__W to <16 x i32>
6382 %1 = bitcast i8* %__P to i32*
6383 %2 = bitcast i16 %__U to <16 x i1>
6384 %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
6385 %4 = bitcast <16 x i32> %3 to <8 x i64>
6386 ret <8 x i64> %4
6387 }
6389 define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
6390 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
6391 ; X86: # %bb.0: # %entry
6392 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6393 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6394 ; X86-NEXT: kmovw %ecx, %k1
6395 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
6396 ; X86-NEXT: retl
6397 ;
6398 ; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
6399 ; X64: # %bb.0: # %entry
6400 ; X64-NEXT: kmovw %edi, %k1
6401 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
6402 ; X64-NEXT: retq
6403 entry:
6404 %0 = bitcast i8* %__P to i32*
6405 %1 = bitcast i16 %__U to <16 x i1>
6406 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
6407 %3 = bitcast <16 x i32> %2 to <8 x i64>
6408 ret <8 x i64> %3
6409 }
6411 define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
6412 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
6413 ; X86: # %bb.0: # %entry
6414 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6415 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6416 ; X86-NEXT: kmovw %ecx, %k1
6417 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
6418 ; X86-NEXT: retl
6419 ;
6420 ; X64-LABEL: test_mm512_mask_expandloadu_ps:
6421 ; X64: # %bb.0: # %entry
6422 ; X64-NEXT: kmovw %edi, %k1
6423 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
6424 ; X64-NEXT: retq
6425 entry:
6426 %0 = bitcast i8* %__P to float*
6427 %1 = bitcast i16 %__U to <16 x i1>
6428 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
6429 ret <16 x float> %2
6430 }
6432 define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
6433 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
6434 ; X86: # %bb.0: # %entry
6435 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6436 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6437 ; X86-NEXT: kmovw %ecx, %k1
6438 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
6439 ; X86-NEXT: retl
6440 ;
6441 ; X64-LABEL: test_mm512_maskz_expandloadu_ps:
6442 ; X64: # %bb.0: # %entry
6443 ; X64-NEXT: kmovw %edi, %k1
6444 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
6445 ; X64-NEXT: retq
6446 entry:
6447 %0 = bitcast i8* %__P to float*
6448 %1 = bitcast i16 %__U to <16 x i1>
6449 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
6450 ret <16 x float> %2
6451 }
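; Compress-store tests: llvm.masked.compressstore.* stores the lanes whose
; mask bit is set to consecutive memory locations. vzeroupper is emitted
; before returning since no ZMM value is live out.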
6453 define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
6454 ; X86-LABEL: test_mm512_mask_compressstoreu_pd:
6455 ; X86: # %bb.0: # %entry
6456 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6457 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6458 ; X86-NEXT: kmovw %eax, %k1
6459 ; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
6460 ; X86-NEXT: vzeroupper
6461 ; X86-NEXT: retl
6462 ;
6463 ; X64-LABEL: test_mm512_mask_compressstoreu_pd:
6464 ; X64: # %bb.0: # %entry
6465 ; X64-NEXT: kmovw %esi, %k1
6466 ; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
6467 ; X64-NEXT: vzeroupper
6468 ; X64-NEXT: retq
6469 entry:
6470 %0 = bitcast i8* %__P to double*
6471 %1 = bitcast i8 %__U to <8 x i1>
6472 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
6473 ret void
6474 }
6476 define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
6477 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
6478 ; X86: # %bb.0: # %entry
6479 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6480 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6481 ; X86-NEXT: kmovw %eax, %k1
6482 ; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
6483 ; X86-NEXT: vzeroupper
6484 ; X86-NEXT: retl
6485 ;
6486 ; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
6487 ; X64: # %bb.0: # %entry
6488 ; X64-NEXT: kmovw %esi, %k1
6489 ; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
6490 ; X64-NEXT: vzeroupper
6491 ; X64-NEXT: retq
6492 entry:
6493 %0 = bitcast i8* %__P to i64*
6494 %1 = bitcast i8 %__U to <8 x i1>
6495 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
6496 ret void
6497 }
6499 define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
6500 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
6501 ; X86: # %bb.0: # %entry
6502 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6503 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6504 ; X86-NEXT: kmovw %eax, %k1
6505 ; X86-NEXT: vcompressps %zmm0, (%ecx) {%k1}
6506 ; X86-NEXT: vzeroupper
6507 ; X86-NEXT: retl
6508 ;
6509 ; X64-LABEL: test_mm512_mask_compressstoreu_ps:
6510 ; X64: # %bb.0: # %entry
6511 ; X64-NEXT: kmovw %esi, %k1
6512 ; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
6513 ; X64-NEXT: vzeroupper
6514 ; X64-NEXT: retq
6515 entry:
6516 %0 = bitcast i8* %__P to float*
6517 %1 = bitcast i16 %__U to <16 x i1>
6518 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
6519 ret void
6520 }
6522 define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
6523 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
6524 ; X86: # %bb.0: # %entry
6525 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6526 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6527 ; X86-NEXT: kmovw %eax, %k1
6528 ; X86-NEXT: vpcompressd %zmm0, (%ecx) {%k1}
6529 ; X86-NEXT: vzeroupper
6530 ; X86-NEXT: retl
6531 ;
6532 ; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
6533 ; X64: # %bb.0: # %entry
6534 ; X64-NEXT: kmovw %esi, %k1
6535 ; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
6536 ; X64-NEXT: vzeroupper
6537 ; X64-NEXT: retq
6538 entry:
6539 %0 = bitcast <8 x i64> %__A to <16 x i32>
6540 %1 = bitcast i8* %__P to i32*
6541 %2 = bitcast i16 %__U to <16 x i1>
6542 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
6543 ret void
6544 }
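; Reduction tests: the IR repeatedly halves the vector with shufflevector and
; combines the halves until one element remains. On i386 the i64 result is
; returned in eax:edx (vmovd plus vpextrd $1).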
6546 define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
6547 ; X86-LABEL: test_mm512_reduce_add_epi64:
6548 ; X86: # %bb.0: # %entry
6549 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6550 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6551 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6552 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6553 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6554 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6555 ; X86-NEXT: vmovd %xmm0, %eax
6556 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6557 ; X86-NEXT: vzeroupper
6558 ; X86-NEXT: retl
6559 ;
6560 ; X64-LABEL: test_mm512_reduce_add_epi64:
6561 ; X64: # %bb.0: # %entry
6562 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6563 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6564 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6565 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6566 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6567 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6568 ; X64-NEXT: vmovq %xmm0, %rax
6569 ; X64-NEXT: vzeroupper
6570 ; X64-NEXT: retq
6571 entry:
6572 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6573 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6574 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6575 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6576 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6577 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6578 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6579 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6580 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
6581 ret i64 %vecext.i
6582 }
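; AVX512F has no packed 64-bit multiply (vpmullq needs AVX512DQ), so the mul
; reduction is expanded into vpmuludq partial products plus shifts and adds.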
6584 define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
6585 ; X86-LABEL: test_mm512_reduce_mul_epi64:
6586 ; X86: # %bb.0: # %entry
6587 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6588 ; X86-NEXT: vpsrlq $32, %ymm0, %ymm2
6589 ; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6590 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm3
6591 ; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6592 ; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6593 ; X86-NEXT: vpsllq $32, %ymm2, %ymm2
6594 ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6595 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6596 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6597 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6598 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6599 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6600 ; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6601 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6602 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6603 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6604 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6605 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6606 ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6607 ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6608 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
6609 ; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6610 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6611 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6612 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6613 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6614 ; X86-NEXT: vmovd %xmm0, %eax
6615 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6616 ; X86-NEXT: vzeroupper
6617 ; X86-NEXT: retl
6618 ;
6619 ; X64-LABEL: test_mm512_reduce_mul_epi64:
6620 ; X64: # %bb.0: # %entry
6621 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6622 ; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
6623 ; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6624 ; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
6625 ; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6626 ; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6627 ; X64-NEXT: vpsllq $32, %ymm2, %ymm2
6628 ; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6629 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6630 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6631 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6632 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6633 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6634 ; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6635 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6636 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6637 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6638 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6639 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6640 ; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
6641 ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6642 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
6643 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6644 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6645 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6646 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6647 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6648 ; X64-NEXT: vmovq %xmm0, %rax
6649 ; X64-NEXT: vzeroupper
6650 ; X64-NEXT: retq
6651 entry:
6652 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6653 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6654 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
6655 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6656 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6657 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
6658 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6659 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
6660 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
6661 ret i64 %vecext.i
6662 }
6664 define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
6665 ; X86-LABEL: test_mm512_reduce_or_epi64:
6666 ; X86: # %bb.0: # %entry
6667 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6668 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6669 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6670 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6671 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6672 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6673 ; X86-NEXT: vmovd %xmm0, %eax
6674 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6675 ; X86-NEXT: vzeroupper
6676 ; X86-NEXT: retl
6677 ;
6678 ; X64-LABEL: test_mm512_reduce_or_epi64:
6679 ; X64: # %bb.0: # %entry
6680 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6681 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6682 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6683 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6684 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6685 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6686 ; X64-NEXT: vmovq %xmm0, %rax
6687 ; X64-NEXT: vzeroupper
6688 ; X64-NEXT: retq
6689 entry:
6690 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6691 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6692 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
6693 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6694 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6695 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
6696 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6697 %or7.i = or <2 x i64> %shuffle6.i, %or4.i
6698 %vecext.i = extractelement <2 x i64> %or7.i, i32 0
6699 ret i64 %vecext.i
6700 }
define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}
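; Note: the masked 64-bit reduction tests below first materialize the
; operation's identity in the inactive lanes (zero for add/or, one for mul,
; all-ones via vpternlogd for and) and then run the same halving sequence.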
define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = add <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%add7.i = add <2 x i64> %shuffle6.i, %add4.i
%vecext.i = extractelement <2 x i64> %add7.i, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT: vpsrlq $32, %ymm0, %ymm3
; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT: vpsllq $32, %ymm2, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpsrlq $32, %ymm1, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT: vpsllq $32, %ymm2, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
%vecext.i = extractelement <2 x i64> %mul7.i, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or.i = or <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%or7.i = or <2 x i64> %shuffle6.i, %or4.i
%vecext.i = extractelement <2 x i64> %or7.i, i32 0
ret i64 %vecext.i
}
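; Note: the 32-bit reductions below reuse the extract-and-halve pattern and
; add one extra vpshufd step to fold the final two 32-bit lanes; both RUN
; lines produce identical code here, so the checks use the common CHECK prefix.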
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_add_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%add.i = add <8 x i32> %0, %1
%2 = bitcast <8 x i32> %add.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%add5.i = add <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = add <4 x i32> %shuffle.i, %add5.i
%shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add8.i = add <4 x i32> %shuffle7.i, %add6.i
%vecext.i = extractelement <4 x i32> %add8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_mul_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%mul.i = mul <8 x i32> %0, %1
%2 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%mul5.i = mul <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
%shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
%vecext.i = extractelement <4 x i32> %mul8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_or_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or25.i = or <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or526.i = or <2 x i64> %extract3.i, %extract4.i
%or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or6.i = or <4 x i32> %shuffle.i, %or5.i
%shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or8.i = or <4 x i32> %shuffle7.i, %or6.i
%vecext.i = extractelement <4 x i32> %or8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_and_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and25.i = and <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and526.i = and <2 x i64> %extract3.i, %extract4.i
%and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and6.i = and <4 x i32> %shuffle.i, %and5.i
%shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and8.i = and <4 x i32> %shuffle7.i, %and6.i
%vecext.i = extractelement <4 x i32> %and8.i, i32 0
ret i32 %vecext.i
}
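; Note: in the masked 32-bit variants the 16-bit mask is moved into %k1 with
; kmovw and vmovdqa32 selects the identity into the inactive lanes before the
; reduction runs.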
define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%add.i = add <8 x i32> %4, %5
%6 = bitcast <8 x i32> %add.i to <4 x i64>
%extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract4.i to <4 x i32>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract5.i to <4 x i32>
%add6.i = add <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = add <4 x i32> %shuffle.i, %add6.i
%shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add9.i = add <4 x i32> %shuffle8.i, %add7.i
%vecext.i = extractelement <4 x i32> %add9.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%mul.i = mul <8 x i32> %4, %5
%6 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract5.i to <4 x i32>
%extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract6.i to <4 x i32>
%mul7.i = mul <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
%shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
%vecext.i = extractelement <4 x i32> %mul10.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and28.i = and <4 x i64> %extract.i, %extract4.i
%extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and729.i = and <2 x i64> %extract5.i, %extract6.i
%and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and8.i = and <4 x i32> %shuffle.i, %and7.i
%shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and10.i = and <4 x i32> %shuffle9.i, %and8.i
%vecext.i = extractelement <4 x i32> %and10.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or27.i = or <4 x i64> %extract.i, %extract3.i
%extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or628.i = or <2 x i64> %extract4.i, %extract5.i
%or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or7.i = or <4 x i32> %shuffle.i, %or6.i
%shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or9.i = or <4 x i32> %shuffle8.i, %or7.i
%vecext.i = extractelement <4 x i32> %or9.i, i32 0
ret i32 %vecext.i
}
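; Note: floating-point reductions follow. On X86 the scalar result is returned
; on the x87 stack, so those checks spill the value to the stack and reload it
; with fldl/flds before returning.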
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}
define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}
define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%add.i = fadd <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add5.i = fadd <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = fadd <4 x float> %add5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add8.i = fadd <4 x float> %add6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %add8.i, i32 0
ret float %vecext.i
}
define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%mul.i = fmul <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul5.i = fmul <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %mul8.i, i32 0
ret float %vecext.i
}
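; Note: the masked FP variants select 0.0 (add) or a broadcast 1.0 (mul) into
; the inactive lanes before running the same reduction tree.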
define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}
define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}
define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract3.i to <8 x float>
%add.i = fadd <8 x float> %3, %4
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add6.i = fadd <4 x float> %extract4.i, %extract5.i
%shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = fadd <4 x float> %add6.i, %shuffle.i
%shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add9.i = fadd <4 x float> %add7.i, %shuffle8.i
%vecext.i = extractelement <4 x float> %add9.i, i32 0
ret float %vecext.i
}
define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%mul.i = fmul <8 x float> %3, %4
%extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul7.i = fmul <4 x float> %extract5.i, %extract6.i
%shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
%shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
%vecext.i = extractelement <4 x float> %mul10.i, i32 0
ret float %vecext.i
}
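; Note: the min/max reductions below are expressed as icmp+select in IR (or
; the llvm.x86 min/max intrinsics for FP) and are checked against
; vpmaxsq/vpmaxuq/vpminsq/vpminuq and vmaxpd/vminpd lowerings.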
define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp slt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp sgt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ult <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ugt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ugt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
%extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
%shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %2, i32 0
ret double %vecext.i
}
define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp sgt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp slt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp slt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ugt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ult <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ult <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
  %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %2, i32 0
  ret double %vecext.i
}

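; NOTE: The masked reductions that follow first select, under the mask,
; between each source lane and the identity of the operation (INT64_MIN for
; signed max, zero for unsigned max, INT64_MAX for signed min, all-ones for
; unsigned min, -Inf/+Inf for floating-point max/min), then run the same
; shuffle reduction as the unmasked tests.
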
define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp sgt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp sgt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp sgt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

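; NOTE: In the test above, i386 has no 64-bit GPR broadcast, so the
; INT64_MIN splat is loaded from the constant pool as 32-bit halves
; ([0,2147483648,...]) instead of using vpbroadcastq, and the i64 result is
; returned in edx:eax via vmovd/vpextrd rather than a single vmovq.
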
define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ugt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ugt <8 x i64> %3, %shuffle2.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
  %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ugt <8 x i64> %5, %shuffle4.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

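; NOTE: With zero as the identity (unsigned max), the select folds into a
; zero-masked register move (vmovdqa64 %zmm0, %zmm0 {%k1} {z}); no constant
; needs to be materialized.
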
define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

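; NOTE: For floating-point max the identity is -Inf, broadcast with
; vbroadcastsd before the merge-masked vmovapd. A modern equivalent of the
; unmasked part would be the fmax reduction intrinsic; again a sketch only:
;   %r = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %v)
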
define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp slt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp slt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp slt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ult <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ult <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

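; NOTE: The i32 reductions below operate on sixteen lanes, so they need four
; halving steps instead of three (hence the extra vpshufd/min-max pair). The
; unmasked ones compile identically on both triples, so their assertions use
; the shared CHECK prefix instead of separate X86/X64 blocks.
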
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp sgt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp sgt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp sgt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp sgt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ugt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ugt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ugt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ugt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

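; NOTE: On i386 a float return travels through the x87 stack, which is why
; the X86 block above spills the SSE result to the stack (vmovss) and
; reloads it with flds; the double reductions do the same with vmovsd/fldl.
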
define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp slt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp slt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp slt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp slt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ult <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ult <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ult <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ult <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

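; NOTE: For the masked i32/f32 reductions below, the 16-bit mask arrives on
; the stack on i386 (movzwl) and in %edi on x86-64; kmovw moves it into %k1,
; and vmovdqa32/vmovaps then merge the source with the identity vector.
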
define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp sgt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp sgt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp sgt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp sgt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
  %6 = icmp ugt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %11 = icmp ugt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ugt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ugt <4 x i32> %14, %shuffle9.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp slt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp slt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp slt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp slt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp ult <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp ult <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ult <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ult <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

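; NOTE: The remaining tests check masked/maskz vmaxpd, vmaxps, vminpd and
; vminps. The trailing i32 4 on the intrinsic calls is the rounding argument
; (_MM_FROUND_CUR_DIRECTION), which is why the *_round_* variants select
; exactly the same instruction as the plain ones.
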
define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_max_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}

define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

9062 declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
9064 define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
9065 ; X86-LABEL: test_mm512_maskz_max_round_ps:
9066 ; X86: # %bb.0: # %entry
9067 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9068 ; X86-NEXT: kmovw %eax, %k1
9069 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
9072 ; X64-LABEL: test_mm512_maskz_max_round_ps:
9073 ; X64: # %bb.0: # %entry
9074 ; X64-NEXT: kmovw %edi, %k1
9075 ; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
9078 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9079 %1 = bitcast i16 %__U to <16 x i1>
9080 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
9084 define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
9085 ; CHECK-LABEL: test_mm512_max_round_ps:
9086 ; CHECK: # %bb.0: # %entry
9087 ; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
9088 ; CHECK-NEXT: ret{{[l|q]}}
9090 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
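
; The min tests mirror the max tests above: identical masking patterns, with
; llvm.x86.avx512.min.pd.512 expected to lower to vminpd. Hedged C-level
; sketch:
;   __m512d r = _mm512_mask_min_pd(W, U, A, B);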
define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_min_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  ret <8 x double> %0
}
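
; Single-precision min variants follow. Note the i16 mask covers all 16
; float lanes, so the X86 checks load it with movzwl rather than the movb
; used for the i8 double masks. Hedged C-level sketch:
;   __m512 r = _mm512_maskz_min_ps(U, A, B);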
define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_min_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  ret <16 x float> %0
}
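
; The sqrt tests use the generic llvm.sqrt.* intrinsics for the plain and
; masked forms, while the *_round_* forms call llvm.x86.avx512.sqrt.*.512
; with i32 8, which by the usual encoding is _MM_FROUND_TO_NEAREST_INT |
; _MM_FROUND_NO_EXC and shows up as the {rn-sae} embedded-rounding operand
; in the checks. Hedged C-level sketch:
;   __m512d r = _mm512_sqrt_round_pd(A, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);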
define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
; CHECK-LABEL: test_mm512_sqrt_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)

define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
  ret <8 x double> %0
}
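
; The same pattern repeats for single precision: llvm.sqrt.v16f32 for the
; plain and masked forms, llvm.x86.avx512.sqrt.ps.512 with i32 8 ({rn-sae})
; for the round forms.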
define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
; CHECK-LABEL: test_mm512_sqrt_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)

define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
  ret <16 x float> %0
}
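
; The rotate tests express rotates with the funnel-shift intrinsics:
; llvm.fshl with the same value in both data operands is a rotate-left, and
; a uniform constant count should select the immediate forms vprold/vprolq.
; Hedged C-level sketch:
;   __m512i r = _mm512_rol_epi32(A, 5);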
define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_rol_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprold $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_rol_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
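
; The rolv tests feed a per-lane count vector to llvm.fshl, which should
; select the variable-count rotates vprolvd/vprolvq. Hedged C-level sketch:
;   __m512i r = _mm512_rolv_epi32(A, B);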
define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
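
; Rotate-right mirrors rotate-left: llvm.fshr with identical data operands
; and a uniform constant count should select vprord/vprorq. Hedged C-level
; sketch:
;   __m512i r = _mm512_ror_epi32(A, 5);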
define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
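
; Variable-count rotate-right: llvm.fshr with a per-lane count vector should
; select vprorvd/vprorvq. Hedged C-level sketch:
;   __m512i r = _mm512_rorv_epi64(A, B);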
define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}
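
; Declarations for the intrinsics referenced above; the fma, expand-load/
; compress-store, and 128/256-bit max/min declarations are presumably used
; by tests in earlier parts of this file.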
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)

declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)