; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c

define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovdqa64 136(%ebp), %zmm3
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: kunpckbw %k0, %k1, %k1
; X86-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $-1, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestc:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $-1, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $0, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestz:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $0, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}

define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permute_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

1439 define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1440 ; X86-LABEL: test_mm512_mask_unpacklo_epi32:
1442 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1443 ; X86-NEXT: kmovw %eax, %k1
1444 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1447 ; X64-LABEL: test_mm512_mask_unpacklo_epi32:
1449 ; X64-NEXT: kmovw %edi, %k1
1450 ; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1452 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1453 %arg1 = bitcast i16 %a1 to <16 x i1>
1454 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1455 %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
1456 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1457 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1458 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1459 ret <8 x i64> %res2
1462 define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1463 ; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
1465 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1466 ; X86-NEXT: kmovw %eax, %k1
1467 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1470 ; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
1472 ; X64-NEXT: kmovw %edi, %k1
1473 ; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1475 %arg0 = bitcast i16 %a0 to <16 x i1>
1476 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1477 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1478 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1479 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1480 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1481 ret <8 x i64> %res2
1484 define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
1485 ; CHECK-LABEL: test_mm512_unpacklo_epi64:
1487 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1488 ; CHECK-NEXT: ret{{[l|q]}}
1489 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1490 ret <8 x i64> %res
1493 define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1494 ; X86-LABEL: test_mm512_mask_unpacklo_epi64:
1496 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1497 ; X86-NEXT: kmovw %eax, %k1
1498 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1501 ; X64-LABEL: test_mm512_mask_unpacklo_epi64:
1503 ; X64-NEXT: kmovw %edi, %k1
1504 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1506 %arg1 = bitcast i8 %a1 to <8 x i1>
1507 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1508 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
1509 ret <8 x i64> %res1
1512 define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1513 ; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
1515 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1516 ; X86-NEXT: kmovw %eax, %k1
1517 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1520 ; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
1522 ; X64-NEXT: kmovw %edi, %k1
1523 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1525 %arg0 = bitcast i8 %a0 to <8 x i1>
1526 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1527 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
1528 ret <8 x i64> %res1
1531 define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
1532 ; CHECK-LABEL: test_mm512_unpacklo_pd:
1534 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1535 ; CHECK-NEXT: ret{{[l|q]}}
1536 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1537 ret <8 x double> %res
1540 define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1541 ; X86-LABEL: test_mm512_mask_unpacklo_pd:
1543 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1544 ; X86-NEXT: kmovw %eax, %k1
1545 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1548 ; X64-LABEL: test_mm512_mask_unpacklo_pd:
1550 ; X64-NEXT: kmovw %edi, %k1
1551 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1553 %arg1 = bitcast i8 %a1 to <8 x i1>
1554 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1555 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1556 ret <8 x double> %res1
1559 define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1560 ; X86-LABEL: test_mm512_maskz_unpacklo_pd:
1562 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1563 ; X86-NEXT: kmovw %eax, %k1
1564 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1567 ; X64-LABEL: test_mm512_maskz_unpacklo_pd:
1569 ; X64-NEXT: kmovw %edi, %k1
1570 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1572 %arg0 = bitcast i8 %a0 to <8 x i1>
1573 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1574 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1575 ret <8 x double> %res1
1578 define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
1579 ; CHECK-LABEL: test_mm512_unpacklo_ps:
1581 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1582 ; CHECK-NEXT: ret{{[l|q]}}
1583 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1584 ret <16 x float> %res
1587 define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
1588 ; X86-LABEL: test_mm512_mask_unpacklo_ps:
1590 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1591 ; X86-NEXT: kmovw %eax, %k1
1592 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1595 ; X64-LABEL: test_mm512_mask_unpacklo_ps:
1597 ; X64-NEXT: kmovw %edi, %k1
1598 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1600 %arg1 = bitcast i16 %a1 to <16 x i1>
1601 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1602 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
1603 ret <16 x float> %res1
1606 define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
1607 ; X86-LABEL: test_mm512_maskz_unpacklo_ps:
1609 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1610 ; X86-NEXT: kmovw %eax, %k1
1611 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1614 ; X64-LABEL: test_mm512_maskz_unpacklo_ps:
1616 ; X64-NEXT: kmovw %edi, %k1
1617 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1619 %arg0 = bitcast i16 %a0 to <16 x i1>
1620 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1621 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1622 ret <16 x float> %res1
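; NOTE: The zext*_*512 tests below rely on VEX/EVEX moves implicitly zeroing the upper bits of the destination zmm register, so zero-extending a 128-bit or 256-bit value into a 512-bit vector should fold to a single vmovaps of the source onto itself.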
1625 define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
1626 ; CHECK-LABEL: test_mm512_zextpd128_pd512:
1628 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1629 ; CHECK-NEXT: ret{{[l|q]}}
1630 %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1631 ret <8 x double> %res
1634 define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
1635 ; CHECK-LABEL: test_mm512_zextpd256_pd512:
1637 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1638 ; CHECK-NEXT: ret{{[l|q]}}
1639 %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1640 ret <8 x double> %res
1643 define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
1644 ; CHECK-LABEL: test_mm512_zextps128_ps512:
1646 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1647 ; CHECK-NEXT: ret{{[l|q]}}
1648 %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
1649 ret <16 x float> %res
1652 define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
1653 ; CHECK-LABEL: test_mm512_zextps256_ps512:
1655 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1656 ; CHECK-NEXT: ret{{[l|q]}}
1657 %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1658 ret <16 x float> %res
1661 define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
1662 ; CHECK-LABEL: test_mm512_zextsi128_si512:
1664 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1665 ; CHECK-NEXT: ret{{[l|q]}}
1666 %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1667 ret <8 x i64> %res
1670 define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
1671 ; CHECK-LABEL: test_mm512_zextsi256_si512:
1673 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1674 ; CHECK-NEXT: ret{{[l|q]}}
1675 %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1676 ret <8 x i64> %res
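; NOTE: There is no IR intrinsic for _mm512_mul_epi32: the tests below sign-extend the even 32-bit lanes in place (shl/ashr exact by 32) and multiply as i64, a pattern the backend matches to vpmuldq. The epu32 variants mask with 0xffffffff instead and should select vpmuludq.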
1679 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1680 ; CHECK-LABEL: test_mm512_mul_epi32:
1682 ; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
1683 ; CHECK-NEXT: ret{{[l|q]}}
1684 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1685 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1686 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1687 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1688 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
1689 ret <8 x i64> %tmp4
1692 define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1693 ; X86-LABEL: test_mm512_maskz_mul_epi32:
1694 ; X86: # %bb.0: # %entry
1695 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1696 ; X86-NEXT: kmovw %eax, %k1
1697 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1700 ; X64-LABEL: test_mm512_maskz_mul_epi32:
1701 ; X64: # %bb.0: # %entry
1702 ; X64-NEXT: kmovw %edi, %k1
1703 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1706 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1707 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1708 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1709 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1710 %4 = mul nsw <8 x i64> %3, %1
1711 %5 = bitcast i8 %__k to <8 x i1>
1712 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
1713 ret <8 x i64> %6
1716 define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1717 ; X86-LABEL: test_mm512_mask_mul_epi32:
1718 ; X86: # %bb.0: # %entry
1719 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1720 ; X86-NEXT: kmovw %eax, %k1
1721 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1722 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1725 ; X64-LABEL: test_mm512_mask_mul_epi32:
1726 ; X64: # %bb.0: # %entry
1727 ; X64-NEXT: kmovw %edi, %k1
1728 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1729 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1732 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1733 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1734 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1735 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1736 %4 = mul nsw <8 x i64> %3, %1
1737 %5 = bitcast i8 %__k to <8 x i1>
1738 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
1739 ret <8 x i64> %6
1742 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1743 ; CHECK-LABEL: test_mm512_mul_epu32:
1745 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
1746 ; CHECK-NEXT: ret{{[l|q]}}
1747 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1748 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1749 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
1750 ret <8 x i64> %tmp2
1753 define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1754 ; X86-LABEL: test_mm512_maskz_mul_epu32:
1755 ; X86: # %bb.0: # %entry
1756 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1757 ; X86-NEXT: kmovw %eax, %k1
1758 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1761 ; X64-LABEL: test_mm512_maskz_mul_epu32:
1762 ; X64: # %bb.0: # %entry
1763 ; X64-NEXT: kmovw %edi, %k1
1764 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1767 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1768 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1769 %2 = mul nuw <8 x i64> %1, %0
1770 %3 = bitcast i8 %__k to <8 x i1>
1771 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1772 ret <8 x i64> %4
1775 define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1776 ; X86-LABEL: test_mm512_mask_mul_epu32:
1777 ; X86: # %bb.0: # %entry
1778 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1779 ; X86-NEXT: kmovw %eax, %k1
1780 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1781 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1784 ; X64-LABEL: test_mm512_mask_mul_epu32:
1785 ; X64: # %bb.0: # %entry
1786 ; X64-NEXT: kmovw %edi, %k1
1787 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1788 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1791 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1792 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1793 %2 = mul nuw <8 x i64> %1, %0
1794 %3 = bitcast i8 %__k to <8 x i1>
1795 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
1796 ret <8 x i64> %4
1799 define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
1800 ; X86-LABEL: test_mm512_set1_epi8:
1801 ; X86: # %bb.0: # %entry
1802 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1803 ; X86-NEXT: vmovd %eax, %xmm0
1804 ; X86-NEXT: vpbroadcastb %xmm0, %ymm0
1805 ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1808 ; X64-LABEL: test_mm512_set1_epi8:
1809 ; X64: # %bb.0: # %entry
1810 ; X64-NEXT: vmovd %edi, %xmm0
1811 ; X64-NEXT: vpbroadcastb %xmm0, %ymm0
1812 ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1815 %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
1816 %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
1817 %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
1818 ret <8 x double> %0
1821 define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
1822 ; X86-LABEL: test_mm_cvtu32_sd:
1823 ; X86: # %bb.0: # %entry
1824 ; X86-NEXT: vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
1827 ; X64-LABEL: test_mm_cvtu32_sd:
1828 ; X64: # %bb.0: # %entry
1829 ; X64-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0
1832 %conv.i = uitofp i32 %__B to double
1833 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1834 ret <2 x double> %vecins.i
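; NOTE: i386 has no 64-bit GPR operand for vcvtusi2sd/vcvtusi2ss, so the cvtu64 tests below expect the u64-to-fp conversion to be expanded inline on X86 (a punpck/vsubpd bias trick for double, x87 fildll plus a bias constant for float) while X64 checks a single instruction.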
1837 define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
1838 ; X86-LABEL: test_mm_cvtu64_sd:
1839 ; X86: # %bb.0: # %entry
1840 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1841 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1842 ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
1843 ; X86-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1844 ; X86-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
1845 ; X86-NEXT: vaddsd %xmm1, %xmm2, %xmm1
1846 ; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1849 ; X64-LABEL: test_mm_cvtu64_sd:
1850 ; X64: # %bb.0: # %entry
1851 ; X64-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0
1854 %conv.i = uitofp i64 %__B to double
1855 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1856 ret <2 x double> %vecins.i
1859 define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
1860 ; X86-LABEL: test_mm_cvtu32_ss:
1861 ; X86: # %bb.0: # %entry
1862 ; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
1865 ; X64-LABEL: test_mm_cvtu32_ss:
1866 ; X64: # %bb.0: # %entry
1867 ; X64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
1870 %conv.i = uitofp i32 %__B to float
1871 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1872 ret <4 x float> %vecins.i
1875 define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
1876 ; X86-LABEL: test_mm_cvtu64_ss:
1877 ; X86: # %bb.0: # %entry
1878 ; X86-NEXT: pushl %ebp
1879 ; X86-NEXT: .cfi_def_cfa_offset 8
1880 ; X86-NEXT: .cfi_offset %ebp, -8
1881 ; X86-NEXT: movl %esp, %ebp
1882 ; X86-NEXT: .cfi_def_cfa_register %ebp
1883 ; X86-NEXT: andl $-8, %esp
1884 ; X86-NEXT: subl $16, %esp
1885 ; X86-NEXT: movl 12(%ebp), %eax
1886 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1887 ; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1888 ; X86-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
1889 ; X86-NEXT: shrl $31, %eax
1890 ; X86-NEXT: fildll {{[0-9]+}}(%esp)
1891 ; X86-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
1892 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
1893 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1894 ; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1895 ; X86-NEXT: movl %ebp, %esp
1896 ; X86-NEXT: popl %ebp
1897 ; X86-NEXT: .cfi_def_cfa %esp, 4
1900 ; X64-LABEL: test_mm_cvtu64_ss:
1901 ; X64: # %bb.0: # %entry
1902 ; X64-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0
1905 %conv.i = uitofp i64 %__B to float
1906 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1907 ret <4 x float> %vecins.i
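; NOTE: The cvtph_ps tests below express the conversion as a plain fpext from <16 x half> and expect vcvtph2ps; the masked and zero-masked forms only add a select on the <16 x i1> mask.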
1910 define <16 x float> @test_mm512_cvtph_ps(<4 x i64> %__A) {
1911 ; CHECK-LABEL: test_mm512_cvtph_ps:
1912 ; CHECK: # %bb.0: # %entry
1913 ; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0
1914 ; CHECK-NEXT: ret{{[l|q]}}
1916 %0 = bitcast <4 x i64> %__A to <16 x i16>
1917 %1 = bitcast <16 x i16> %0 to <16 x half>
1918 %2 = fpext <16 x half> %1 to <16 x float>
1919 ret <16 x float> %2
1922 define <16 x float> @test_mm512_mask_cvtph_ps(<16 x float> %__W, i16 zeroext %__U, <4 x i64> %__A) {
1923 ; X86-LABEL: test_mm512_mask_cvtph_ps:
1924 ; X86: # %bb.0: # %entry
1925 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1926 ; X86-NEXT: kmovw %eax, %k1
1927 ; X86-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1}
1930 ; X64-LABEL: test_mm512_mask_cvtph_ps:
1931 ; X64: # %bb.0: # %entry
1932 ; X64-NEXT: kmovw %edi, %k1
1933 ; X64-NEXT: vcvtph2ps %ymm1, %zmm0 {%k1}
1936 %0 = bitcast <4 x i64> %__A to <16 x i16>
1937 %1 = bitcast <16 x i16> %0 to <16 x half>
1938 %2 = bitcast i16 %__U to <16 x i1>
1939 %3 = fpext <16 x half> %1 to <16 x float>
1940 %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> %__W
1941 ret <16 x float> %4
1944 define <16 x float> @test_mm512_maskz_cvtph_ps(i16 zeroext %__U, <4 x i64> %__A) {
1945 ; X86-LABEL: test_mm512_maskz_cvtph_ps:
1946 ; X86: # %bb.0: # %entry
1947 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1948 ; X86-NEXT: kmovw %eax, %k1
1949 ; X86-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
1952 ; X64-LABEL: test_mm512_maskz_cvtph_ps:
1953 ; X64: # %bb.0: # %entry
1954 ; X64-NEXT: kmovw %edi, %k1
1955 ; X64-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
1958 %0 = bitcast <4 x i64> %__A to <16 x i16>
1959 %1 = bitcast <16 x i16> %0 to <16 x half>
1960 %2 = bitcast i16 %__U to <16 x i1>
1961 %3 = fpext <16 x half> %1 to <16 x float>
1962 %4 = select <16 x i1> %2, <16 x float> %3, <16 x float> zeroinitializer
1963 ret <16 x float> %4
1966 define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
1967 ; CHECK-LABEL: test_mm512_cvtps_pd:
1968 ; CHECK: # %bb.0: # %entry
1969 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1970 ; CHECK-NEXT: ret{{[l|q]}}
1972 %conv.i = fpext <8 x float> %__A to <8 x double>
1973 ret <8 x double> %conv.i
1976 define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
1977 ; CHECK-LABEL: test_mm512_cvtpslo_pd:
1978 ; CHECK: # %bb.0: # %entry
1979 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1980 ; CHECK-NEXT: ret{{[l|q]}}
1982 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1983 %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
1984 ret <8 x double> %conv.i.i
1987 define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
1988 ; X86-LABEL: test_mm512_mask_cvtps_pd:
1989 ; X86: # %bb.0: # %entry
1990 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
1991 ; X86-NEXT: kmovw %eax, %k1
1992 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1995 ; X64-LABEL: test_mm512_mask_cvtps_pd:
1996 ; X64: # %bb.0: # %entry
1997 ; X64-NEXT: kmovw %edi, %k1
1998 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
2001 %conv.i.i = fpext <8 x float> %__A to <8 x double>
2002 %0 = bitcast i8 %__U to <8 x i1>
2003 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
2004 ret <8 x double> %1
2007 define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
2008 ; X86-LABEL: test_mm512_mask_cvtpslo_pd:
2009 ; X86: # %bb.0: # %entry
2010 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2011 ; X86-NEXT: kmovw %eax, %k1
2012 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
2015 ; X64-LABEL: test_mm512_mask_cvtpslo_pd:
2016 ; X64: # %bb.0: # %entry
2017 ; X64-NEXT: kmovw %edi, %k1
2018 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
2021 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2022 %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
2023 %0 = bitcast i8 %__U to <8 x i1>
2024 %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
2025 ret <8 x double> %1
2028 define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
2029 ; X86-LABEL: test_mm512_maskz_cvtps_pd:
2030 ; X86: # %bb.0: # %entry
2031 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2032 ; X86-NEXT: kmovw %eax, %k1
2033 ; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
2036 ; X64-LABEL: test_mm512_maskz_cvtps_pd:
2037 ; X64: # %bb.0: # %entry
2038 ; X64-NEXT: kmovw %edi, %k1
2039 ; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
2042 %conv.i.i = fpext <8 x float> %__A to <8 x double>
2043 %0 = bitcast i8 %__U to <8 x i1>
2044 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
2045 ret <8 x double> %1
2048 define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
2049 ; CHECK-LABEL: test_mm512_cvtepi32_epi8:
2050 ; CHECK: # %bb.0: # %entry
2051 ; CHECK-NEXT: vpmovdb %zmm0, %xmm0
2052 ; CHECK-NEXT: vzeroupper
2053 ; CHECK-NEXT: ret{{[l|q]}}
2055 %0 = bitcast <8 x i64> %__A to <16 x i32>
2056 %conv.i = trunc <16 x i32> %0 to <16 x i8>
2057 %1 = bitcast <16 x i8> %conv.i to <2 x i64>
2058 ret <2 x i64> %1
2061 define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
2062 ; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
2063 ; X86: # %bb.0: # %entry
2064 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2065 ; X86-NEXT: kmovw %eax, %k1
2066 ; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
2067 ; X86-NEXT: vzeroupper
2070 ; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
2071 ; X64: # %bb.0: # %entry
2072 ; X64-NEXT: kmovw %edi, %k1
2073 ; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
2074 ; X64-NEXT: vzeroupper
2077 %0 = bitcast <8 x i64> %__A to <16 x i32>
2078 %1 = bitcast <2 x i64> %__O to <16 x i8>
2079 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
2080 %3 = bitcast <16 x i8> %2 to <2 x i64>
2081 ret <2 x i64> %3
2084 define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
2085 ; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
2086 ; X86: # %bb.0: # %entry
2087 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2088 ; X86-NEXT: kmovw %eax, %k1
2089 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2090 ; X86-NEXT: vzeroupper
2093 ; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
2094 ; X64: # %bb.0: # %entry
2095 ; X64-NEXT: kmovw %edi, %k1
2096 ; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2097 ; X64-NEXT: vzeroupper
2100 %0 = bitcast <8 x i64> %__A to <16 x i32>
2101 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
2102 %2 = bitcast <16 x i8> %1 to <2 x i64>
2103 ret <2 x i64> %2
2106 define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
2107 ; CHECK-LABEL: test_mm512_cvtepi64_epi32:
2108 ; CHECK: # %bb.0: # %entry
2109 ; CHECK-NEXT: vpmovqd %zmm0, %ymm0
2110 ; CHECK-NEXT: ret{{[l|q]}}
2112 %conv.i = trunc <8 x i64> %__A to <8 x i32>
2113 %0 = bitcast <8 x i32> %conv.i to <4 x i64>
2114 ret <4 x i64> %0
2117 define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2118 ; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
2119 ; X86: # %bb.0: # %entry
2120 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2121 ; X86-NEXT: kmovw %eax, %k1
2122 ; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2125 ; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
2126 ; X64: # %bb.0: # %entry
2127 ; X64-NEXT: kmovw %edi, %k1
2128 ; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2131 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2132 %0 = bitcast <4 x i64> %__O to <8 x i32>
2133 %1 = bitcast i8 %__M to <8 x i1>
2134 %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
2135 %3 = bitcast <8 x i32> %2 to <4 x i64>
2136 ret <4 x i64> %3
2139 define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
2140 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
2141 ; X86: # %bb.0: # %entry
2142 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2143 ; X86-NEXT: kmovw %eax, %k1
2144 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2147 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
2148 ; X64: # %bb.0: # %entry
2149 ; X64-NEXT: kmovw %edi, %k1
2150 ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2153 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2154 %0 = bitcast i8 %__M to <8 x i1>
2155 %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
2156 %2 = bitcast <8 x i32> %1 to <4 x i64>
2157 ret <4 x i64> %2
2160 define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
2161 ; CHECK-LABEL: test_mm512_cvtepi64_epi16:
2162 ; CHECK: # %bb.0: # %entry
2163 ; CHECK-NEXT: vpmovqw %zmm0, %xmm0
2164 ; CHECK-NEXT: vzeroupper
2165 ; CHECK-NEXT: ret{{[l|q]}}
2167 %conv.i = trunc <8 x i64> %__A to <8 x i16>
2168 %0 = bitcast <8 x i16> %conv.i to <2 x i64>
2169 ret <2 x i64> %0
2172 define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2173 ; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
2174 ; X86: # %bb.0: # %entry
2175 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2176 ; X86-NEXT: kmovw %eax, %k1
2177 ; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2178 ; X86-NEXT: vzeroupper
2181 ; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
2182 ; X64: # %bb.0: # %entry
2183 ; X64-NEXT: kmovw %edi, %k1
2184 ; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2185 ; X64-NEXT: vzeroupper
2188 %0 = bitcast <2 x i64> %__O to <8 x i16>
2189 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
2190 %2 = bitcast <8 x i16> %1 to <2 x i64>
2191 ret <2 x i64> %2
2194 define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
2195 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
2196 ; X86: # %bb.0: # %entry
2197 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2198 ; X86-NEXT: kmovw %eax, %k1
2199 ; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2200 ; X86-NEXT: vzeroupper
2203 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
2204 ; X64: # %bb.0: # %entry
2205 ; X64-NEXT: kmovw %edi, %k1
2206 ; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2207 ; X64-NEXT: vzeroupper
2210 %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
2211 %1 = bitcast <8 x i16> %0 to <2 x i64>
2212 ret <2 x i64> %1
2215 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
2216 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
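; NOTE: The ternarylogic tests below pass the truth-table immediate (4) straight through to the pternlog intrinsics; the mask/maskz variants differ only in the select on the result, which should surface as the {%k1} and {%k1} {z} forms of vpternlogd/vpternlogq.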
2218 define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2219 ; CHECK-LABEL: test_mm512_ternarylogic_epi32:
2220 ; CHECK: # %bb.0: # %entry
2221 ; CHECK-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0
2222 ; CHECK-NEXT: ret{{[l|q]}}
2224 %0 = bitcast <8 x i64> %__A to <16 x i32>
2225 %1 = bitcast <8 x i64> %__B to <16 x i32>
2226 %2 = bitcast <8 x i64> %__C to <16 x i32>
2227 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2228 %4 = bitcast <16 x i32> %3 to <8 x i64>
2229 ret <8 x i64> %4
2232 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
2234 define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2235 ; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
2236 ; X86: # %bb.0: # %entry
2237 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2238 ; X86-NEXT: kmovw %eax, %k1
2239 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2242 ; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
2243 ; X64: # %bb.0: # %entry
2244 ; X64-NEXT: kmovw %edi, %k1
2245 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2248 %0 = bitcast <8 x i64> %__A to <16 x i32>
2249 %1 = bitcast <8 x i64> %__B to <16 x i32>
2250 %2 = bitcast <8 x i64> %__C to <16 x i32>
2251 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2252 %4 = bitcast i16 %__U to <16 x i1>
2253 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2254 %6 = bitcast <16 x i32> %5 to <8 x i64>
2255 ret <8 x i64> %6
2258 define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2259 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
2260 ; X86: # %bb.0: # %entry
2261 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2262 ; X86-NEXT: kmovw %eax, %k1
2263 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2266 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
2267 ; X64: # %bb.0: # %entry
2268 ; X64-NEXT: kmovw %edi, %k1
2269 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2272 %0 = bitcast <8 x i64> %__A to <16 x i32>
2273 %1 = bitcast <8 x i64> %__B to <16 x i32>
2274 %2 = bitcast <8 x i64> %__C to <16 x i32>
2275 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2276 %4 = bitcast i16 %__U to <16 x i1>
2277 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2278 %6 = bitcast <16 x i32> %5 to <8 x i64>
2279 ret <8 x i64> %6
2282 define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2283 ; CHECK-LABEL: test_mm512_ternarylogic_epi64:
2284 ; CHECK: # %bb.0: # %entry
2285 ; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0
2286 ; CHECK-NEXT: ret{{[l|q]}}
2288 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2289 ret <8 x i64> %0
2292 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1
2294 define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2295 ; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
2296 ; X86: # %bb.0: # %entry
2297 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2298 ; X86-NEXT: kmovw %eax, %k1
2299 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2302 ; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
2303 ; X64: # %bb.0: # %entry
2304 ; X64-NEXT: kmovw %edi, %k1
2305 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2308 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2309 %1 = bitcast i8 %__U to <8 x i1>
2310 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2311 ret <8 x i64> %2
2314 define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2315 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
2316 ; X86: # %bb.0: # %entry
2317 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2318 ; X86-NEXT: kmovw %eax, %k1
2319 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2322 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
2323 ; X64: # %bb.0: # %entry
2324 ; X64-NEXT: kmovw %edi, %k1
2325 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2328 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2329 %1 = bitcast i8 %__U to <8 x i1>
2330 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2331 ret <8 x i64> %2
2334 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2336 define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
2337 ; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
2338 ; X86: # %bb.0: # %entry
2339 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2340 ; X86-NEXT: kmovw %eax, %k1
2341 ; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2342 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2345 ; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
2346 ; X64: # %bb.0: # %entry
2347 ; X64-NEXT: kmovw %edi, %k1
2348 ; X64-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2349 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2352 %0 = bitcast <8 x i64> %__A to <16 x i32>
2353 %1 = bitcast <8 x i64> %__I to <16 x i32>
2354 %2 = bitcast <8 x i64> %__B to <16 x i32>
2355 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2356 %4 = bitcast i16 %__U to <16 x i1>
2357 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
2358 %6 = bitcast <16 x i32> %5 to <8 x i64>
2359 ret <8 x i64> %6
2362 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
2364 define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
2365 ; X86-LABEL: test_mm512_mask2_permutex2var_pd:
2366 ; X86: # %bb.0: # %entry
2367 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2368 ; X86-NEXT: kmovw %eax, %k1
2369 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2370 ; X86-NEXT: vmovapd %zmm1, %zmm0
2373 ; X64-LABEL: test_mm512_mask2_permutex2var_pd:
2374 ; X64: # %bb.0: # %entry
2375 ; X64-NEXT: kmovw %edi, %k1
2376 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2377 ; X64-NEXT: vmovapd %zmm1, %zmm0
2380 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2381 %1 = bitcast <8 x i64> %__I to <8 x double>
2382 %2 = bitcast i8 %__U to <8 x i1>
2383 %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
2384 ret <8 x double> %3
2387 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
2389 define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
2390 ; X86-LABEL: test_mm512_mask2_permutex2var_ps:
2391 ; X86: # %bb.0: # %entry
2392 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2393 ; X86-NEXT: kmovw %eax, %k1
2394 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2395 ; X86-NEXT: vmovaps %zmm1, %zmm0
2398 ; X64-LABEL: test_mm512_mask2_permutex2var_ps:
2399 ; X64: # %bb.0: # %entry
2400 ; X64-NEXT: kmovw %edi, %k1
2401 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2402 ; X64-NEXT: vmovaps %zmm1, %zmm0
2405 %0 = bitcast <8 x i64> %__I to <16 x i32>
2406 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2407 %2 = bitcast <8 x i64> %__I to <16 x float>
2408 %3 = bitcast i16 %__U to <16 x i1>
2409 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
2410 ret <16 x float> %4
2413 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2415 define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
2416 ; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
2417 ; X86: # %bb.0: # %entry
2418 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2419 ; X86-NEXT: kmovw %eax, %k1
2420 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2421 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2424 ; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
2425 ; X64: # %bb.0: # %entry
2426 ; X64-NEXT: kmovw %edi, %k1
2427 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2428 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2431 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2432 %1 = bitcast i8 %__U to <8 x i1>
2433 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
2434 ret <8 x i64> %2
2437 define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2438 ; CHECK-LABEL: test_mm512_permutex2var_epi32:
2439 ; CHECK: # %bb.0: # %entry
2440 ; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
2441 ; CHECK-NEXT: ret{{[l|q]}}
2443 %0 = bitcast <8 x i64> %__A to <16 x i32>
2444 %1 = bitcast <8 x i64> %__I to <16 x i32>
2445 %2 = bitcast <8 x i64> %__B to <16 x i32>
2446 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2447 %4 = bitcast <16 x i32> %3 to <8 x i64>
2448 ret <8 x i64> %4
2451 define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2452 ; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
2453 ; X86: # %bb.0: # %entry
2454 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2455 ; X86-NEXT: kmovw %eax, %k1
2456 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2459 ; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
2460 ; X64: # %bb.0: # %entry
2461 ; X64-NEXT: kmovw %edi, %k1
2462 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2465 %0 = bitcast <8 x i64> %__A to <16 x i32>
2466 %1 = bitcast <8 x i64> %__I to <16 x i32>
2467 %2 = bitcast <8 x i64> %__B to <16 x i32>
2468 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2469 %4 = bitcast i16 %__U to <16 x i1>
2470 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2471 %6 = bitcast <16 x i32> %5 to <8 x i64>
2472 ret <8 x i64> %6
2475 define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2476 ; X86-LABEL: test_mm512_mask_permutex2var_epi32:
2477 ; X86: # %bb.0: # %entry
2478 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2479 ; X86-NEXT: kmovw %eax, %k1
2480 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2483 ; X64-LABEL: test_mm512_mask_permutex2var_epi32:
2484 ; X64: # %bb.0: # %entry
2485 ; X64-NEXT: kmovw %edi, %k1
2486 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2489 %0 = bitcast <8 x i64> %__A to <16 x i32>
2490 %1 = bitcast <8 x i64> %__I to <16 x i32>
2491 %2 = bitcast <8 x i64> %__B to <16 x i32>
2492 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2493 %4 = bitcast i16 %__U to <16 x i1>
2494 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2495 %6 = bitcast <16 x i32> %5 to <8 x i64>
2496 ret <8 x i64> %6
2499 define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2500 ; CHECK-LABEL: test_mm512_permutex2var_pd:
2501 ; CHECK: # %bb.0: # %entry
2502 ; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
2503 ; CHECK-NEXT: ret{{[l|q]}}
2505 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2506 ret <8 x double> %0
2509 define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
2510 ; X86-LABEL: test_mm512_mask_permutex2var_pd:
2511 ; X86: # %bb.0: # %entry
2512 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2513 ; X86-NEXT: kmovw %eax, %k1
2514 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2517 ; X64-LABEL: test_mm512_mask_permutex2var_pd:
2518 ; X64: # %bb.0: # %entry
2519 ; X64-NEXT: kmovw %edi, %k1
2520 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2523 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2524 %1 = bitcast i8 %__U to <8 x i1>
2525 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
2526 ret <8 x double> %2
2529 define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2530 ; X86-LABEL: test_mm512_maskz_permutex2var_pd:
2531 ; X86: # %bb.0: # %entry
2532 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2533 ; X86-NEXT: kmovw %eax, %k1
2534 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2537 ; X64-LABEL: test_mm512_maskz_permutex2var_pd:
2538 ; X64: # %bb.0: # %entry
2539 ; X64-NEXT: kmovw %edi, %k1
2540 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2543 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2544 %1 = bitcast i8 %__U to <8 x i1>
2545 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
2546 ret <8 x double> %2
2549 define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2550 ; CHECK-LABEL: test_mm512_permutex2var_ps:
2551 ; CHECK: # %bb.0: # %entry
2552 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
2553 ; CHECK-NEXT: ret{{[l|q]}}
2555 %0 = bitcast <8 x i64> %__I to <16 x i32>
2556 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2557 ret <16 x float> %1
2560 define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
2561 ; X86-LABEL: test_mm512_mask_permutex2var_ps:
2562 ; X86: # %bb.0: # %entry
2563 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2564 ; X86-NEXT: kmovw %eax, %k1
2565 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2568 ; X64-LABEL: test_mm512_mask_permutex2var_ps:
2569 ; X64: # %bb.0: # %entry
2570 ; X64-NEXT: kmovw %edi, %k1
2571 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2574 %0 = bitcast <8 x i64> %__I to <16 x i32>
2575 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2576 %2 = bitcast i16 %__U to <16 x i1>
2577 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
2578 ret <16 x float> %3
2581 define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2582 ; X86-LABEL: test_mm512_maskz_permutex2var_ps:
2583 ; X86: # %bb.0: # %entry
2584 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2585 ; X86-NEXT: kmovw %eax, %k1
2586 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2589 ; X64-LABEL: test_mm512_maskz_permutex2var_ps:
2590 ; X64: # %bb.0: # %entry
2591 ; X64-NEXT: kmovw %edi, %k1
2592 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2595 %0 = bitcast <8 x i64> %__I to <16 x i32>
2596 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2597 %2 = bitcast i16 %__U to <16 x i1>
2598 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2599 ret <16 x float> %3
2602 define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2603 ; CHECK-LABEL: test_mm512_permutex2var_epi64:
2604 ; CHECK: # %bb.0: # %entry
2605 ; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
2606 ; CHECK-NEXT: ret{{[l|q]}}
2608 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2609 ret <8 x i64> %0
2612 define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2613 ; X86-LABEL: test_mm512_mask_permutex2var_epi64:
2614 ; X86: # %bb.0: # %entry
2615 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2616 ; X86-NEXT: kmovw %eax, %k1
2617 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2620 ; X64-LABEL: test_mm512_mask_permutex2var_epi64:
2621 ; X64: # %bb.0: # %entry
2622 ; X64-NEXT: kmovw %edi, %k1
2623 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2626 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2627 %1 = bitcast i8 %__U to <8 x i1>
2628 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2629 ret <8 x i64> %2
2632 define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2633 ; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
2634 ; X86: # %bb.0: # %entry
2635 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2636 ; X86-NEXT: kmovw %eax, %k1
2637 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2640 ; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
2641 ; X64: # %bb.0: # %entry
2642 ; X64-NEXT: kmovw %edi, %k1
2643 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2646 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2647 %1 = bitcast i8 %__U to <8 x i1>
2648 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2649 ret <8 x i64> %2
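; NOTE: The scalar mask_/maskz_ arithmetic tests below use only bit 0 of the i8 mask, either via an and/icmp on %__U or by extracting element 0 of its <8 x i1> bitcast; either way the expected lowering keeps the blend inside the masked scalar instruction (e.g. vaddss %xmm2, %xmm1, %xmm0 {%k1}) instead of branching.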
2651 define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2652 ; X86-LABEL: test_mm_mask_add_ss:
2653 ; X86: # %bb.0: # %entry
2654 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2655 ; X86-NEXT: kmovw %eax, %k1
2656 ; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2659 ; X64-LABEL: test_mm_mask_add_ss:
2660 ; X64: # %bb.0: # %entry
2661 ; X64-NEXT: kmovw %edi, %k1
2662 ; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2665 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2666 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2667 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2668 %0 = and i8 %__U, 1
2669 %tobool.i = icmp eq i8 %0, 0
2670 %vecext1.i = extractelement <4 x float> %__W, i32 0
2671 %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
2672 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2673 ret <4 x float> %vecins.i
2676 define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2677 ; X86-LABEL: test_mm_maskz_add_ss:
2678 ; X86: # %bb.0: # %entry
2679 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2680 ; X86-NEXT: kmovw %eax, %k1
2681 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2684 ; X64-LABEL: test_mm_maskz_add_ss:
2685 ; X64: # %bb.0: # %entry
2686 ; X64-NEXT: kmovw %edi, %k1
2687 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2690 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2691 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2692 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2693 %0 = and i8 %__U, 1
2694 %tobool.i = icmp eq i8 %0, 0
2695 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
2696 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2697 ret <4 x float> %vecins.i
2700 define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2701 ; X86-LABEL: test_mm_mask_add_sd:
2702 ; X86: # %bb.0: # %entry
2703 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2704 ; X86-NEXT: kmovw %eax, %k1
2705 ; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2708 ; X64-LABEL: test_mm_mask_add_sd:
2709 ; X64: # %bb.0: # %entry
2710 ; X64-NEXT: kmovw %edi, %k1
2711 ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2714 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2715 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2716 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2717 %0 = and i8 %__U, 1
2718 %tobool.i = icmp eq i8 %0, 0
2719 %vecext1.i = extractelement <2 x double> %__W, i32 0
2720 %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
2721 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2722 ret <2 x double> %vecins.i
2725 define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2726 ; X86-LABEL: test_mm_maskz_add_sd:
2727 ; X86: # %bb.0: # %entry
2728 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2729 ; X86-NEXT: kmovw %eax, %k1
2730 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2733 ; X64-LABEL: test_mm_maskz_add_sd:
2734 ; X64: # %bb.0: # %entry
2735 ; X64-NEXT: kmovw %edi, %k1
2736 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2739 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2740 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2741 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2742 %0 = and i8 %__U, 1
2743 %tobool.i = icmp eq i8 %0, 0
2744 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
2745 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2746 ret <2 x double> %vecins.i
2749 define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2750 ; X86-LABEL: test_mm_mask_sub_ss:
2751 ; X86: # %bb.0: # %entry
2752 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2753 ; X86-NEXT: kmovw %eax, %k1
2754 ; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2757 ; X64-LABEL: test_mm_mask_sub_ss:
2758 ; X64: # %bb.0: # %entry
2759 ; X64-NEXT: kmovw %edi, %k1
2760 ; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2763 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2764 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2765 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2766 %0 = and i8 %__U, 1
2767 %tobool.i = icmp eq i8 %0, 0
2768 %vecext1.i = extractelement <4 x float> %__W, i32 0
2769 %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
2770 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2771 ret <4 x float> %vecins.i
2774 define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2775 ; X86-LABEL: test_mm_maskz_sub_ss:
2776 ; X86: # %bb.0: # %entry
2777 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2778 ; X86-NEXT: kmovw %eax, %k1
2779 ; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2782 ; X64-LABEL: test_mm_maskz_sub_ss:
2783 ; X64: # %bb.0: # %entry
2784 ; X64-NEXT: kmovw %edi, %k1
2785 ; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2788 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2789 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2790 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2791 %0 = and i8 %__U, 1
2792 %tobool.i = icmp eq i8 %0, 0
2793 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
2794 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2795 ret <4 x float> %vecins.i
2798 define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2799 ; X86-LABEL: test_mm_mask_sub_sd:
2800 ; X86: # %bb.0: # %entry
2801 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2802 ; X86-NEXT: kmovw %eax, %k1
2803 ; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2806 ; X64-LABEL: test_mm_mask_sub_sd:
2807 ; X64: # %bb.0: # %entry
2808 ; X64-NEXT: kmovw %edi, %k1
2809 ; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2812 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2813 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2814 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2815 %0 = and i8 %__U, 1
2816 %tobool.i = icmp eq i8 %0, 0
2817 %vecext1.i = extractelement <2 x double> %__W, i32 0
2818 %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
2819 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2820 ret <2 x double> %vecins.i
2823 define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2824 ; X86-LABEL: test_mm_maskz_sub_sd:
2825 ; X86: # %bb.0: # %entry
2826 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2827 ; X86-NEXT: kmovw %eax, %k1
2828 ; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2831 ; X64-LABEL: test_mm_maskz_sub_sd:
2832 ; X64: # %bb.0: # %entry
2833 ; X64-NEXT: kmovw %edi, %k1
2834 ; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2837 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2838 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2839 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2840 %0 = and i8 %__U, 1
2841 %tobool.i = icmp eq i8 %0, 0
2842 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
2843 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2844 ret <2 x double> %vecins.i
2847 define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2848 ; X86-LABEL: test_mm_mask_mul_ss:
2849 ; X86: # %bb.0: # %entry
2850 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2851 ; X86-NEXT: kmovw %eax, %k1
2852 ; X86-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2855 ; X64-LABEL: test_mm_mask_mul_ss:
2856 ; X64: # %bb.0: # %entry
2857 ; X64-NEXT: kmovw %edi, %k1
2858 ; X64-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2861 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2862 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2863 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2864 %0 = and i8 %__U, 1
2865 %tobool.i = icmp eq i8 %0, 0
2866 %vecext1.i = extractelement <4 x float> %__W, i32 0
2867 %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
2868 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2869 ret <4 x float> %vecins.i
2872 define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2873 ; X86-LABEL: test_mm_maskz_mul_ss:
2874 ; X86: # %bb.0: # %entry
2875 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2876 ; X86-NEXT: kmovw %eax, %k1
2877 ; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2880 ; X64-LABEL: test_mm_maskz_mul_ss:
2881 ; X64: # %bb.0: # %entry
2882 ; X64-NEXT: kmovw %edi, %k1
2883 ; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2886 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2887 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2888 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2889 %0 = and i8 %__U, 1
2890 %tobool.i = icmp eq i8 %0, 0
2891 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
2892 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2893 ret <4 x float> %vecins.i
2896 define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2897 ; X86-LABEL: test_mm_mask_mul_sd:
2898 ; X86: # %bb.0: # %entry
2899 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2900 ; X86-NEXT: kmovw %eax, %k1
2901 ; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2904 ; X64-LABEL: test_mm_mask_mul_sd:
2905 ; X64: # %bb.0: # %entry
2906 ; X64-NEXT: kmovw %edi, %k1
2907 ; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2910 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2911 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2912 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2913 %0 = and i8 %__U, 1
2914 %tobool.i = icmp eq i8 %0, 0
2915 %vecext1.i = extractelement <2 x double> %__W, i32 0
2916 %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
2917 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2918 ret <2 x double> %vecins.i
2921 define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2922 ; X86-LABEL: test_mm_maskz_mul_sd:
2923 ; X86: # %bb.0: # %entry
2924 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
2925 ; X86-NEXT: kmovw %eax, %k1
2926 ; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2929 ; X64-LABEL: test_mm_maskz_mul_sd:
2930 ; X64: # %bb.0: # %entry
2931 ; X64-NEXT: kmovw %edi, %k1
2932 ; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2935 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2936 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2937 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2938 %0 = and i8 %__U, 1
2939 %tobool.i = icmp eq i8 %0, 0
2940 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
2941 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2942 ret <2 x double> %vecins.i
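; For reference, a hedged sketch of the C-level wrappers these masked scalar
; multiply tests mirror (per the avx512f-builtins.c note at the top of this
; file; the wrapper names are illustrative, the intrinsics themselves are real):
;   __m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
;     return _mm_mask_mul_ss(__W, __U, __A, __B);
;   }
;   __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) {
;     return _mm_maskz_mul_sd(__U, __A, __B);
;   }
; Only bit 0 of __U participates: the and/icmp/select sequence keeps the
; product when the bit is set and falls back to the passthrough (or zero)
; otherwise.
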
define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__W, i64 0
  %3 = fdiv float %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = fdiv float %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, float %2, float 0.000000e+00
  %6 = insertelement <4 x float> %__A, float %5, i64 0
  ret <4 x float> %6
}

define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__W, i64 0
  %3 = fdiv double %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = fdiv double %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, double %2, double 0.000000e+00
  %6 = insertelement <2 x double> %__A, double %5, i64 0
  ret <2 x double> %6
}

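; The division tests above mirror the same intrinsic family, e.g. (hedged
; sketch, wrapper name illustrative):
;   __m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
;     return _mm_mask_div_sd(__W, __U, __A, __B);
;   }
; Note the different mask idiom: here bit 0 is obtained by bitcasting the i8
; mask to <8 x i1> and extracting element 0, rather than by and/icmp.
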
define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

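; The trailing i32 operand is the rounding-mode immediate. The value 8 used
; throughout these tests corresponds to _MM_FROUND_TO_NEAREST_INT |
; _MM_FROUND_NO_EXC, which is why the expected instructions carry {rn-sae}.
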
define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

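; None of the fmsub/fnmadd/fnmsub round tests above carry an explicit negate
; in the IR: operands are negated by subtracting them from a splat of -0.0.
; In the unmasked variants that negation survives as a vpxorq of the sign
; bit; in the masked variants it is folded into the vfmsub/vfnmadd/vfnmsub
; instruction forms.
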
define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

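; Unlike the *_round_* tests, the non-rounding tests use the generic
; @llvm.fma.* intrinsics; the same AVX-512 FMA instructions are selected,
; just without an embedded rounding operand.
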
define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} zmm0 {%k1} {z} = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1

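; There is no separate fmsubadd intrinsic at this level: the fmsubadd tests
; below reuse @llvm.x86.avx512.vfmaddsub.pd.512 with a negated addend.
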
define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %3
}

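; The interleave above is how fmaddsub is expressed without a dedicated
; intrinsic: %0 computes a*b+c, %2 computes a*b-c (via the -0.0 subtraction),
; and the shufflevector takes even lanes from the subtract result and odd
; lanes from the add result, matching the vfmaddsub semantics.
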
4119 define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4120 ; X86-LABEL: test_mm512_mask_fmaddsub_pd:
4121 ; X86: # %bb.0: # %entry
4122 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4123 ; X86-NEXT: kmovw %eax, %k1
4124 ; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4125 ; X86-NEXT: retl
4126 ;
4127 ; X64-LABEL: test_mm512_mask_fmaddsub_pd:
4128 ; X64: # %bb.0: # %entry
4129 ; X64-NEXT: kmovw %edi, %k1
4130 ; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4131 ; X64-NEXT: retq
4132 entry:
4133 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4134 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4135 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4136 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4137 %4 = bitcast i8 %__U to <8 x i1>
4138 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
4139 ret <8 x double> %5
4140 }
4142 define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4143 ; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
4144 ; X86: # %bb.0: # %entry
4145 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4146 ; X86-NEXT: kmovw %eax, %k1
4147 ; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4148 ; X86-NEXT: vmovapd %zmm2, %zmm0
4149 ; X86-NEXT: retl
4150 ;
4151 ; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
4152 ; X64: # %bb.0: # %entry
4153 ; X64-NEXT: kmovw %edi, %k1
4154 ; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4155 ; X64-NEXT: vmovapd %zmm2, %zmm0
4156 ; X64-NEXT: retq
4157 entry:
4158 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4159 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4160 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4161 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4162 %4 = bitcast i8 %__U to <8 x i1>
4163 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
4164 ret <8 x double> %5
4165 }
4167 define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4168 ; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
4169 ; X86: # %bb.0: # %entry
4170 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4171 ; X86-NEXT: kmovw %eax, %k1
4172 ; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4173 ; X86-NEXT: retl
4174 ;
4175 ; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
4176 ; X64: # %bb.0: # %entry
4177 ; X64-NEXT: kmovw %edi, %k1
4178 ; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4179 ; X64-NEXT: retq
4180 entry:
4181 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4182 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4183 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4184 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4185 %4 = bitcast i8 %__U to <8 x i1>
4186 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
4187 ret <8 x double> %5
4188 }
4190 define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4191 ; CHECK-LABEL: test_mm512_fmsubadd_pd:
4192 ; CHECK: # %bb.0: # %entry
4193 ; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4194 ; CHECK-NEXT: ret{{[l|q]}}
4195 entry:
4196 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4197 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4198 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4199 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4200 ret <8 x double> %2
4201 }
4203 define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4204 ; X86-LABEL: test_mm512_mask_fmsubadd_pd:
4205 ; X86: # %bb.0: # %entry
4206 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4207 ; X86-NEXT: kmovw %eax, %k1
4208 ; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4209 ; X86-NEXT: retl
4210 ;
4211 ; X64-LABEL: test_mm512_mask_fmsubadd_pd:
4212 ; X64: # %bb.0: # %entry
4213 ; X64-NEXT: kmovw %edi, %k1
4214 ; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4215 ; X64-NEXT: retq
4216 entry:
4217 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4218 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4219 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4220 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4221 %3 = bitcast i8 %__U to <8 x i1>
4222 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
4223 ret <8 x double> %4
4224 }
4226 define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4227 ; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
4228 ; X86: # %bb.0: # %entry
4229 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4230 ; X86-NEXT: kmovw %eax, %k1
4231 ; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4232 ; X86-NEXT: retl
4233 ;
4234 ; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
4235 ; X64: # %bb.0: # %entry
4236 ; X64-NEXT: kmovw %edi, %k1
4237 ; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4238 ; X64-NEXT: retq
4239 entry:
4240 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4241 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4242 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4243 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4244 %3 = bitcast i8 %__U to <8 x i1>
4245 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
4246 ret <8 x double> %4
4247 }
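; NOTE (editorial): the same coverage is repeated below for <16 x float>, using
; llvm.x86.avx512.vfmaddsub.ps.512 and a 16-bit mask.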
4249 define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4250 ; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
4251 ; CHECK: # %bb.0: # %entry
4252 ; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4253 ; CHECK-NEXT: ret{{[l|q]}}
4254 entry:
4255 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4256 ret <16 x float> %0
4257 }
4259 declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
4261 define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4262 ; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
4263 ; X86: # %bb.0: # %entry
4264 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4265 ; X86-NEXT: kmovw %eax, %k1
4266 ; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4267 ; X86-NEXT: retl
4268 ;
4269 ; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
4270 ; X64: # %bb.0: # %entry
4271 ; X64-NEXT: kmovw %edi, %k1
4272 ; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4273 ; X64-NEXT: retq
4274 entry:
4275 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4276 %1 = bitcast i16 %__U to <16 x i1>
4277 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4278 ret <16 x float> %2
4279 }
4281 define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4282 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4283 ; X86: # %bb.0: # %entry
4284 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4285 ; X86-NEXT: kmovw %eax, %k1
4286 ; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4287 ; X86-NEXT: vmovaps %zmm2, %zmm0
4288 ; X86-NEXT: retl
4289 ;
4290 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4291 ; X64: # %bb.0: # %entry
4292 ; X64-NEXT: kmovw %edi, %k1
4293 ; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4294 ; X64-NEXT: vmovaps %zmm2, %zmm0
4295 ; X64-NEXT: retq
4296 entry:
4297 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4298 %1 = bitcast i16 %__U to <16 x i1>
4299 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4300 ret <16 x float> %2
4301 }
4303 define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4304 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4305 ; X86: # %bb.0: # %entry
4306 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4307 ; X86-NEXT: kmovw %eax, %k1
4308 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4309 ; X86-NEXT: retl
4310 ;
4311 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4312 ; X64: # %bb.0: # %entry
4313 ; X64-NEXT: kmovw %edi, %k1
4314 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4315 ; X64-NEXT: retq
4316 entry:
4317 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4318 %1 = bitcast i16 %__U to <16 x i1>
4319 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
4320 ret <16 x float> %2
4321 }
4323 define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4324 ; X86-LABEL: test_mm512_fmsubadd_round_ps:
4325 ; X86: # %bb.0: # %entry
4326 ; X86-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm2, %zmm2
4327 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4328 ; X86-NEXT: retl
4329 ;
4330 ; X64-LABEL: test_mm512_fmsubadd_round_ps:
4331 ; X64: # %bb.0: # %entry
4332 ; X64-NEXT: vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
4333 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4334 ; X64-NEXT: retq
4335 entry:
4336 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4337 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4338 ret <16 x float> %0
4339 }
4341 define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4342 ; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
4343 ; X86: # %bb.0: # %entry
4344 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4345 ; X86-NEXT: kmovw %eax, %k1
4346 ; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4347 ; X86-NEXT: retl
4348 ;
4349 ; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
4350 ; X64: # %bb.0: # %entry
4351 ; X64-NEXT: kmovw %edi, %k1
4352 ; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4353 ; X64-NEXT: retq
4354 entry:
4355 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4356 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4357 %1 = bitcast i16 %__U to <16 x i1>
4358 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4359 ret <16 x float> %2
4360 }
4362 define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4363 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4364 ; X86: # %bb.0: # %entry
4365 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4366 ; X86-NEXT: kmovw %eax, %k1
4367 ; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4368 ; X86-NEXT: retl
4369 ;
4370 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4371 ; X64: # %bb.0: # %entry
4372 ; X64-NEXT: kmovw %edi, %k1
4373 ; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4374 ; X64-NEXT: retq
4375 entry:
4376 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4377 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4378 %1 = bitcast i16 %__U to <16 x i1>
4379 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
4380 ret <16 x float> %2
4381 }
4383 define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4384 ; CHECK-LABEL: test_mm512_fmaddsub_ps:
4385 ; CHECK: # %bb.0: # %entry
4386 ; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4387 ; CHECK-NEXT: ret{{[l|q]}}
4388 entry:
4389 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4390 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4391 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4392 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4393 ret <16 x float> %3
4394 }
4396 define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4397 ; X86-LABEL: test_mm512_mask_fmaddsub_ps:
4398 ; X86: # %bb.0: # %entry
4399 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4400 ; X86-NEXT: kmovw %eax, %k1
4401 ; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4402 ; X86-NEXT: retl
4403 ;
4404 ; X64-LABEL: test_mm512_mask_fmaddsub_ps:
4405 ; X64: # %bb.0: # %entry
4406 ; X64-NEXT: kmovw %edi, %k1
4407 ; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
4408 ; X64-NEXT: retq
4409 entry:
4410 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4411 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4412 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4413 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4414 %4 = bitcast i16 %__U to <16 x i1>
4415 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
4416 ret <16 x float> %5
4417 }
4419 define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4420 ; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
4421 ; X86: # %bb.0: # %entry
4422 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4423 ; X86-NEXT: kmovw %eax, %k1
4424 ; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4425 ; X86-NEXT: vmovaps %zmm2, %zmm0
4426 ; X86-NEXT: retl
4427 ;
4428 ; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
4429 ; X64: # %bb.0: # %entry
4430 ; X64-NEXT: kmovw %edi, %k1
4431 ; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
4432 ; X64-NEXT: vmovaps %zmm2, %zmm0
4433 ; X64-NEXT: retq
4434 entry:
4435 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4436 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4437 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4438 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4439 %4 = bitcast i16 %__U to <16 x i1>
4440 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
4441 ret <16 x float> %5
4442 }
4444 define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4445 ; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
4446 ; X86: # %bb.0: # %entry
4447 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4448 ; X86-NEXT: kmovw %eax, %k1
4449 ; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4450 ; X86-NEXT: retl
4451 ;
4452 ; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
4453 ; X64: # %bb.0: # %entry
4454 ; X64-NEXT: kmovw %edi, %k1
4455 ; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
4456 ; X64-NEXT: retq
4457 entry:
4458 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4459 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4460 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4461 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4462 %4 = bitcast i16 %__U to <16 x i1>
4463 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
4464 ret <16 x float> %5
4465 }
4467 define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4468 ; CHECK-LABEL: test_mm512_fmsubadd_ps:
4469 ; CHECK: # %bb.0: # %entry
4470 ; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4471 ; CHECK-NEXT: ret{{[l|q]}}
4472 entry:
4473 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4474 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4475 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4476 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4477 ret <16 x float> %2
4478 }
4480 define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4481 ; X86-LABEL: test_mm512_mask_fmsubadd_ps:
4482 ; X86: # %bb.0: # %entry
4483 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4484 ; X86-NEXT: kmovw %eax, %k1
4485 ; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4486 ; X86-NEXT: retl
4487 ;
4488 ; X64-LABEL: test_mm512_mask_fmsubadd_ps:
4489 ; X64: # %bb.0: # %entry
4490 ; X64-NEXT: kmovw %edi, %k1
4491 ; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
4492 ; X64-NEXT: retq
4493 entry:
4494 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4495 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4496 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4497 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4498 %3 = bitcast i16 %__U to <16 x i1>
4499 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
4500 ret <16 x float> %4
4501 }
4503 define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4504 ; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
4505 ; X86: # %bb.0: # %entry
4506 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4507 ; X86-NEXT: kmovw %eax, %k1
4508 ; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4509 ; X86-NEXT: retl
4510 ;
4511 ; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
4512 ; X64: # %bb.0: # %entry
4513 ; X64-NEXT: kmovw %edi, %k1
4514 ; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
4515 ; X64-NEXT: retq
4516 entry:
4517 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4518 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4519 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4520 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4521 %3 = bitcast i16 %__U to <16 x i1>
4522 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
4523 ret <16 x float> %4
4524 }
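; NOTE (editorial): the mask3 variants below merge the result into the third operand (zmm2)
; under {%k1}, hence the trailing vmovapd/vmovaps copying zmm2 to the zmm0 return register.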
4526 define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4527 ; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
4528 ; X86: # %bb.0: # %entry
4529 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4530 ; X86-NEXT: kmovw %eax, %k1
4531 ; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4532 ; X86-NEXT: vmovapd %zmm2, %zmm0
4533 ; X86-NEXT: retl
4534 ;
4535 ; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
4536 ; X64: # %bb.0: # %entry
4537 ; X64-NEXT: kmovw %edi, %k1
4538 ; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4539 ; X64-NEXT: vmovapd %zmm2, %zmm0
4540 ; X64-NEXT: retq
4541 entry:
4542 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4543 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4544 %1 = bitcast i8 %__U to <8 x i1>
4545 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4546 ret <8 x double> %2
4547 }
4549 define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4550 ; X86-LABEL: test_mm512_mask3_fmsub_pd:
4551 ; X86: # %bb.0: # %entry
4552 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4553 ; X86-NEXT: kmovw %eax, %k1
4554 ; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4555 ; X86-NEXT: vmovapd %zmm2, %zmm0
4556 ; X86-NEXT: retl
4557 ;
4558 ; X64-LABEL: test_mm512_mask3_fmsub_pd:
4559 ; X64: # %bb.0: # %entry
4560 ; X64-NEXT: kmovw %edi, %k1
4561 ; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4562 ; X64-NEXT: vmovapd %zmm2, %zmm0
4563 ; X64-NEXT: retq
4564 entry:
4565 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4566 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4567 %1 = bitcast i8 %__U to <8 x i1>
4568 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4569 ret <8 x double> %2
4570 }
4572 define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4573 ; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
4574 ; X86: # %bb.0: # %entry
4575 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4576 ; X86-NEXT: kmovw %eax, %k1
4577 ; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4578 ; X86-NEXT: vmovaps %zmm2, %zmm0
4579 ; X86-NEXT: retl
4580 ;
4581 ; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
4582 ; X64: # %bb.0: # %entry
4583 ; X64-NEXT: kmovw %edi, %k1
4584 ; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4585 ; X64-NEXT: vmovaps %zmm2, %zmm0
4586 ; X64-NEXT: retq
4587 entry:
4588 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4589 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4590 %1 = bitcast i16 %__U to <16 x i1>
4591 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4592 ret <16 x float> %2
4593 }
4595 define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4596 ; X86-LABEL: test_mm512_mask3_fmsub_ps:
4597 ; X86: # %bb.0: # %entry
4598 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4599 ; X86-NEXT: kmovw %eax, %k1
4600 ; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4601 ; X86-NEXT: vmovaps %zmm2, %zmm0
4602 ; X86-NEXT: retl
4603 ;
4604 ; X64-LABEL: test_mm512_mask3_fmsub_ps:
4605 ; X64: # %bb.0: # %entry
4606 ; X64-NEXT: kmovw %edi, %k1
4607 ; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
4608 ; X64-NEXT: vmovaps %zmm2, %zmm0
4609 ; X64-NEXT: retq
4610 entry:
4611 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4612 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4613 %1 = bitcast i16 %__U to <16 x i1>
4614 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4615 ret <16 x float> %2
4616 }
4618 define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4619 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4620 ; X86: # %bb.0: # %entry
4621 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4622 ; X86-NEXT: kmovw %eax, %k1
4623 ; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4624 ; X86-NEXT: vmovapd %zmm2, %zmm0
4625 ; X86-NEXT: retl
4626 ;
4627 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4628 ; X64: # %bb.0: # %entry
4629 ; X64-NEXT: kmovw %edi, %k1
4630 ; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4631 ; X64-NEXT: vmovapd %zmm2, %zmm0
4632 ; X64-NEXT: retq
4633 entry:
4634 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4635 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4636 %1 = bitcast i8 %__U to <8 x i1>
4637 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4638 ret <8 x double> %2
4639 }
4641 define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4642 ; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
4643 ; X86: # %bb.0: # %entry
4644 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4645 ; X86-NEXT: kmovw %eax, %k1
4646 ; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4647 ; X86-NEXT: vmovapd %zmm2, %zmm0
4648 ; X86-NEXT: retl
4649 ;
4650 ; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
4651 ; X64: # %bb.0: # %entry
4652 ; X64-NEXT: kmovw %edi, %k1
4653 ; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4654 ; X64-NEXT: vmovapd %zmm2, %zmm0
4655 ; X64-NEXT: retq
4656 entry:
4657 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4658 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4659 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4660 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4661 %3 = bitcast i8 %__U to <8 x i1>
4662 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
4663 ret <8 x double> %4
4664 }
4666 define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4667 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4668 ; X86: # %bb.0: # %entry
4669 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4670 ; X86-NEXT: kmovw %eax, %k1
4671 ; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4672 ; X86-NEXT: vmovaps %zmm2, %zmm0
4673 ; X86-NEXT: retl
4674 ;
4675 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4676 ; X64: # %bb.0: # %entry
4677 ; X64-NEXT: kmovw %edi, %k1
4678 ; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4679 ; X64-NEXT: vmovaps %zmm2, %zmm0
4680 ; X64-NEXT: retq
4681 entry:
4682 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4683 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4684 %1 = bitcast i16 %__U to <16 x i1>
4685 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4686 ret <16 x float> %2
4687 }
4689 define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4690 ; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
4691 ; X86: # %bb.0: # %entry
4692 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4693 ; X86-NEXT: kmovw %eax, %k1
4694 ; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4695 ; X86-NEXT: vmovaps %zmm2, %zmm0
4696 ; X86-NEXT: retl
4697 ;
4698 ; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
4699 ; X64: # %bb.0: # %entry
4700 ; X64-NEXT: kmovw %edi, %k1
4701 ; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
4702 ; X64-NEXT: vmovaps %zmm2, %zmm0
4703 ; X64-NEXT: retq
4704 entry:
4705 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4706 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4707 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4708 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4709 %3 = bitcast i16 %__U to <16 x i1>
4710 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
4711 ret <16 x float> %4
4712 }
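; NOTE (editorial): in the fnmadd/fnmsub tests the fsub-from--0.0 negation is applied to a
; multiplicand (__A or __B), and for fnmsub to the addend __C as well; both sign-flips are
; expected to fold into vfnmadd/vfnmsub.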
4714 define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4715 ; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
4716 ; X86: # %bb.0: # %entry
4717 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4718 ; X86-NEXT: kmovw %eax, %k1
4719 ; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4720 ; X86-NEXT: retl
4721 ;
4722 ; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
4723 ; X64: # %bb.0: # %entry
4724 ; X64-NEXT: kmovw %edi, %k1
4725 ; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4726 ; X64-NEXT: retq
4727 entry:
4728 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4729 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
4730 %1 = bitcast i8 %__U to <8 x i1>
4731 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4732 ret <8 x double> %2
4733 }
4735 define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4736 ; X86-LABEL: test_mm512_mask_fnmadd_pd:
4737 ; X86: # %bb.0: # %entry
4738 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4739 ; X86-NEXT: kmovw %eax, %k1
4740 ; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4741 ; X86-NEXT: retl
4742 ;
4743 ; X64-LABEL: test_mm512_mask_fnmadd_pd:
4744 ; X64: # %bb.0: # %entry
4745 ; X64-NEXT: kmovw %edi, %k1
4746 ; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4747 ; X64-NEXT: retq
4748 entry:
4749 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4750 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
4751 %1 = bitcast i8 %__U to <8 x i1>
4752 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4753 ret <8 x double> %2
4754 }
4756 define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4757 ; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
4758 ; X86: # %bb.0: # %entry
4759 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4760 ; X86-NEXT: kmovw %eax, %k1
4761 ; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4762 ; X86-NEXT: retl
4763 ;
4764 ; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
4765 ; X64: # %bb.0: # %entry
4766 ; X64-NEXT: kmovw %edi, %k1
4767 ; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4768 ; X64-NEXT: retq
4769 entry:
4770 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4771 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
4772 %1 = bitcast i16 %__U to <16 x i1>
4773 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4774 ret <16 x float> %2
4775 }
4777 define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4778 ; X86-LABEL: test_mm512_mask_fnmadd_ps:
4779 ; X86: # %bb.0: # %entry
4780 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4781 ; X86-NEXT: kmovw %eax, %k1
4782 ; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4783 ; X86-NEXT: retl
4784 ;
4785 ; X64-LABEL: test_mm512_mask_fnmadd_ps:
4786 ; X64: # %bb.0: # %entry
4787 ; X64-NEXT: kmovw %edi, %k1
4788 ; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
4789 ; X64-NEXT: retq
4790 entry:
4791 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4792 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
4793 %1 = bitcast i16 %__U to <16 x i1>
4794 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4795 ret <16 x float> %2
4796 }
4798 define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4799 ; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
4800 ; X86: # %bb.0: # %entry
4801 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4802 ; X86-NEXT: kmovw %eax, %k1
4803 ; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4804 ; X86-NEXT: retl
4805 ;
4806 ; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
4807 ; X64: # %bb.0: # %entry
4808 ; X64-NEXT: kmovw %edi, %k1
4809 ; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4810 ; X64-NEXT: retq
4811 entry:
4812 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4813 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4814 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4815 %1 = bitcast i8 %__U to <8 x i1>
4816 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4817 ret <8 x double> %2
4818 }
4820 define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4821 ; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
4822 ; X86: # %bb.0: # %entry
4823 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4824 ; X86-NEXT: kmovw %eax, %k1
4825 ; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4826 ; X86-NEXT: vmovapd %zmm2, %zmm0
4827 ; X86-NEXT: retl
4828 ;
4829 ; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
4830 ; X64: # %bb.0: # %entry
4831 ; X64-NEXT: kmovw %edi, %k1
4832 ; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4833 ; X64-NEXT: vmovapd %zmm2, %zmm0
4834 ; X64-NEXT: retq
4835 entry:
4836 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4837 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4838 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4839 %1 = bitcast i8 %__U to <8 x i1>
4840 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4841 ret <8 x double> %2
4842 }
4844 define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4845 ; X86-LABEL: test_mm512_mask_fnmsub_pd:
4846 ; X86: # %bb.0: # %entry
4847 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4848 ; X86-NEXT: kmovw %eax, %k1
4849 ; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4850 ; X86-NEXT: retl
4851 ;
4852 ; X64-LABEL: test_mm512_mask_fnmsub_pd:
4853 ; X64: # %bb.0: # %entry
4854 ; X64-NEXT: kmovw %edi, %k1
4855 ; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4856 ; X64-NEXT: retq
4857 entry:
4858 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4859 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4860 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4861 %1 = bitcast i8 %__U to <8 x i1>
4862 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4863 ret <8 x double> %2
4864 }
4866 define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4867 ; X86-LABEL: test_mm512_mask3_fnmsub_pd:
4868 ; X86: # %bb.0: # %entry
4869 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4870 ; X86-NEXT: kmovw %eax, %k1
4871 ; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4872 ; X86-NEXT: vmovapd %zmm2, %zmm0
4873 ; X86-NEXT: retl
4874 ;
4875 ; X64-LABEL: test_mm512_mask3_fnmsub_pd:
4876 ; X64: # %bb.0: # %entry
4877 ; X64-NEXT: kmovw %edi, %k1
4878 ; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4879 ; X64-NEXT: vmovapd %zmm2, %zmm0
4880 ; X64-NEXT: retq
4881 entry:
4882 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4883 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4884 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4885 %1 = bitcast i8 %__U to <8 x i1>
4886 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4887 ret <8 x double> %2
4888 }
4890 define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4891 ; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
4892 ; X86: # %bb.0: # %entry
4893 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4894 ; X86-NEXT: kmovw %eax, %k1
4895 ; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4896 ; X86-NEXT: retl
4897 ;
4898 ; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
4899 ; X64: # %bb.0: # %entry
4900 ; X64-NEXT: kmovw %edi, %k1
4901 ; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4902 ; X64-NEXT: retq
4903 entry:
4904 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4905 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4906 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4907 %1 = bitcast i16 %__U to <16 x i1>
4908 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4909 ret <16 x float> %2
4910 }
4912 define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4913 ; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
4914 ; X86: # %bb.0: # %entry
4915 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4916 ; X86-NEXT: kmovw %eax, %k1
4917 ; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4918 ; X86-NEXT: vmovaps %zmm2, %zmm0
4919 ; X86-NEXT: retl
4920 ;
4921 ; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
4922 ; X64: # %bb.0: # %entry
4923 ; X64-NEXT: kmovw %edi, %k1
4924 ; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4925 ; X64-NEXT: vmovaps %zmm2, %zmm0
4926 ; X64-NEXT: retq
4927 entry:
4928 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4929 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4930 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4931 %1 = bitcast i16 %__U to <16 x i1>
4932 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4933 ret <16 x float> %2
4934 }
4936 define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4937 ; X86-LABEL: test_mm512_mask_fnmsub_ps:
4938 ; X86: # %bb.0: # %entry
4939 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4940 ; X86-NEXT: kmovw %eax, %k1
4941 ; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4942 ; X86-NEXT: retl
4943 ;
4944 ; X64-LABEL: test_mm512_mask_fnmsub_ps:
4945 ; X64: # %bb.0: # %entry
4946 ; X64-NEXT: kmovw %edi, %k1
4947 ; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
4948 ; X64-NEXT: retq
4949 entry:
4950 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4951 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4952 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4953 %1 = bitcast i16 %__U to <16 x i1>
4954 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4955 ret <16 x float> %2
4956 }
4958 define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4959 ; X86-LABEL: test_mm512_mask3_fnmsub_ps:
4960 ; X86: # %bb.0: # %entry
4961 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4962 ; X86-NEXT: kmovw %eax, %k1
4963 ; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4964 ; X86-NEXT: vmovaps %zmm2, %zmm0
4965 ; X86-NEXT: retl
4966 ;
4967 ; X64-LABEL: test_mm512_mask3_fnmsub_ps:
4968 ; X64: # %bb.0: # %entry
4969 ; X64-NEXT: kmovw %edi, %k1
4970 ; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
4971 ; X64-NEXT: vmovaps %zmm2, %zmm0
4972 ; X64-NEXT: retq
4973 entry:
4974 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4975 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4976 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4977 %1 = bitcast i16 %__U to <16 x i1>
4978 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4979 ret <16 x float> %2
4980 }
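; NOTE (editorial): the scalar ss tests below operate on element 0 only; the non-rounded forms
; reduce the mask with an `and i8 %__U, 1` plus icmp/select, while the rounded forms bitcast the
; mask to <8 x i1> and extract bit 0.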
4982 define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
4983 ; X86-LABEL: test_mm_mask_fmadd_ss:
4984 ; X86: # %bb.0: # %entry
4985 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
4986 ; X86-NEXT: kmovw %eax, %k1
4987 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
4988 ; X86-NEXT: retl
4989 ;
4990 ; X64-LABEL: test_mm_mask_fmadd_ss:
4991 ; X64: # %bb.0: # %entry
4992 ; X64-NEXT: kmovw %edi, %k1
4993 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
4994 ; X64-NEXT: retq
4995 entry:
4996 %0 = extractelement <4 x float> %__W, i64 0
4997 %1 = extractelement <4 x float> %__A, i64 0
4998 %2 = extractelement <4 x float> %__B, i64 0
4999 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5000 %4 = and i8 %__U, 1
5001 %tobool.i = icmp eq i8 %4, 0
5002 %vecext1.i = extractelement <4 x float> %__W, i32 0
5003 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5004 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5005 ret <4 x float> %vecins.i
5006 }
5008 define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5009 ; X86-LABEL: test_mm_mask_fmadd_round_ss:
5010 ; X86: # %bb.0: # %entry
5011 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5012 ; X86-NEXT: kmovw %eax, %k1
5013 ; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5014 ; X86-NEXT: retl
5015 ;
5016 ; X64-LABEL: test_mm_mask_fmadd_round_ss:
5017 ; X64: # %bb.0: # %entry
5018 ; X64-NEXT: kmovw %edi, %k1
5019 ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5020 ; X64-NEXT: retq
5021 entry:
5022 %0 = extractelement <4 x float> %__W, i64 0
5023 %1 = extractelement <4 x float> %__A, i64 0
5024 %2 = extractelement <4 x float> %__B, i64 0
5025 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5026 %4 = bitcast i8 %__U to <8 x i1>
5027 %5 = extractelement <8 x i1> %4, i64 0
5028 %6 = select i1 %5, float %3, float %0
5029 %7 = insertelement <4 x float> %__W, float %6, i64 0
5030 ret <4 x float> %7
5031 }
5033 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1
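; NOTE (editorial): llvm.x86.avx512.vfmadd.f32 is the scalar fma intrinsic carrying an explicit
; rounding argument; the *_round_ss tests route through it instead of the generic llvm.fma.f32.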
5035 define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5036 ; X86-LABEL: test_mm_maskz_fmadd_ss:
5037 ; X86: # %bb.0: # %entry
5038 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5039 ; X86-NEXT: kmovw %eax, %k1
5040 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5041 ; X86-NEXT: retl
5042 ;
5043 ; X64-LABEL: test_mm_maskz_fmadd_ss:
5044 ; X64: # %bb.0: # %entry
5045 ; X64-NEXT: kmovw %edi, %k1
5046 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5047 ; X64-NEXT: retq
5048 entry:
5049 %0 = extractelement <4 x float> %__A, i64 0
5050 %1 = extractelement <4 x float> %__B, i64 0
5051 %2 = extractelement <4 x float> %__C, i64 0
5052 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5053 %4 = and i8 %__U, 1
5054 %tobool.i = icmp eq i8 %4, 0
5055 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5056 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5057 ret <4 x float> %vecins.i
5058 }
5060 define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5061 ; X86-LABEL: test_mm_maskz_fmadd_round_ss:
5062 ; X86: # %bb.0: # %entry
5063 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5064 ; X86-NEXT: kmovw %eax, %k1
5065 ; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5066 ; X86-NEXT: retl
5067 ;
5068 ; X64-LABEL: test_mm_maskz_fmadd_round_ss:
5069 ; X64: # %bb.0: # %entry
5070 ; X64-NEXT: kmovw %edi, %k1
5071 ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5072 ; X64-NEXT: retq
5073 entry:
5074 %0 = extractelement <4 x float> %__A, i64 0
5075 %1 = extractelement <4 x float> %__B, i64 0
5076 %2 = extractelement <4 x float> %__C, i64 0
5077 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5078 %4 = bitcast i8 %__U to <8 x i1>
5079 %5 = extractelement <8 x i1> %4, i64 0
5080 %6 = select i1 %5, float %3, float 0.000000e+00
5081 %7 = insertelement <4 x float> %__A, float %6, i64 0
5082 ret <4 x float> %7
5083 }
5085 define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5086 ; X86-LABEL: test_mm_mask3_fmadd_ss:
5087 ; X86: # %bb.0: # %entry
5088 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5089 ; X86-NEXT: kmovw %eax, %k1
5090 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5091 ; X86-NEXT: vmovaps %xmm2, %xmm0
5092 ; X86-NEXT: retl
5093 ;
5094 ; X64-LABEL: test_mm_mask3_fmadd_ss:
5095 ; X64: # %bb.0: # %entry
5096 ; X64-NEXT: kmovw %edi, %k1
5097 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5098 ; X64-NEXT: vmovaps %xmm2, %xmm0
5099 ; X64-NEXT: retq
5100 entry:
5101 %0 = extractelement <4 x float> %__W, i64 0
5102 %1 = extractelement <4 x float> %__X, i64 0
5103 %2 = extractelement <4 x float> %__Y, i64 0
5104 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5105 %4 = and i8 %__U, 1
5106 %tobool.i = icmp eq i8 %4, 0
5107 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5108 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5109 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5110 ret <4 x float> %vecins.i
5111 }
5113 define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5114 ; X86-LABEL: test_mm_mask3_fmadd_round_ss:
5115 ; X86: # %bb.0: # %entry
5116 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5117 ; X86-NEXT: kmovw %eax, %k1
5118 ; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5119 ; X86-NEXT: vmovaps %xmm2, %xmm0
5120 ; X86-NEXT: retl
5121 ;
5122 ; X64-LABEL: test_mm_mask3_fmadd_round_ss:
5123 ; X64: # %bb.0: # %entry
5124 ; X64-NEXT: kmovw %edi, %k1
5125 ; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5126 ; X64-NEXT: vmovaps %xmm2, %xmm0
5127 ; X64-NEXT: retq
5128 entry:
5129 %0 = extractelement <4 x float> %__W, i64 0
5130 %1 = extractelement <4 x float> %__X, i64 0
5131 %2 = extractelement <4 x float> %__Y, i64 0
5132 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5133 %4 = bitcast i8 %__U to <8 x i1>
5134 %5 = extractelement <8 x i1> %4, i64 0
5135 %6 = select i1 %5, float %3, float %2
5136 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5137 ret <4 x float> %7
5138 }
5140 define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5141 ; X86-LABEL: test_mm_mask_fmsub_ss:
5142 ; X86: # %bb.0: # %entry
5143 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5144 ; X86-NEXT: kmovw %eax, %k1
5145 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5146 ; X86-NEXT: retl
5147 ;
5148 ; X64-LABEL: test_mm_mask_fmsub_ss:
5149 ; X64: # %bb.0: # %entry
5150 ; X64-NEXT: kmovw %edi, %k1
5151 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5152 ; X64-NEXT: retq
5153 entry:
5154 %0 = extractelement <4 x float> %__W, i64 0
5155 %1 = extractelement <4 x float> %__A, i64 0
5156 %.rhs.i = extractelement <4 x float> %__B, i64 0
5157 %2 = fsub float -0.000000e+00, %.rhs.i
5158 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5159 %4 = and i8 %__U, 1
5160 %tobool.i = icmp eq i8 %4, 0
5161 %vecext1.i = extractelement <4 x float> %__W, i32 0
5162 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5163 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5164 ret <4 x float> %vecins.i
5165 }
5167 define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5168 ; X86-LABEL: test_mm_mask_fmsub_round_ss:
5169 ; X86: # %bb.0: # %entry
5170 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5171 ; X86-NEXT: kmovw %eax, %k1
5172 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5173 ; X86-NEXT: retl
5174 ;
5175 ; X64-LABEL: test_mm_mask_fmsub_round_ss:
5176 ; X64: # %bb.0: # %entry
5177 ; X64-NEXT: kmovw %edi, %k1
5178 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5179 ; X64-NEXT: retq
5180 entry:
5181 %0 = extractelement <4 x float> %__W, i64 0
5182 %1 = extractelement <4 x float> %__A, i64 0
5183 %.rhs = extractelement <4 x float> %__B, i64 0
5184 %2 = fsub float -0.000000e+00, %.rhs
5185 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5186 %4 = bitcast i8 %__U to <8 x i1>
5187 %5 = extractelement <8 x i1> %4, i64 0
5188 %6 = select i1 %5, float %3, float %0
5189 %7 = insertelement <4 x float> %__W, float %6, i64 0
5190 ret <4 x float> %7
5191 }
5193 define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5194 ; X86-LABEL: test_mm_maskz_fmsub_ss:
5195 ; X86: # %bb.0: # %entry
5196 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5197 ; X86-NEXT: kmovw %eax, %k1
5198 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5199 ; X86-NEXT: retl
5200 ;
5201 ; X64-LABEL: test_mm_maskz_fmsub_ss:
5202 ; X64: # %bb.0: # %entry
5203 ; X64-NEXT: kmovw %edi, %k1
5204 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5205 ; X64-NEXT: retq
5206 entry:
5207 %0 = extractelement <4 x float> %__A, i64 0
5208 %1 = extractelement <4 x float> %__B, i64 0
5209 %.rhs.i = extractelement <4 x float> %__C, i64 0
5210 %2 = fsub float -0.000000e+00, %.rhs.i
5211 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5212 %4 = and i8 %__U, 1
5213 %tobool.i = icmp eq i8 %4, 0
5214 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5215 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5216 ret <4 x float> %vecins.i
5217 }
5219 define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5220 ; X86-LABEL: test_mm_maskz_fmsub_round_ss:
5221 ; X86: # %bb.0: # %entry
5222 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5223 ; X86-NEXT: kmovw %eax, %k1
5224 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5227 ; X64-LABEL: test_mm_maskz_fmsub_round_ss:
5228 ; X64: # %bb.0: # %entry
5229 ; X64-NEXT: kmovw %edi, %k1
5230 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5233 %0 = extractelement <4 x float> %__A, i64 0
5234 %1 = extractelement <4 x float> %__B, i64 0
5235 %.rhs = extractelement <4 x float> %__C, i64 0
5236 %2 = fsub float -0.000000e+00, %.rhs
5237 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5238 %4 = bitcast i8 %__U to <8 x i1>
5239 %5 = extractelement <8 x i1> %4, i64 0
5240 %6 = select i1 %5, float %3, float 0.000000e+00
5241 %7 = insertelement <4 x float> %__A, float %6, i64 0
5242 ret <4 x float> %7
5245 define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5246 ; X86-LABEL: test_mm_mask3_fmsub_ss:
5247 ; X86: # %bb.0: # %entry
5248 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5249 ; X86-NEXT: kmovw %eax, %k1
5250 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5251 ; X86-NEXT: vmovaps %xmm2, %xmm0
5254 ; X64-LABEL: test_mm_mask3_fmsub_ss:
5255 ; X64: # %bb.0: # %entry
5256 ; X64-NEXT: kmovw %edi, %k1
5257 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5258 ; X64-NEXT: vmovaps %xmm2, %xmm0
5261 %0 = extractelement <4 x float> %__W, i64 0
5262 %1 = extractelement <4 x float> %__X, i64 0
5263 %.rhs.i = extractelement <4 x float> %__Y, i64 0
5264 %2 = fsub float -0.000000e+00, %.rhs.i
5265 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5266 %4 = and i8 %__U, 1
5267 %tobool.i = icmp eq i8 %4, 0
5268 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5269 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5270 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5271 ret <4 x float> %vecins.i
5274 define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5275 ; X86-LABEL: test_mm_mask3_fmsub_round_ss:
5276 ; X86: # %bb.0: # %entry
5277 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5278 ; X86-NEXT: kmovw %eax, %k1
5279 ; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5280 ; X86-NEXT: vmovaps %xmm2, %xmm0
5283 ; X64-LABEL: test_mm_mask3_fmsub_round_ss:
5284 ; X64: # %bb.0: # %entry
5285 ; X64-NEXT: kmovw %edi, %k1
5286 ; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5287 ; X64-NEXT: vmovaps %xmm2, %xmm0
5290 %0 = extractelement <4 x float> %__W, i64 0
5291 %1 = extractelement <4 x float> %__X, i64 0
5292 %.rhs = extractelement <4 x float> %__Y, i64 0
5293 %2 = fsub float -0.000000e+00, %.rhs
5294 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5295 %4 = bitcast i8 %__U to <8 x i1>
5296 %5 = extractelement <8 x i1> %4, i64 0
5297 %6 = select i1 %5, float %3, float %.rhs
5298 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5299 ret <4 x float> %7
5302 define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5303 ; X86-LABEL: test_mm_mask_fnmadd_ss:
5304 ; X86: # %bb.0: # %entry
5305 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5306 ; X86-NEXT: kmovw %eax, %k1
5307 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5310 ; X64-LABEL: test_mm_mask_fnmadd_ss:
5311 ; X64: # %bb.0: # %entry
5312 ; X64-NEXT: kmovw %edi, %k1
5313 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5316 %0 = extractelement <4 x float> %__W, i64 0
5317 %.rhs.i = extractelement <4 x float> %__A, i64 0
5318 %1 = fsub float -0.000000e+00, %.rhs.i
5319 %2 = extractelement <4 x float> %__B, i64 0
5320 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5321 %4 = and i8 %__U, 1
5322 %tobool.i = icmp eq i8 %4, 0
5323 %vecext1.i = extractelement <4 x float> %__W, i32 0
5324 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5325 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5326 ret <4 x float> %vecins.i
5329 define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5330 ; X86-LABEL: test_mm_mask_fnmadd_round_ss:
5331 ; X86: # %bb.0: # %entry
5332 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5333 ; X86-NEXT: kmovw %eax, %k1
5334 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5337 ; X64-LABEL: test_mm_mask_fnmadd_round_ss:
5338 ; X64: # %bb.0: # %entry
5339 ; X64-NEXT: kmovw %edi, %k1
5340 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5343 %0 = extractelement <4 x float> %__W, i64 0
5344 %.rhs = extractelement <4 x float> %__A, i64 0
5345 %1 = fsub float -0.000000e+00, %.rhs
5346 %2 = extractelement <4 x float> %__B, i64 0
5347 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5348 %4 = bitcast i8 %__U to <8 x i1>
5349 %5 = extractelement <8 x i1> %4, i64 0
5350 %6 = select i1 %5, float %3, float %0
5351 %7 = insertelement <4 x float> %__W, float %6, i64 0
5352 ret <4 x float> %7
5355 define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5356 ; X86-LABEL: test_mm_maskz_fnmadd_ss:
5357 ; X86: # %bb.0: # %entry
5358 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5359 ; X86-NEXT: kmovw %eax, %k1
5360 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
5363 ; X64-LABEL: test_mm_maskz_fnmadd_ss:
5364 ; X64: # %bb.0: # %entry
5365 ; X64-NEXT: kmovw %edi, %k1
5366 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
5369 %0 = extractelement <4 x float> %__A, i64 0
5370 %.rhs.i = extractelement <4 x float> %__B, i64 0
5371 %1 = fsub float -0.000000e+00, %.rhs.i
5372 %2 = extractelement <4 x float> %__C, i64 0
5373 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5374 %4 = and i8 %__U, 1
5375 %tobool.i = icmp eq i8 %4, 0
5376 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5377 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5378 ret <4 x float> %vecins.i
5381 define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5382 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
5383 ; X86: # %bb.0: # %entry
5384 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5385 ; X86-NEXT: kmovw %eax, %k1
5386 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5389 ; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
5390 ; X64: # %bb.0: # %entry
5391 ; X64-NEXT: kmovw %edi, %k1
5392 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5395 %0 = extractelement <4 x float> %__A, i64 0
5396 %.rhs = extractelement <4 x float> %__B, i64 0
5397 %1 = fsub float -0.000000e+00, %.rhs
5398 %2 = extractelement <4 x float> %__C, i64 0
5399 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5400 %4 = bitcast i8 %__U to <8 x i1>
5401 %5 = extractelement <8 x i1> %4, i64 0
5402 %6 = select i1 %5, float %3, float 0.000000e+00
5403 %7 = insertelement <4 x float> %__A, float %6, i64 0
5404 ret <4 x float> %7
5407 define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5408 ; X86-LABEL: test_mm_mask3_fnmadd_ss:
5409 ; X86: # %bb.0: # %entry
5410 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5411 ; X86-NEXT: kmovw %eax, %k1
5412 ; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
5413 ; X86-NEXT: vmovaps %xmm2, %xmm0
5416 ; X64-LABEL: test_mm_mask3_fnmadd_ss:
5417 ; X64: # %bb.0: # %entry
5418 ; X64-NEXT: kmovw %edi, %k1
5419 ; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
5420 ; X64-NEXT: vmovaps %xmm2, %xmm0
5423 %0 = extractelement <4 x float> %__W, i64 0
5424 %.rhs.i = extractelement <4 x float> %__X, i64 0
5425 %1 = fsub float -0.000000e+00, %.rhs.i
5426 %2 = extractelement <4 x float> %__Y, i64 0
5427 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5428 %4 = and i8 %__U, 1
5429 %tobool.i = icmp eq i8 %4, 0
5430 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5431 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5432 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5433 ret <4 x float> %vecins.i
5436 define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5437 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
5438 ; X86: # %bb.0: # %entry
5439 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5440 ; X86-NEXT: kmovw %eax, %k1
5441 ; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5442 ; X86-NEXT: vmovaps %xmm2, %xmm0
5445 ; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
5446 ; X64: # %bb.0: # %entry
5447 ; X64-NEXT: kmovw %edi, %k1
5448 ; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5449 ; X64-NEXT: vmovaps %xmm2, %xmm0
5452 %0 = extractelement <4 x float> %__W, i64 0
5453 %.rhs = extractelement <4 x float> %__X, i64 0
5454 %1 = fsub float -0.000000e+00, %.rhs
5455 %2 = extractelement <4 x float> %__Y, i64 0
5456 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5457 %4 = bitcast i8 %__U to <8 x i1>
5458 %5 = extractelement <8 x i1> %4, i64 0
5459 %6 = select i1 %5, float %3, float %2
5460 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5461 ret <4 x float> %7
5464 define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5465 ; X86-LABEL: test_mm_mask_fnmsub_ss:
5466 ; X86: # %bb.0: # %entry
5467 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5468 ; X86-NEXT: kmovw %eax, %k1
5469 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
5472 ; X64-LABEL: test_mm_mask_fnmsub_ss:
5473 ; X64: # %bb.0: # %entry
5474 ; X64-NEXT: kmovw %edi, %k1
5475 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
5478 %0 = extractelement <4 x float> %__W, i64 0
5479 %.rhs.i = extractelement <4 x float> %__A, i64 0
5480 %1 = fsub float -0.000000e+00, %.rhs.i
5481 %.rhs7.i = extractelement <4 x float> %__B, i64 0
5482 %2 = fsub float -0.000000e+00, %.rhs7.i
5483 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5484 %4 = and i8 %__U, 1
5485 %tobool.i = icmp eq i8 %4, 0
5486 %vecext2.i = extractelement <4 x float> %__W, i32 0
5487 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5488 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5489 ret <4 x float> %vecins.i
5492 define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5493 ; X86-LABEL: test_mm_mask_fnmsub_round_ss:
5494 ; X86: # %bb.0: # %entry
5495 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5496 ; X86-NEXT: kmovw %eax, %k1
5497 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5500 ; X64-LABEL: test_mm_mask_fnmsub_round_ss:
5501 ; X64: # %bb.0: # %entry
5502 ; X64-NEXT: kmovw %edi, %k1
5503 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5506 %0 = extractelement <4 x float> %__W, i64 0
5507 %.rhs = extractelement <4 x float> %__A, i64 0
5508 %1 = fsub float -0.000000e+00, %.rhs
5509 %.rhs2 = extractelement <4 x float> %__B, i64 0
5510 %2 = fsub float -0.000000e+00, %.rhs2
5511 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5512 %4 = bitcast i8 %__U to <8 x i1>
5513 %5 = extractelement <8 x i1> %4, i64 0
5514 %6 = select i1 %5, float %3, float %0
5515 %7 = insertelement <4 x float> %__W, float %6, i64 0
5516 ret <4 x float> %7
5519 define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5520 ; X86-LABEL: test_mm_maskz_fnmsub_ss:
5521 ; X86: # %bb.0: # %entry
5522 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5523 ; X86-NEXT: kmovw %eax, %k1
5524 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
5527 ; X64-LABEL: test_mm_maskz_fnmsub_ss:
5528 ; X64: # %bb.0: # %entry
5529 ; X64-NEXT: kmovw %edi, %k1
5530 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
5533 %0 = extractelement <4 x float> %__A, i64 0
5534 %.rhs.i = extractelement <4 x float> %__B, i64 0
5535 %1 = fsub float -0.000000e+00, %.rhs.i
5536 %.rhs5.i = extractelement <4 x float> %__C, i64 0
5537 %2 = fsub float -0.000000e+00, %.rhs5.i
5538 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5539 %4 = and i8 %__U, 1
5540 %tobool.i = icmp eq i8 %4, 0
5541 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5542 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5543 ret <4 x float> %vecins.i
5546 define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5547 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
5548 ; X86: # %bb.0: # %entry
5549 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5550 ; X86-NEXT: kmovw %eax, %k1
5551 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5554 ; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
5555 ; X64: # %bb.0: # %entry
5556 ; X64-NEXT: kmovw %edi, %k1
5557 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5560 %0 = extractelement <4 x float> %__A, i64 0
5561 %.rhs = extractelement <4 x float> %__B, i64 0
5562 %1 = fsub float -0.000000e+00, %.rhs
5563 %.rhs2 = extractelement <4 x float> %__C, i64 0
5564 %2 = fsub float -0.000000e+00, %.rhs2
5565 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5566 %4 = bitcast i8 %__U to <8 x i1>
5567 %5 = extractelement <8 x i1> %4, i64 0
5568 %6 = select i1 %5, float %3, float 0.000000e+00
5569 %7 = insertelement <4 x float> %__A, float %6, i64 0
5570 ret <4 x float> %7
5573 define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5574 ; X86-LABEL: test_mm_mask3_fnmsub_ss:
5575 ; X86: # %bb.0: # %entry
5576 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5577 ; X86-NEXT: kmovw %eax, %k1
5578 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5579 ; X86-NEXT: vmovaps %xmm2, %xmm0
5582 ; X64-LABEL: test_mm_mask3_fnmsub_ss:
5583 ; X64: # %bb.0: # %entry
5584 ; X64-NEXT: kmovw %edi, %k1
5585 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
5586 ; X64-NEXT: vmovaps %xmm2, %xmm0
5589 %0 = extractelement <4 x float> %__W, i64 0
5590 %.rhs.i = extractelement <4 x float> %__X, i64 0
5591 %1 = fsub float -0.000000e+00, %.rhs.i
5592 %.rhs7.i = extractelement <4 x float> %__Y, i64 0
5593 %2 = fsub float -0.000000e+00, %.rhs7.i
5594 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5595 %4 = and i8 %__U, 1
5596 %tobool.i = icmp eq i8 %4, 0
5597 %vecext2.i = extractelement <4 x float> %__Y, i32 0
5598 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5599 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5600 ret <4 x float> %vecins.i
5603 define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5604 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
5605 ; X86: # %bb.0: # %entry
5606 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5607 ; X86-NEXT: kmovw %eax, %k1
5608 ; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5609 ; X86-NEXT: vmovaps %xmm2, %xmm0
5612 ; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
5613 ; X64: # %bb.0: # %entry
5614 ; X64-NEXT: kmovw %edi, %k1
5615 ; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5616 ; X64-NEXT: vmovaps %xmm2, %xmm0
5619 %0 = extractelement <4 x float> %__W, i64 0
5620 %.rhs = extractelement <4 x float> %__X, i64 0
5621 %1 = fsub float -0.000000e+00, %.rhs
5622 %.rhs1 = extractelement <4 x float> %__Y, i64 0
5623 %2 = fsub float -0.000000e+00, %.rhs1
5624 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5625 %4 = bitcast i8 %__U to <8 x i1>
5626 %5 = extractelement <8 x i1> %4, i64 0
5627 %6 = select i1 %5, float %3, float %.rhs1
5628 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5629 ret <4 x float> %7
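; NOTE: The _sd tests below repeat the _ss patterns at double precision,
; switching to @llvm.fma.f64 / @llvm.x86.avx512.vfmadd.f64 and to vmovapd
; (rather than vmovaps) for the mask3 result move.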
5632 define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5633 ; X86-LABEL: test_mm_mask_fmadd_sd:
5634 ; X86: # %bb.0: # %entry
5635 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5636 ; X86-NEXT: kmovw %eax, %k1
5637 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
5640 ; X64-LABEL: test_mm_mask_fmadd_sd:
5641 ; X64: # %bb.0: # %entry
5642 ; X64-NEXT: kmovw %edi, %k1
5643 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
5646 %0 = extractelement <2 x double> %__W, i64 0
5647 %1 = extractelement <2 x double> %__A, i64 0
5648 %2 = extractelement <2 x double> %__B, i64 0
5649 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5650 %4 = and i8 %__U, 1
5651 %tobool.i = icmp eq i8 %4, 0
5652 %vecext1.i = extractelement <2 x double> %__W, i32 0
5653 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5654 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5655 ret <2 x double> %vecins.i
5658 define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5659 ; X86-LABEL: test_mm_mask_fmadd_round_sd:
5660 ; X86: # %bb.0: # %entry
5661 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5662 ; X86-NEXT: kmovw %eax, %k1
5663 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5666 ; X64-LABEL: test_mm_mask_fmadd_round_sd:
5667 ; X64: # %bb.0: # %entry
5668 ; X64-NEXT: kmovw %edi, %k1
5669 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5672 %0 = extractelement <2 x double> %__W, i64 0
5673 %1 = extractelement <2 x double> %__A, i64 0
5674 %2 = extractelement <2 x double> %__B, i64 0
5675 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5676 %4 = bitcast i8 %__U to <8 x i1>
5677 %5 = extractelement <8 x i1> %4, i64 0
5678 %6 = select i1 %5, double %3, double %0
5679 %7 = insertelement <2 x double> %__W, double %6, i64 0
5680 ret <2 x double> %7
5683 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1
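; NOTE: The i32 8 rounding argument in these calls is
; _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, which is what the {rn-sae}
; operand in the checked assembly encodes.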
5685 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5686 ; X86-LABEL: test_mm_maskz_fmadd_sd:
5687 ; X86: # %bb.0: # %entry
5688 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5689 ; X86-NEXT: kmovw %eax, %k1
5690 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5693 ; X64-LABEL: test_mm_maskz_fmadd_sd:
5694 ; X64: # %bb.0: # %entry
5695 ; X64-NEXT: kmovw %edi, %k1
5696 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
5699 %0 = extractelement <2 x double> %__A, i64 0
5700 %1 = extractelement <2 x double> %__B, i64 0
5701 %2 = extractelement <2 x double> %__C, i64 0
5702 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5703 %4 = and i8 %__U, 1
5704 %tobool.i = icmp eq i8 %4, 0
5705 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5706 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5707 ret <2 x double> %vecins.i
5710 define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5711 ; X86-LABEL: test_mm_maskz_fmadd_round_sd:
5712 ; X86: # %bb.0: # %entry
5713 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5714 ; X86-NEXT: kmovw %eax, %k1
5715 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5718 ; X64-LABEL: test_mm_maskz_fmadd_round_sd:
5719 ; X64: # %bb.0: # %entry
5720 ; X64-NEXT: kmovw %edi, %k1
5721 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5724 %0 = extractelement <2 x double> %__A, i64 0
5725 %1 = extractelement <2 x double> %__B, i64 0
5726 %2 = extractelement <2 x double> %__C, i64 0
5727 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5728 %4 = bitcast i8 %__U to <8 x i1>
5729 %5 = extractelement <8 x i1> %4, i64 0
5730 %6 = select i1 %5, double %3, double 0.000000e+00
5731 %7 = insertelement <2 x double> %__A, double %6, i64 0
5732 ret <2 x double> %7
5735 define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5736 ; X86-LABEL: test_mm_mask3_fmadd_sd:
5737 ; X86: # %bb.0: # %entry
5738 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5739 ; X86-NEXT: kmovw %eax, %k1
5740 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5741 ; X86-NEXT: vmovapd %xmm2, %xmm0
5744 ; X64-LABEL: test_mm_mask3_fmadd_sd:
5745 ; X64: # %bb.0: # %entry
5746 ; X64-NEXT: kmovw %edi, %k1
5747 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
5748 ; X64-NEXT: vmovapd %xmm2, %xmm0
5751 %0 = extractelement <2 x double> %__W, i64 0
5752 %1 = extractelement <2 x double> %__X, i64 0
5753 %2 = extractelement <2 x double> %__Y, i64 0
5754 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5755 %4 = and i8 %__U, 1
5756 %tobool.i = icmp eq i8 %4, 0
5757 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5758 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5759 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5760 ret <2 x double> %vecins.i
5763 define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5764 ; X86-LABEL: test_mm_mask3_fmadd_round_sd:
5765 ; X86: # %bb.0: # %entry
5766 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5767 ; X86-NEXT: kmovw %eax, %k1
5768 ; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5769 ; X86-NEXT: vmovapd %xmm2, %xmm0
5772 ; X64-LABEL: test_mm_mask3_fmadd_round_sd:
5773 ; X64: # %bb.0: # %entry
5774 ; X64-NEXT: kmovw %edi, %k1
5775 ; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5776 ; X64-NEXT: vmovapd %xmm2, %xmm0
5779 %0 = extractelement <2 x double> %__W, i64 0
5780 %1 = extractelement <2 x double> %__X, i64 0
5781 %2 = extractelement <2 x double> %__Y, i64 0
5782 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5783 %4 = bitcast i8 %__U to <8 x i1>
5784 %5 = extractelement <8 x i1> %4, i64 0
5785 %6 = select i1 %5, double %3, double %2
5786 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5787 ret <2 x double> %7
5790 define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5791 ; X86-LABEL: test_mm_mask_fmsub_sd:
5792 ; X86: # %bb.0: # %entry
5793 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5794 ; X86-NEXT: kmovw %eax, %k1
5795 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5798 ; X64-LABEL: test_mm_mask_fmsub_sd:
5799 ; X64: # %bb.0: # %entry
5800 ; X64-NEXT: kmovw %edi, %k1
5801 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
5804 %0 = extractelement <2 x double> %__W, i64 0
5805 %1 = extractelement <2 x double> %__A, i64 0
5806 %.rhs.i = extractelement <2 x double> %__B, i64 0
5807 %2 = fsub double -0.000000e+00, %.rhs.i
5808 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5809 %4 = and i8 %__U, 1
5810 %tobool.i = icmp eq i8 %4, 0
5811 %vecext1.i = extractelement <2 x double> %__W, i32 0
5812 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5813 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5814 ret <2 x double> %vecins.i
5817 define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5818 ; X86-LABEL: test_mm_mask_fmsub_round_sd:
5819 ; X86: # %bb.0: # %entry
5820 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5821 ; X86-NEXT: kmovw %eax, %k1
5822 ; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5825 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
5826 ; X64: # %bb.0: # %entry
5827 ; X64-NEXT: kmovw %edi, %k1
5828 ; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5831 %0 = extractelement <2 x double> %__W, i64 0
5832 %1 = extractelement <2 x double> %__A, i64 0
5833 %.rhs = extractelement <2 x double> %__B, i64 0
5834 %2 = fsub double -0.000000e+00, %.rhs
5835 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5836 %4 = bitcast i8 %__U to <8 x i1>
5837 %5 = extractelement <8 x i1> %4, i64 0
5838 %6 = select i1 %5, double %3, double %0
5839 %7 = insertelement <2 x double> %__W, double %6, i64 0
5840 ret <2 x double> %7
5843 define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5844 ; X86-LABEL: test_mm_maskz_fmsub_sd:
5845 ; X86: # %bb.0: # %entry
5846 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5847 ; X86-NEXT: kmovw %eax, %k1
5848 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5851 ; X64-LABEL: test_mm_maskz_fmsub_sd:
5852 ; X64: # %bb.0: # %entry
5853 ; X64-NEXT: kmovw %edi, %k1
5854 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
5857 %0 = extractelement <2 x double> %__A, i64 0
5858 %1 = extractelement <2 x double> %__B, i64 0
5859 %.rhs.i = extractelement <2 x double> %__C, i64 0
5860 %2 = fsub double -0.000000e+00, %.rhs.i
5861 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5862 %4 = and i8 %__U, 1
5863 %tobool.i = icmp eq i8 %4, 0
5864 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5865 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5866 ret <2 x double> %vecins.i
5869 define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5870 ; X86-LABEL: test_mm_maskz_fmsub_round_sd:
5871 ; X86: # %bb.0: # %entry
5872 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5873 ; X86-NEXT: kmovw %eax, %k1
5874 ; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5877 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
5878 ; X64: # %bb.0: # %entry
5879 ; X64-NEXT: kmovw %edi, %k1
5880 ; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5883 %0 = extractelement <2 x double> %__A, i64 0
5884 %1 = extractelement <2 x double> %__B, i64 0
5885 %.rhs = extractelement <2 x double> %__C, i64 0
5886 %2 = fsub double -0.000000e+00, %.rhs
5887 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5888 %4 = bitcast i8 %__U to <8 x i1>
5889 %5 = extractelement <8 x i1> %4, i64 0
5890 %6 = select i1 %5, double %3, double 0.000000e+00
5891 %7 = insertelement <2 x double> %__A, double %6, i64 0
5892 ret <2 x double> %7
5895 define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5896 ; X86-LABEL: test_mm_mask3_fmsub_sd:
5897 ; X86: # %bb.0: # %entry
5898 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5899 ; X86-NEXT: kmovw %eax, %k1
5900 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5901 ; X86-NEXT: vmovapd %xmm2, %xmm0
5904 ; X64-LABEL: test_mm_mask3_fmsub_sd:
5905 ; X64: # %bb.0: # %entry
5906 ; X64-NEXT: kmovw %edi, %k1
5907 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
5908 ; X64-NEXT: vmovapd %xmm2, %xmm0
5911 %0 = extractelement <2 x double> %__W, i64 0
5912 %1 = extractelement <2 x double> %__X, i64 0
5913 %.rhs.i = extractelement <2 x double> %__Y, i64 0
5914 %2 = fsub double -0.000000e+00, %.rhs.i
5915 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5916 %4 = and i8 %__U, 1
5917 %tobool.i = icmp eq i8 %4, 0
5918 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5919 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5920 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5921 ret <2 x double> %vecins.i
5924 define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5925 ; X86-LABEL: test_mm_mask3_fmsub_round_sd:
5926 ; X86: # %bb.0: # %entry
5927 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5928 ; X86-NEXT: kmovw %eax, %k1
5929 ; X86-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5930 ; X86-NEXT: vmovapd %xmm2, %xmm0
5933 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
5934 ; X64: # %bb.0: # %entry
5935 ; X64-NEXT: kmovw %edi, %k1
5936 ; X64-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5937 ; X64-NEXT: vmovapd %xmm2, %xmm0
5940 %0 = extractelement <2 x double> %__W, i64 0
5941 %1 = extractelement <2 x double> %__X, i64 0
5942 %.rhs = extractelement <2 x double> %__Y, i64 0
5943 %2 = fsub double -0.000000e+00, %.rhs
5944 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5945 %4 = bitcast i8 %__U to <8 x i1>
5946 %5 = extractelement <8 x i1> %4, i64 0
5947 %6 = select i1 %5, double %3, double %.rhs
5948 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5949 ret <2 x double> %7
5952 define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5953 ; X86-LABEL: test_mm_mask_fnmadd_sd:
5954 ; X86: # %bb.0: # %entry
5955 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5956 ; X86-NEXT: kmovw %eax, %k1
5957 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5960 ; X64-LABEL: test_mm_mask_fnmadd_sd:
5961 ; X64: # %bb.0: # %entry
5962 ; X64-NEXT: kmovw %edi, %k1
5963 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
5966 %0 = extractelement <2 x double> %__W, i64 0
5967 %.rhs.i = extractelement <2 x double> %__A, i64 0
5968 %1 = fsub double -0.000000e+00, %.rhs.i
5969 %2 = extractelement <2 x double> %__B, i64 0
5970 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5971 %4 = and i8 %__U, 1
5972 %tobool.i = icmp eq i8 %4, 0
5973 %vecext1.i = extractelement <2 x double> %__W, i32 0
5974 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5975 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5976 ret <2 x double> %vecins.i
5979 define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5980 ; X86-LABEL: test_mm_mask_fnmadd_round_sd:
5981 ; X86: # %bb.0: # %entry
5982 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
5983 ; X86-NEXT: kmovw %eax, %k1
5984 ; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5987 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
5988 ; X64: # %bb.0: # %entry
5989 ; X64-NEXT: kmovw %edi, %k1
5990 ; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5993 %0 = extractelement <2 x double> %__W, i64 0
5994 %.rhs = extractelement <2 x double> %__A, i64 0
5995 %1 = fsub double -0.000000e+00, %.rhs
5996 %2 = extractelement <2 x double> %__B, i64 0
5997 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5998 %4 = bitcast i8 %__U to <8 x i1>
5999 %5 = extractelement <8 x i1> %4, i64 0
6000 %6 = select i1 %5, double %3, double %0
6001 %7 = insertelement <2 x double> %__W, double %6, i64 0
6002 ret <2 x double> %7
6005 define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6006 ; X86-LABEL: test_mm_maskz_fnmadd_sd:
6007 ; X86: # %bb.0: # %entry
6008 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6009 ; X86-NEXT: kmovw %eax, %k1
6010 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
6013 ; X64-LABEL: test_mm_maskz_fnmadd_sd:
6014 ; X64: # %bb.0: # %entry
6015 ; X64-NEXT: kmovw %edi, %k1
6016 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2
6019 %0 = extractelement <2 x double> %__A, i64 0
6020 %.rhs.i = extractelement <2 x double> %__B, i64 0
6021 %1 = fsub double -0.000000e+00, %.rhs.i
6022 %2 = extractelement <2 x double> %__C, i64 0
6023 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6024 %4 = and i8 %__U, 1
6025 %tobool.i = icmp eq i8 %4, 0
6026 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
6027 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
6028 ret <2 x double> %vecins.i
6031 define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6032 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
6033 ; X86: # %bb.0: # %entry
6034 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6035 ; X86-NEXT: kmovw %eax, %k1
6036 ; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6039 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
6040 ; X64: # %bb.0: # %entry
6041 ; X64-NEXT: kmovw %edi, %k1
6042 ; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6045 %0 = extractelement <2 x double> %__A, i64 0
6046 %.rhs = extractelement <2 x double> %__B, i64 0
6047 %1 = fsub double -0.000000e+00, %.rhs
6048 %2 = extractelement <2 x double> %__C, i64 0
6049 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6050 %4 = bitcast i8 %__U to <8 x i1>
6051 %5 = extractelement <8 x i1> %4, i64 0
6052 %6 = select i1 %5, double %3, double 0.000000e+00
6053 %7 = insertelement <2 x double> %__A, double %6, i64 0
6054 ret <2 x double> %7
6057 define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6058 ; X86-LABEL: test_mm_mask3_fnmadd_sd:
6059 ; X86: # %bb.0: # %entry
6060 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6061 ; X86-NEXT: kmovw %eax, %k1
6062 ; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
6063 ; X86-NEXT: vmovapd %xmm2, %xmm0
6066 ; X64-LABEL: test_mm_mask3_fnmadd_sd:
6067 ; X64: # %bb.0: # %entry
6068 ; X64-NEXT: kmovw %edi, %k1
6069 ; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
6070 ; X64-NEXT: vmovapd %xmm2, %xmm0
6073 %0 = extractelement <2 x double> %__W, i64 0
6074 %.rhs.i = extractelement <2 x double> %__X, i64 0
6075 %1 = fsub double -0.000000e+00, %.rhs.i
6076 %2 = extractelement <2 x double> %__Y, i64 0
6077 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6078 %4 = and i8 %__U, 1
6079 %tobool.i = icmp eq i8 %4, 0
6080 %vecext1.i = extractelement <2 x double> %__Y, i32 0
6081 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
6082 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6083 ret <2 x double> %vecins.i
6086 define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6087 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
6088 ; X86: # %bb.0: # %entry
6089 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6090 ; X86-NEXT: kmovw %eax, %k1
6091 ; X86-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6092 ; X86-NEXT: vmovapd %xmm2, %xmm0
6095 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
6096 ; X64: # %bb.0: # %entry
6097 ; X64-NEXT: kmovw %edi, %k1
6098 ; X64-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6099 ; X64-NEXT: vmovapd %xmm2, %xmm0
6102 %0 = extractelement <2 x double> %__W, i64 0
6103 %.rhs = extractelement <2 x double> %__X, i64 0
6104 %1 = fsub double -0.000000e+00, %.rhs
6105 %2 = extractelement <2 x double> %__Y, i64 0
6106 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6107 %4 = bitcast i8 %__U to <8 x i1>
6108 %5 = extractelement <8 x i1> %4, i64 0
6109 %6 = select i1 %5, double %3, double %2
6110 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6111 ret <2 x double> %7
6114 define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6115 ; X86-LABEL: test_mm_mask_fnmsub_sd:
6116 ; X86: # %bb.0: # %entry
6117 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6118 ; X86-NEXT: kmovw %eax, %k1
6119 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
6122 ; X64-LABEL: test_mm_mask_fnmsub_sd:
6123 ; X64: # %bb.0: # %entry
6124 ; X64-NEXT: kmovw %edi, %k1
6125 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
6128 %0 = extractelement <2 x double> %__W, i64 0
6129 %.rhs.i = extractelement <2 x double> %__A, i64 0
6130 %1 = fsub double -0.000000e+00, %.rhs.i
6131 %.rhs7.i = extractelement <2 x double> %__B, i64 0
6132 %2 = fsub double -0.000000e+00, %.rhs7.i
6133 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6134 %4 = and i8 %__U, 1
6135 %tobool.i = icmp eq i8 %4, 0
6136 %vecext2.i = extractelement <2 x double> %__W, i32 0
6137 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6138 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
6139 ret <2 x double> %vecins.i
6142 define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6143 ; X86-LABEL: test_mm_mask_fnmsub_round_sd:
6144 ; X86: # %bb.0: # %entry
6145 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6146 ; X86-NEXT: kmovw %eax, %k1
6147 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6150 ; X64-LABEL: test_mm_mask_fnmsub_round_sd:
6151 ; X64: # %bb.0: # %entry
6152 ; X64-NEXT: kmovw %edi, %k1
6153 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6156 %0 = extractelement <2 x double> %__W, i64 0
6157 %.rhs = extractelement <2 x double> %__A, i64 0
6158 %1 = fsub double -0.000000e+00, %.rhs
6159 %.rhs2 = extractelement <2 x double> %__B, i64 0
6160 %2 = fsub double -0.000000e+00, %.rhs2
6161 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6162 %4 = bitcast i8 %__U to <8 x i1>
6163 %5 = extractelement <8 x i1> %4, i64 0
6164 %6 = select i1 %5, double %3, double %0
6165 %7 = insertelement <2 x double> %__W, double %6, i64 0
6166 ret <2 x double> %7
6169 define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6170 ; X86-LABEL: test_mm_maskz_fnmsub_sd:
6171 ; X86: # %bb.0: # %entry
6172 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6173 ; X86-NEXT: kmovw %eax, %k1
6174 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
6177 ; X64-LABEL: test_mm_maskz_fnmsub_sd:
6178 ; X64: # %bb.0: # %entry
6179 ; X64-NEXT: kmovw %edi, %k1
6180 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2
6183 %0 = extractelement <2 x double> %__A, i64 0
6184 %.rhs.i = extractelement <2 x double> %__B, i64 0
6185 %1 = fsub double -0.000000e+00, %.rhs.i
6186 %.rhs5.i = extractelement <2 x double> %__C, i64 0
6187 %2 = fsub double -0.000000e+00, %.rhs5.i
6188 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6189 %4 = and i8 %__U, 1
6190 %tobool.i = icmp eq i8 %4, 0
6191 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
6192 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
6193 ret <2 x double> %vecins.i
6196 define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6197 ; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
6198 ; X86: # %bb.0: # %entry
6199 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6200 ; X86-NEXT: kmovw %eax, %k1
6201 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6204 ; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
6205 ; X64: # %bb.0: # %entry
6206 ; X64-NEXT: kmovw %edi, %k1
6207 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6210 %0 = extractelement <2 x double> %__A, i64 0
6211 %.rhs = extractelement <2 x double> %__B, i64 0
6212 %1 = fsub double -0.000000e+00, %.rhs
6213 %.rhs2 = extractelement <2 x double> %__C, i64 0
6214 %2 = fsub double -0.000000e+00, %.rhs2
6215 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6216 %4 = bitcast i8 %__U to <8 x i1>
6217 %5 = extractelement <8 x i1> %4, i64 0
6218 %6 = select i1 %5, double %3, double 0.000000e+00
6219 %7 = insertelement <2 x double> %__A, double %6, i64 0
6220 ret <2 x double> %7
6223 define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6224 ; X86-LABEL: test_mm_mask3_fnmsub_sd:
6225 ; X86: # %bb.0: # %entry
6226 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6227 ; X86-NEXT: kmovw %eax, %k1
6228 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
6229 ; X86-NEXT: vmovapd %xmm2, %xmm0
6232 ; X64-LABEL: test_mm_mask3_fnmsub_sd:
6233 ; X64: # %bb.0: # %entry
6234 ; X64-NEXT: kmovw %edi, %k1
6235 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2
6236 ; X64-NEXT: vmovapd %xmm2, %xmm0
6239 %0 = extractelement <2 x double> %__W, i64 0
6240 %.rhs.i = extractelement <2 x double> %__X, i64 0
6241 %1 = fsub double -0.000000e+00, %.rhs.i
6242 %.rhs7.i = extractelement <2 x double> %__Y, i64 0
6243 %2 = fsub double -0.000000e+00, %.rhs7.i
6244 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6245 %4 = and i8 %__U, 1
6246 %tobool.i = icmp eq i8 %4, 0
6247 %vecext2.i = extractelement <2 x double> %__Y, i32 0
6248 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6249 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6250 ret <2 x double> %vecins.i
6253 define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6254 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
6255 ; X86: # %bb.0: # %entry
6256 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6257 ; X86-NEXT: kmovw %eax, %k1
6258 ; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6259 ; X86-NEXT: vmovapd %xmm2, %xmm0
6262 ; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
6263 ; X64: # %bb.0: # %entry
6264 ; X64-NEXT: kmovw %edi, %k1
6265 ; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6266 ; X64-NEXT: vmovapd %xmm2, %xmm0
6269 %0 = extractelement <2 x double> %__W, i64 0
6270 %.rhs = extractelement <2 x double> %__X, i64 0
6271 %1 = fsub double -0.000000e+00, %.rhs
6272 %.rhs1 = extractelement <2 x double> %__Y, i64 0
6273 %2 = fsub double -0.000000e+00, %.rhs1
6274 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6275 %4 = bitcast i8 %__U to <8 x i1>
6276 %5 = extractelement <8 x i1> %4, i64 0
6277 %6 = select i1 %5, double %3, double %.rhs1
6278 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6279 ret <2 x double> %7
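; NOTE: The expand-load tests bitcast the i8/i16 mask to <8 x i1>/<16 x i1>
; and rely on @llvm.masked.expandload.* lowering to vpexpandq, vexpandpd,
; vpexpandd and vexpandps, with merging ({%k1}) or zeroing ({%k1} {z})
; chosen by the passthrough operand.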
6282 define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, ptr readonly %__P) {
6283 ; X86-LABEL: test_mm512_mask_expandloadu_epi64:
6284 ; X86: # %bb.0: # %entry
6285 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6286 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6287 ; X86-NEXT: kmovw %ecx, %k1
6288 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
6291 ; X64-LABEL: test_mm512_mask_expandloadu_epi64:
6292 ; X64: # %bb.0: # %entry
6293 ; X64-NEXT: kmovw %edi, %k1
6294 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
6297 %0 = bitcast i8 %__U to <8 x i1>
6298 %1 = tail call <8 x i64> @llvm.masked.expandload.v8i64(ptr %__P, <8 x i1> %0, <8 x i64> %__W)
6299 ret <8 x i64> %1
6302 define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, ptr readonly %__P) {
6303 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
6304 ; X86: # %bb.0: # %entry
6305 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6306 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6307 ; X86-NEXT: kmovw %ecx, %k1
6308 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
6311 ; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
6312 ; X64: # %bb.0: # %entry
6313 ; X64-NEXT: kmovw %edi, %k1
6314 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
6317 %0 = bitcast i8 %__U to <8 x i1>
6318 %1 = tail call <8 x i64> @llvm.masked.expandload.v8i64(ptr %__P, <8 x i1> %0, <8 x i64> zeroinitializer)
6319 ret <8 x i64> %1
6322 define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, ptr readonly %__P) {
6323 ; X86-LABEL: test_mm512_mask_expandloadu_pd:
6324 ; X86: # %bb.0: # %entry
6325 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6326 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6327 ; X86-NEXT: kmovw %ecx, %k1
6328 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
6331 ; X64-LABEL: test_mm512_mask_expandloadu_pd:
6332 ; X64: # %bb.0: # %entry
6333 ; X64-NEXT: kmovw %edi, %k1
6334 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
6337 %0 = bitcast i8 %__U to <8 x i1>
6338 %1 = tail call <8 x double> @llvm.masked.expandload.v8f64(ptr %__P, <8 x i1> %0, <8 x double> %__W)
6339 ret <8 x double> %1
6342 define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, ptr readonly %__P) {
6343 ; X86-LABEL: test_mm512_maskz_expandloadu_pd:
6344 ; X86: # %bb.0: # %entry
6345 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6346 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
6347 ; X86-NEXT: kmovw %ecx, %k1
6348 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
6351 ; X64-LABEL: test_mm512_maskz_expandloadu_pd:
6352 ; X64: # %bb.0: # %entry
6353 ; X64-NEXT: kmovw %edi, %k1
6354 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
6357 %0 = bitcast i8 %__U to <8 x i1>
6358 %1 = tail call <8 x double> @llvm.masked.expandload.v8f64(ptr %__P, <8 x i1> %0, <8 x double> zeroinitializer)
6359 ret <8 x double> %1
6362 define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, ptr readonly %__P) {
6363 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
6364 ; X86: # %bb.0: # %entry
6365 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6366 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6367 ; X86-NEXT: kmovw %ecx, %k1
6368 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
6371 ; X64-LABEL: test_mm512_mask_expandloadu_epi32:
6372 ; X64: # %bb.0: # %entry
6373 ; X64-NEXT: kmovw %edi, %k1
6374 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
6377 %0 = bitcast <8 x i64> %__W to <16 x i32>
6378 %1 = bitcast i16 %__U to <16 x i1>
6379 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(ptr %__P, <16 x i1> %1, <16 x i32> %0) #11
6380 %3 = bitcast <16 x i32> %2 to <8 x i64>
6381 ret <8 x i64> %3
6384 define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, ptr readonly %__P) {
6385 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
6386 ; X86: # %bb.0: # %entry
6387 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6388 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6389 ; X86-NEXT: kmovw %ecx, %k1
6390 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
6393 ; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
6394 ; X64: # %bb.0: # %entry
6395 ; X64-NEXT: kmovw %edi, %k1
6396 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
6399 %0 = bitcast i16 %__U to <16 x i1>
6400 %1 = tail call <16 x i32> @llvm.masked.expandload.v16i32(ptr %__P, <16 x i1> %0, <16 x i32> zeroinitializer)
6401 %2 = bitcast <16 x i32> %1 to <8 x i64>
6402 ret <8 x i64> %2
6405 define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, ptr readonly %__P) {
6406 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
6407 ; X86: # %bb.0: # %entry
6408 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6409 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6410 ; X86-NEXT: kmovw %ecx, %k1
6411 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
6414 ; X64-LABEL: test_mm512_mask_expandloadu_ps:
6415 ; X64: # %bb.0: # %entry
6416 ; X64-NEXT: kmovw %edi, %k1
6417 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
6420 %0 = bitcast i16 %__U to <16 x i1>
6421 %1 = tail call <16 x float> @llvm.masked.expandload.v16f32(ptr %__P, <16 x i1> %0, <16 x float> %__W) #11
6422 ret <16 x float> %1
6425 define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, ptr readonly %__P) {
6426 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
6427 ; X86: # %bb.0: # %entry
6428 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6429 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6430 ; X86-NEXT: kmovw %ecx, %k1
6431 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
6434 ; X64-LABEL: test_mm512_maskz_expandloadu_ps:
6435 ; X64: # %bb.0: # %entry
6436 ; X64-NEXT: kmovw %edi, %k1
6437 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
6440 %0 = bitcast i16 %__U to <16 x i1>
6441 %1 = tail call <16 x float> @llvm.masked.expandload.v16f32(ptr %__P, <16 x i1> %0, <16 x float> zeroinitializer)
6442 ret <16 x float> %1
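; NOTE: The compress-store tests are the mirror image: @llvm.masked.compressstore.*
; lowers to the vcompresspd, vpcompressq, vcompressps and vpcompressd store
; forms, and each test ends in vzeroupper because the zmm source is dead after
; the store.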
6445 define void @test_mm512_mask_compressstoreu_pd(ptr %__P, i8 zeroext %__U, <8 x double> %__A) {
6446 ; X86-LABEL: test_mm512_mask_compressstoreu_pd:
6447 ; X86: # %bb.0: # %entry
6448 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6449 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6450 ; X86-NEXT: kmovw %eax, %k1
6451 ; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
6452 ; X86-NEXT: vzeroupper
6455 ; X64-LABEL: test_mm512_mask_compressstoreu_pd:
6456 ; X64: # %bb.0: # %entry
6457 ; X64-NEXT: kmovw %esi, %k1
6458 ; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
6459 ; X64-NEXT: vzeroupper
6462 %0 = bitcast i8 %__U to <8 x i1>
6463 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, ptr %__P, <8 x i1> %0)
6464 ret void
6467 define void @test_mm512_mask_compressstoreu_epi64(ptr %__P, i8 zeroext %__U, <8 x i64> %__A) {
6468 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
6469 ; X86: # %bb.0: # %entry
6470 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
6471 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6472 ; X86-NEXT: kmovw %eax, %k1
6473 ; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
6474 ; X86-NEXT: vzeroupper
6477 ; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
6478 ; X64: # %bb.0: # %entry
6479 ; X64-NEXT: kmovw %esi, %k1
6480 ; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
6481 ; X64-NEXT: vzeroupper
6484 %0 = bitcast i8 %__U to <8 x i1>
6485 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, ptr %__P, <8 x i1> %0)
6486 ret void
6489 define void @test_mm512_mask_compressstoreu_ps(ptr %__P, i16 zeroext %__U, <16 x float> %__A) {
6490 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
6491 ; X86: # %bb.0: # %entry
6492 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6493 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6494 ; X86-NEXT: kmovw %eax, %k1
6495 ; X86-NEXT: vcompressps %zmm0, (%ecx) {%k1}
6496 ; X86-NEXT: vzeroupper
6499 ; X64-LABEL: test_mm512_mask_compressstoreu_ps:
6500 ; X64: # %bb.0: # %entry
6501 ; X64-NEXT: kmovw %esi, %k1
6502 ; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
6503 ; X64-NEXT: vzeroupper
6506 %0 = bitcast i16 %__U to <16 x i1>
6507 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, ptr %__P, <16 x i1> %0)
6508 ret void
6511 define void @test_mm512_mask_compressstoreu_epi32(ptr %__P, i16 zeroext %__U, <8 x i64> %__A) {
6512 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
6513 ; X86: # %bb.0: # %entry
6514 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6515 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6516 ; X86-NEXT: kmovw %eax, %k1
6517 ; X86-NEXT: vpcompressd %zmm0, (%ecx) {%k1}
6518 ; X86-NEXT: vzeroupper
6521 ; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
6522 ; X64: # %bb.0: # %entry
6523 ; X64-NEXT: kmovw %esi, %k1
6524 ; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
6525 ; X64-NEXT: vzeroupper
6528 %0 = bitcast <8 x i64> %__A to <16 x i32>
6529 %1 = bitcast i16 %__U to <16 x i1>
6530 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, ptr %__P, <16 x i1> %1)
6531 ret void
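; NOTE: The reductions below halve the vector with shufflevector at each step
; (512-bit -> 256 -> 128 -> scalar) and combine the halves with add/mul/or/and.
; On i386 the i64 result is returned in edx:eax via vmovd/vpextrd; x86-64 uses
; a single vmovq.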
6534 define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
6535 ; X86-LABEL: test_mm512_reduce_add_epi64:
6536 ; X86: # %bb.0: # %entry
6537 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6538 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6539 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6540 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6541 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6542 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6543 ; X86-NEXT: vmovd %xmm0, %eax
6544 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6545 ; X86-NEXT: vzeroupper
6548 ; X64-LABEL: test_mm512_reduce_add_epi64:
6549 ; X64: # %bb.0: # %entry
6550 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6551 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6552 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6553 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6554 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6555 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6556 ; X64-NEXT: vmovq %xmm0, %rax
6557 ; X64-NEXT: vzeroupper
6560 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6561 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6562 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6563 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6564 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6565 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6566 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6567 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6568 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
6569 ret i64 %vecext.i
6572 define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
6573 ; X86-LABEL: test_mm512_reduce_mul_epi64:
6574 ; X86: # %bb.0: # %entry
6575 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6576 ; X86-NEXT: vpsrlq $32, %ymm0, %ymm2
6577 ; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6578 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm3
6579 ; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6580 ; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6581 ; X86-NEXT: vpsllq $32, %ymm2, %ymm2
6582 ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6583 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6584 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6585 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6586 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6587 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6588 ; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6589 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6590 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6591 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6592 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6593 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6594 ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6595 ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6596 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
6597 ; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6598 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6599 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6600 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6601 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6602 ; X86-NEXT: vmovd %xmm0, %eax
6603 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6604 ; X86-NEXT: vzeroupper
6607 ; X64-LABEL: test_mm512_reduce_mul_epi64:
6608 ; X64: # %bb.0: # %entry
6609 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6610 ; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
6611 ; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6612 ; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
6613 ; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6614 ; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6615 ; X64-NEXT: vpsllq $32, %ymm2, %ymm2
6616 ; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6617 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6618 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6619 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6620 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6621 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6622 ; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6623 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6624 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6625 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6626 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6627 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6628 ; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
6629 ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6630 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
6631 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6632 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6633 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6634 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6635 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6636 ; X64-NEXT: vmovq %xmm0, %rax
6637 ; X64-NEXT: vzeroupper
6640 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6641 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6642 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
6643 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6644 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6645 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
6646 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6647 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
6648 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
6649 ret i64 %vecext.i
6652 define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
6653 ; X86-LABEL: test_mm512_reduce_or_epi64:
6654 ; X86: # %bb.0: # %entry
6655 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6656 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6657 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6658 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6659 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6660 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6661 ; X86-NEXT: vmovd %xmm0, %eax
6662 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6663 ; X86-NEXT: vzeroupper
6666 ; X64-LABEL: test_mm512_reduce_or_epi64:
6667 ; X64: # %bb.0: # %entry
6668 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6669 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6670 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6671 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6672 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
6673 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6674 ; X64-NEXT: vmovq %xmm0, %rax
6675 ; X64-NEXT: vzeroupper
6678 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6679 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6680 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
6681 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6682 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6683 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
6684 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6685 %or7.i = or <2 x i64> %shuffle6.i, %or4.i
6686 %vecext.i = extractelement <2 x i64> %or7.i, i32 0
6687 ret i64 %vecext.i
define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = add <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%add7.i = add <2 x i64> %shuffle6.i, %add4.i
%vecext.i = extractelement <2 x i64> %add7.i, i32 0
ret i64 %vecext.i
}
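; NOTE: the masked reductions first replace the masked-off lanes with the
; identity of the operation (0 for add/or, 1 for mul, all-ones for and); that
; is the {%k1}-predicated vmovdqa64 checked above. A minimal sketch of the
; same computation via the generic reduction intrinsic (an aside, not part of
; the autogenerated checks):
;   %m = bitcast i8 %__M to <8 x i1>
;   %v = select <8 x i1> %m, <8 x i64> %__W, <8 x i64> zeroinitializer
;   %r = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %v)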
define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT: vpsrlq $32, %ymm0, %ymm3
; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT: vpsllq $32, %ymm2, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpsrlq $32, %ymm1, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT: vpsllq $32, %ymm2, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
%vecext.i = extractelement <2 x i64> %mul7.i, i32 0
ret i64 %vecext.i
}
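; NOTE: the identity splat prints differently per target: the X86 checks show
; the <8 x i64> vector of ones as 16 dwords ([1,0,1,0,...]), presumably
; because i64 is not a legal scalar type on the 32-bit target, while the X64
; checks show it as [1,1,1,1,1,1,1,1].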
define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or.i = or <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%or7.i = or <2 x i64> %shuffle6.i, %or4.i
%vecext.i = extractelement <2 x i64> %or7.i, i32 0
ret i64 %vecext.i
}
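; NOTE: an i64 result is returned in the %eax:%edx pair on X86 (hence the
; vmovd + vpextrd $1 in the checks above), while an i32 fits in %eax alone, so
; the unmasked epi32 reductions below lower identically on both targets and
; share the common CHECK prefix, ending in ret{{[l|q]}}.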
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_add_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%add.i = add <8 x i32> %0, %1
%2 = bitcast <8 x i32> %add.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%add5.i = add <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = add <4 x i32> %shuffle.i, %add5.i
%shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add8.i = add <4 x i32> %shuffle7.i, %add6.i
%vecext.i = extractelement <4 x i32> %add8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_mul_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%mul.i = mul <8 x i32> %0, %1
%2 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%mul5.i = mul <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
%shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
%vecext.i = extractelement <4 x i32> %mul8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_or_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or25.i = or <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or526.i = or <2 x i64> %extract3.i, %extract4.i
%or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or6.i = or <4 x i32> %shuffle.i, %or5.i
%shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or8.i = or <4 x i32> %shuffle7.i, %or6.i
%vecext.i = extractelement <4 x i32> %or8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_and_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and25.i = and <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and526.i = and <2 x i64> %extract3.i, %extract4.i
%and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and6.i = and <4 x i32> %shuffle.i, %and5.i
%shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and8.i = and <4 x i32> %shuffle7.i, %and6.i
%vecext.i = extractelement <4 x i32> %and8.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%add.i = add <8 x i32> %4, %5
%6 = bitcast <8 x i32> %add.i to <4 x i64>
%extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract4.i to <4 x i32>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract5.i to <4 x i32>
%add6.i = add <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = add <4 x i32> %shuffle.i, %add6.i
%shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add9.i = add <4 x i32> %shuffle8.i, %add7.i
%vecext.i = extractelement <4 x i32> %add9.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%mul.i = mul <8 x i32> %4, %5
%6 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract5.i to <4 x i32>
%extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract6.i to <4 x i32>
%mul7.i = mul <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
%shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
%vecext.i = extractelement <4 x i32> %mul10.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and28.i = and <4 x i64> %extract.i, %extract4.i
%extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and729.i = and <2 x i64> %extract5.i, %extract6.i
%and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and8.i = and <4 x i32> %shuffle.i, %and7.i
%shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and10.i = and <4 x i32> %shuffle9.i, %and8.i
%vecext.i = extractelement <4 x i32> %and10.i, i32 0
ret i32 %vecext.i
}
define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or27.i = or <4 x i64> %extract.i, %extract3.i
%extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or628.i = or <2 x i64> %extract4.i, %extract5.i
%or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or7.i = or <4 x i32> %shuffle.i, %or6.i
%shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or9.i = or <4 x i32> %shuffle8.i, %or7.i
%vecext.i = extractelement <4 x i32> %or9.i, i32 0
ret i32 %vecext.i
}
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}
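; NOTE: double results are returned on the x87 stack by the 32-bit calling
; convention, hence the X86 checks for the FP reductions spill through an
; 8-byte-aligned slot (vmovsd + fldl) inside a realigned frame; on X64 the
; result simply stays in %xmm0.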
define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}
define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%add.i = fadd <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add5.i = fadd <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = fadd <4 x float> %add5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add8.i = fadd <4 x float> %add6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %add8.i, i32 0
ret float %vecext.i
}
define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%mul.i = fmul <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul5.i = fmul <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %mul8.i, i32 0
ret float %vecext.i
}
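; NOTE: the ps reductions reuse the pd shuffle masks: the IR bitcasts
; <16 x float> to <8 x double> purely so the 256- and 128-bit halves can be
; split with the same <4 x i32> shuffle masks, then bitcasts back before each
; fadd/fmul.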
define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movzbl 8(%ebp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}
define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movzbl 8(%ebp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}
define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract3.i to <8 x float>
%add.i = fadd <8 x float> %3, %4
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add6.i = fadd <4 x float> %extract4.i, %extract5.i
%shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = fadd <4 x float> %add6.i, %shuffle.i
%shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add9.i = fadd <4 x float> %add7.i, %shuffle8.i
%vecext.i = extractelement <4 x float> %add9.i, i32 0
ret float %vecext.i
}
define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%mul.i = fmul <8 x float> %3, %4
%extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul7.i = fmul <4 x float> %extract5.i, %extract6.i
%shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
%shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
%vecext.i = extractelement <4 x float> %mul10.i, i32 0
ret float %vecext.i
}
define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp slt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp sgt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
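; NOTE: vpmaxsq/vpmaxuq/vpminsq/vpminuq are EVEX-only instructions, and their
; 128/256-bit forms require AVX512VL, which +avx512f alone does not provide;
; that is why the min/max checks keep zmm register operands even after the
; value has been narrowed to xmm width.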
define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ult <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ugt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ugt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
%extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
%shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %2, i32 0
ret double %vecext.i
}
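; NOTE: the FP min/max reductions go through the x86-specific intrinsics
; (@llvm.x86.avx.max.pd.256, @llvm.x86.sse2.max.pd, and their min
; counterparts) rather than a generic fmax/fmin, preserving vmaxpd/vminpd
; select semantics: when both inputs are zero or either input is NaN, the
; second source operand is returned.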
define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp sgt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp slt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp slt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}
define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %0 = icmp ugt <8 x i64> %shuffle.i, %__W
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %2 = icmp ult <8 x i64> %1, %shuffle1.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %vecext.i = extractelement <8 x i64> %5, i32 0
  ret i64 %vecext.i
}

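; NOTE: The FP reductions use vminpd/vmaxpd for the vector steps and a scalar
; vminsd/vmaxsd for the final pair. On i386 a double is returned on the x87
; stack, hence the aligned spill slot, the vmovsd store, and the fldl reload
; in the X86 checks.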
define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
  %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
  %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %2, i32 0
  ret double %vecext.i
}

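; NOTE: The masked reductions first blend an identity value into the
; masked-off lanes (the select on %__M below) so those lanes can never win
; the reduction. For a signed max the identity is INT64_MIN; on X86 the
; 64-bit splat is printed as pairs of 32-bit lanes, [0,2147483648,...].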
define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp sgt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp sgt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp sgt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

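; NOTE: For an unsigned max the identity is zero, so a zeroing masked move
; (vmovdqa64 {%k1} {z}) is enough and no broadcast constant is needed.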
define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ugt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ugt <8 x i64> %3, %shuffle2.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
  %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ugt <8 x i64> %5, %shuffle4.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

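; NOTE: For the FP masked reductions the identities are -Inf (max) and +Inf
; (min), materialized with vbroadcastsd/vbroadcastss ahead of the
; merge-masked vmovapd/vmovaps.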
define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movzbl 8(%ebp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

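; NOTE: The signed-min identity is INT64_MAX; the X86 splat prints it as
; alternating 32-bit lanes [4294967295,2147483647,...].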
define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp slt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp slt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp slt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

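; NOTE: The unsigned-min identity is all-ones, which vpternlogd $255
; materializes in-register without a constant-pool load.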
define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ult <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ult <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movzbl 8(%ebp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

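; NOTE: The i32 reductions work on 16 lanes, so they need one more shuffle
; level (the final vpshufd [1,0,3,2]). The scalar result lands in EAX on both
; triples, so the assertions collapse into the shared CHECK prefix.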
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp sgt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp sgt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp sgt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp sgt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ugt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ugt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ugt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ugt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

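; NOTE: The float reductions bitcast through <8 x double> purely so the
; 256-bit and 128-bit halves can be split off with shufflevector; the max/min
; intrinsics themselves still operate on the original float lanes.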
define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp slt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp slt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp slt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp slt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ult <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ult <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ult <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ult <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp sgt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp sgt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp sgt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp sgt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
  %6 = icmp ugt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %11 = icmp ugt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ugt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ugt <4 x i32> %14, %shuffle9.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp slt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp slt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp slt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp slt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp ult <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp ult <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ult <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ult <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

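; NOTE: The remaining tests cover the element-wise max/min intrinsics. The
; unmasked intrinsic plus an IR select folds into a single merge-masked
; ({%k1}) or zero-masked ({%k1} {z}) vmaxpd/vminpd/vmaxps/vminps.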
define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
  ret <16 x float> %2
}

define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
  ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

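; NOTE: The *_round_* variants pass rounding-mode argument 4
; (_MM_FROUND_CUR_DIRECTION), so no embedded rounding is encoded and the
; checks match the plain instruction forms above.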
9000 define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
9001 ; CHECK-LABEL: test_mm512_max_round_pd:
9002 ; CHECK: # %bb.0: # %entry
9003 ; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
9004 ; CHECK-NEXT: ret{{[l|q]}}
9006 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_max_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
ret <16 x float> %0
}

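; Note the calling-convention difference in the masked tests: on X86 the mask
; argument is reloaded from the stack (movzbl/movzwl plus kmovw), while on X64
; it arrives in %edi and needs only a kmovw.
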
define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_min_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
ret <8 x double> %0
}

define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_min_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
ret <16 x float> %0
}

define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
; CHECK-LABEL: test_mm512_sqrt_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

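; The *_sqrt_round_* tests below call the target intrinsic with a rounding
; argument of 8 (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), which selects
; the embedded {rn-sae} rounding mode; the plain sqrt tests above use the
; generic llvm.sqrt intrinsic and lower to vsqrtpd/vsqrtps without it.
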
define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)

define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
ret <8 x double> %0
}

define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
; CHECK-LABEL: test_mm512_sqrt_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)

define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
ret <16 x float> %0
}

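; The rotate tests below express rotates as funnel shifts with both value
; operands equal: llvm.fshl with a repeated operand is a rotate left, matched
; to vprold/vprolq for immediate counts and vprolvd/vprolvq for variable ones.
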
define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_rol_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprold $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <16 x i32> %1 to <8 x i64>
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i64> %__W to <16 x i32>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}

define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_rol_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <8 x i64> %__W to <16 x i32>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

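; Likewise, llvm.fshr with both value operands equal is a rotate right,
; matched to vprord/vprorq and vprorvd/vprorvq below.
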
define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <16 x i32> %1 to <8 x i64>
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i64> %__W to <16 x i32>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}

define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <8 x i64> %__W to <16 x i32>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}

declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(ptr, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(ptr, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, ptr, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, ptr, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, ptr, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, ptr, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)

declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)