; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c

define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vmovdqa64 136(%ebp), %zmm3
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: kunpckbw %k0, %k1, %k1
; X86-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: kunpckbw %k0, %k1, %k1
; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $-1, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestc:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $-1, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-64, %esp
; X86-NEXT: subl $64, %esp
; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT: korw %k0, %k1, %k0
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: cmpw $0, %ax
; X86-NEXT: sete %al
; X86-NEXT: andb $1, %al
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_kortestz:
; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT: korw %k0, %k1, %k0
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: cmpw $0, %ax
; X64-NEXT: sete %al
; X64-NEXT: andb $1, %al
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}

define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}

define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzwl %ax, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_moveldup_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permute_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permute_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_permute_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_permute_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_permutex_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_permutex_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_permutex_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; CHECK-LABEL: test_mm512_shuffle_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_mask_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_shuffle_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_shuffle_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_shuffle_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_pd:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpackhi_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpackhi_ps:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT: retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
  %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
  %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

1484 define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
1485 ; CHECK-LABEL: test_mm512_unpacklo_epi64:
1487 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1488 ; CHECK-NEXT: ret{{[l|q]}}
1489 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1493 define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1494 ; X86-LABEL: test_mm512_mask_unpacklo_epi64:
1496 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1497 ; X86-NEXT: kmovw %eax, %k1
1498 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1501 ; X64-LABEL: test_mm512_mask_unpacklo_epi64:
1503 ; X64-NEXT: kmovw %edi, %k1
1504 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1506 %arg1 = bitcast i8 %a1 to <8 x i1>
1507 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1508 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
1512 define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1513 ; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
1515 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1516 ; X86-NEXT: kmovw %eax, %k1
1517 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1520 ; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
1522 ; X64-NEXT: kmovw %edi, %k1
1523 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1525 %arg0 = bitcast i8 %a0 to <8 x i1>
1526 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1527 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_mm512_unpacklo_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X86-LABEL: test_mm512_mask_unpacklo_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_unpacklo_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_maskz_unpacklo_ps:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd128_pd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextpd256_pd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps128_ps512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextps256_ps512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %res
}

define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi128_si512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm512_zextsi256_si512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}

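; The zext tests above rely on a target guarantee: VEX/EVEX writes to an
; xmm/ymm register zero the remaining bits of the full zmm register, so the
; whole shufflevector-with-zeroinitializer lowers to one register-sized move
; (vmovaps %xmm0, %xmm0 / vmovaps %ymm0, %ymm0). Hedged C-level sketch:
;   __m512d r = _mm512_zextpd256_pd512(v256);  // upper 256 bits become zero
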
define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
  ret <8 x i64> %tmp4
}

define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %4 = mul nsw <8 x i64> %3, %1
  %5 = bitcast i8 %__k to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
; CHECK-LABEL: test_mm512_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
  ret <8 x i64> %tmp2
}

define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
; X86-LABEL: test_mm512_maskz_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
; X86-LABEL: test_mm512_mask_mul_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X86-NEXT:    vmovdqa64 %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_mul_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %2 = mul nuw <8 x i64> %1, %0
  %3 = bitcast i8 %__k to <8 x i1>
  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
  ret <8 x i64> %4
}

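; Note on the IR idioms above (matching what clang emits for these builtins):
; _mm512_mul_epi32 sign-extends the low 32 bits of each qword via shl/ashr by
; 32, while _mm512_mul_epu32 zero-extends them via AND with 4294967295
; (0xffffffff); both patterns select to a single vpmuldq / vpmuludq.
; Hedged C-level sketch:
;   __m512i s = _mm512_mul_epi32(a, b);   // (shl+ashr 32) x2, mul nsw
;   __m512i u = _mm512_mul_epu32(a, b);   // (and 0xffffffff) x2, mul nuw
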
define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
; X86-LABEL: test_mm512_set1_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
  %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
  ret <8 x double> %0
}

define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT:    vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

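; On i386 there is no 64-bit GPR form of vcvtusi2sd/vcvtusi2ss, so the
; uitofp i64 cases are lowered to a bit-manipulation sequence (build the u64
; in an xmm register, combine with magic exponent constants, subtract and
; horizontally add) or, for the ss case below, an x87 fildll plus a sign
; fixup. x86-64 keeps the single-instruction form. What both compute is just:
;   double d = (double)u64;   // for an unsigned 64-bit integer u64
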
define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; X86-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    xorl %ecx, %ecx
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    setns %cl
; X86-NEXT:    fildll {{[0-9]+}}(%esp)
; X86-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtps_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = fpext <8 x float> %__A to <8 x double>
  ret <8 x double> %conv.i
}

define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtpslo_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  ret <8 x double> %conv.i.i
}

define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_cvtpslo_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtpslo_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm512_maskz_cvtps_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtps_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vcvtps2pd %ymm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = fpext <8 x float> %__A to <8 x double>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
  ret <8 x double> %1
}

define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi32_epi8:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %conv.i = trunc <16 x i32> %0 to <16 x i8>
  %1 = bitcast <16 x i8> %conv.i to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdb %zmm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <2 x i64> %__O to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
  %3 = bitcast <16 x i8> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovdb %zmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
  %2 = bitcast <16 x i8> %1 to <2 x i64>
  ret <2 x i64> %2
}

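; The unmasked epi32->epi8 case above is a plain `trunc <16 x i32> to <16 x i8>`,
; but the masked/maskz forms keep the @llvm.x86.avx512.mask.pmov.db.512
; intrinsic: with only +avx512f a byte-granularity vector select has no native
; blend, so the masking stays fused inside vpmovdb. Hedged C-level sketch
; (names from <immintrin.h>):
;   __m128i r = _mm512_mask_cvtepi32_epi8(src, k, a);  // -> masked vpmovdb
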
define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <8 x i32> %conv.i to <4 x i64>
  ret <4 x i64> %0
}

define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %zmm1, %ymm0 {%k1}
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqd %zmm0, %ymm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_cvtepi64_epi16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %conv.i = trunc <8 x i64> %__A to <8 x i16>
  %0 = bitcast <8 x i16> %conv.i to <2 x i64>
  ret <2 x i64> %0
}

define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqw %zmm1, %xmm0 {%k1}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <2 x i64> %__O to <8 x i16>
  %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
  %2 = bitcast <8 x i16> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)

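; These declarations back the masked narrowing tests above. Note that the
; qword->dword case uses plain trunc + select instead, since dword-granular
; selects are native to avx512f. Hedged C-level sketch:
;   __m128i t = _mm512_maskz_cvtepi64_epi16(k, a);     // -> masked vpmovqw
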
define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1

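; vpternlog's immediate is a 3-input truth table: with a/b/c the bits drawn
; from the three operands (first operand = a), the result bit is imm8 bit
; (a<<2 | b<<1 | c). Imm 4 = 0b00000100 sets only index 2 (a=0, b=1, c=0),
; i.e. it computes ~a & b & ~c per bit. Hedged C-level sketch:
;   __m512i r = _mm512_ternarylogic_epi32(a, b, c, 4);
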
define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast <8 x i64> %__I to <8 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
  ret <8 x double> %3
}

declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovaps %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast <8 x i64> %__I to <16 x float>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
  ret <16 x float> %4
}

declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
  ret <8 x i64> %2
}

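; The mask2 variants above return the index vector on masked-off lanes, so
; codegen must keep the indices alive: it uses vpermi2{d,pd,ps,q} (which
; overwrites the index register) and copies the result back to zmm0. The
; plain/mask/maskz variants below select the first table instead, so the
; vpermt2* forms (which overwrite that table operand) are used. Hedged
; C-level sketch:
;   __m512i r = _mm512_mask2_permutex2var_epi64(a, idx, k, b); // idx kept
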
define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  ret <16 x float> %1
}

define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
  ret <16 x float> %3
}

define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__I to <16 x i32>
  %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
  ret <16 x float> %3
}

define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_permutex2var_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_add_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_add_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_add_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_add_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_add_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_add_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_add_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_add_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

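; The scalar masked-op tests here all share one shape: compute the op on
; element 0, test bit 0 of the mask (`and i8 %__U, 1` + icmp eq 0), select
; the result, the passthrough element, or 0.0, and reinsert into lane 0;
; codegen folds the whole chain into a single masked vaddss/vaddsd (and
; likewise for the sub/mul/div variants below). Hedged C-level sketch:
;   __m128 r = _mm_maskz_add_ss(k, a, b);  // lane 0 = (k & 1) ? a0+b0 : 0.0f
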
define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_sub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_sub_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sub_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_sub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_sub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_sub_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_sub_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_mul_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <4 x float> %__W, i32 0
  %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_mul_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <4 x float> %__B, i32 0
  %vecext1.i.i = extractelement <4 x float> %__A, i32 0
  %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_mul_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_mul_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %vecext1.i = extractelement <2 x double> %__W, i32 0
  %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_mul_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_mul_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecext.i.i = extractelement <2 x double> %__B, i32 0
  %vecext1.i.i = extractelement <2 x double> %__A, i32 0
  %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
  %0 = and i8 %__U, 1
  %tobool.i = icmp eq i8 %0, 0
  %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = extractelement <4 x float> %__W, i64 0
  %3 = fdiv float %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, float %3, float %2
  %7 = insertelement <4 x float> %__A, float %6, i64 0
  ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_maskz_div_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <4 x float> %__A, i64 0
  %1 = extractelement <4 x float> %__B, i64 0
  %2 = fdiv float %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, float %2, float 0.000000e+00
  %6 = insertelement <4 x float> %__A, float %5, i64 0
  ret <4 x float> %6
}

define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = extractelement <2 x double> %__W, i64 0
  %3 = fdiv double %0, %1
  %4 = bitcast i8 %__U to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, double %3, double %2
  %7 = insertelement <2 x double> %__A, double %6, i64 0
  ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_maskz_div_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskz_div_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = extractelement <2 x double> %__A, i64 0
  %1 = extractelement <2 x double> %__B, i64 0
  %2 = fdiv double %0, %1
  %3 = bitcast i8 %__U to <8 x i1>
  %4 = extractelement <8 x i1> %3, i64 0
  %5 = select i1 %4, double %2, double 0.000000e+00
  %6 = insertelement <2 x double> %__A, double %5, i64 0
  ret <2 x double> %6
}
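; The div tests have the same shape but route the mask through a <8 x i1>
; bitcast rather than an 'and'. A hedged C sketch (assuming the matching
; builtin; names illustrative):
;   __m128 r = _mm_maskz_div_ss(U, A, B);   // lane 0: (U & 1) ? A[0]/B[0] : 0.0f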
define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
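; The trailing 'i32 8' is the rounding immediate: 8 ==
; _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (0x00 | 0x08), which llc
; prints as the {rn-sae} suffix above. A hedged C sketch (illustrative names):
;   __m512d r = _mm512_fmadd_round_pd(A, B, C,
;                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);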
define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}
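; Merge- vs. zero-masking lives entirely in the select: %__A (mask), %__C
; (mask3) or zeroinitializer (maskz) as the false operand; the backend then
; picks the 132/231/213 instruction form whose destination register already
; holds that passthrough value.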
define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}
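; There is no separate fmsub intrinsic at the IR level here: fmsub is built
; by negating C with 'fsub <-0.0, ...>, %__C' and feeding the plain fmadd
; intrinsic; in the unmasked case llc keeps the negation as a vpxorq of the
; sign bits, while the masked forms re-fuse it into vfmsub132/213pd.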
define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  ret <8 x double> %0
}
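; fnmsub negates both A and C the same way; with two negations live, llc
; materializes the -0.0 sign mask once (vpbroadcastq) and applies vpxorq
; twice rather than folding a broadcast memory operand into each xor.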
define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmadd_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}
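; Without a rounding argument the tests lower to the generic llvm.fma
; intrinsic instead of the x86-specific one. Hedged C equivalent (assuming
; the non-rounding builtin):
;   __m512d r = _mm512_fmadd_pd(A, B, C);   // default MXCSR rounding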
define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  ret <8 x double> %0
}

define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
  %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
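; The ps variants repeat the pd pattern with sixteen lanes: the mask widens
; to i16 and is bitcast to <16 x i1>, and the X86 side loads it with movzwl
; rather than movb.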
define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_round_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  ret <16 x float> %0
}
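; For floats the sign-flip constant is a dword splat, so the unmasked fnmsub
; case uses vpbroadcastd/vpxord where the pd version used
; vpbroadcastq/vpxorq.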
define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmadd_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X86-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X86-NEXT:    vmovaps %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
; X64-NEXT:    vmovaps %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
  ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmadd_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmadd_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}

define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fnmsub_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  ret <16 x float> %0
}

define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fnmsub_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fnmsub_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
; X64-NEXT:    retq
entry:
  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
  %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
  %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
  ret <16 x float> %2
}
define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  ret <8 x double> %0
}

declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
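; vfmaddsub alternates per lane: even elements compute a*b-c and odd elements
; a*b+c, which the flattened assertions print as '+/-'. Hedged C sketch
; (illustrative names):
;   __m512d r = _mm512_fmaddsub_round_pd(A, B, C,
;                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);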
define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
; X86-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
; X64-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  ret <8 x double> %0
}

define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
  ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
  ret <8 x double> %2
}

define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
  %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
  %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
  %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x double> %3
}
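; With no rounding argument, fmaddsub has no generic intrinsic, so the IR
; computes both fma(A,B,C) and fma(A,B,-C) and interleaves them: the
; shufflevector takes even lanes (0,2,4,6) from the subtracting fma and odd
; lanes (9,11,13,15) from the adding one; llc re-fuses this into vfmaddsub.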
4064 define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4065 ; X86-LABEL: test_mm512_mask_fmaddsub_pd:
4066 ; X86: # %bb.0: # %entry
4067 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4068 ; X86-NEXT: kmovw %eax, %k1
4069 ; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4072 ; X64-LABEL: test_mm512_mask_fmaddsub_pd:
4073 ; X64: # %bb.0: # %entry
4074 ; X64-NEXT: kmovw %edi, %k1
4075 ; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4078 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4079 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4080 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4081 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4082 %4 = bitcast i8 %__U to <8 x i1>
4083 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
4087 define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4088 ; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
4089 ; X86: # %bb.0: # %entry
4090 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4091 ; X86-NEXT: kmovw %eax, %k1
4092 ; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4093 ; X86-NEXT: vmovapd %zmm2, %zmm0
4096 ; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
4097 ; X64: # %bb.0: # %entry
4098 ; X64-NEXT: kmovw %edi, %k1
4099 ; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4100 ; X64-NEXT: vmovapd %zmm2, %zmm0
4103 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4104 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4105 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4106 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4107 %4 = bitcast i8 %__U to <8 x i1>
4108 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
%3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%4 = bitcast i8 %__U to <8 x i1>
%5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
ret <8 x double> %5
}

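; fmsubadd mirrors fmaddsub: the shufflevector takes even lanes from the
; plain FMA, (a*b)+c, and odd lanes from the FMA with %__C negated, (a*b)-c.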
define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
ret <8 x double> %4
}

define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
ret <8 x double> %4
}

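; The *_round_* tests cannot be expressed with llvm.fma because the rounding
; mode must be carried through, so they call the target-specific
; @llvm.x86.avx512.vfmaddsub.ps.512 intrinsic declared below. The i32 8
; argument selects round-to-nearest-even with exceptions suppressed, printed
; as {rn-sae} in the assembly.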
define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
ret <16 x float> %0
}

declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1

define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

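; There is no separate rounding intrinsic for fmsubadd; instead %__C is
; negated by flipping its sign bits with a broadcast XOR (vpxord ...{1to16})
; and the result is fed to vfmaddsub. The constant is a constant-pool load on
; X86 and RIP-relative on X64, hence the split check prefixes.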
define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
ret <16 x float> %0
}

define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmaddsub_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x float> %3
}

define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
ret <16 x float> %5
}

define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) +/- zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
ret <16 x float> %5
}

define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) +/- zmm2
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
%3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
ret <16 x float> %5
}

define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; CHECK-LABEL: test_mm512_fmsubadd_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
; CHECK-NEXT: ret{{[l|q]}}
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
ret <16 x float> %4
}

define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 {%k1} {z} = (zmm1 * zmm0) -/+ zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
ret <16 x float> %4
}

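; The mask3 variants below merge into the addend operand: masked-off lanes
; keep %__C, so the backend selects the 231 instruction form (the result
; lands in zmm2) and a vmovapd/vmovaps copies it to the return register.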
define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
%1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
%2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%3 = bitcast i8 %__U to <8 x i1>
%4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
ret <8 x double> %4
}

define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 {%k1} = (zmm0 * zmm1) -/+ zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
%1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
%2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
ret <16 x float> %4
}

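; fnmadd negates the product: clang negates one multiplicand (%__A here) via
; an fsub from -0.0, and the backend folds the negation into vfnmadd,
; computing -(a*b)+c.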
define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmadd_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmadd_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) + zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

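; fnmsub negates both the product and the addend: %__B and %__C are each
; negated in the IR, giving -(a*b)-c after the backend folds the negations.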
define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT: vmovapd %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
%sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
%0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
; X86-LABEL: test_mm512_mask_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
; X86-LABEL: test_mm512_mask3_fnmsub_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X86-NEXT: vmovaps %zmm2, %zmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask3_fnmsub_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 {%k1} = -(zmm0 * zmm1) - zmm2
; X64-NEXT: vmovaps %zmm2, %zmm0
; X64-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
%sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
%0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
ret <16 x float> %2
}

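; The remaining tests cover the scalar _mm_*_ss forms, which operate on
; element 0 only. The no-rounding variants test bit 0 of %__U with scalar
; and/icmp/select IR; the rounding variants call @llvm.x86.avx512.vfmadd.f32
; and extract bit 0 of the bitcast mask vector instead.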
define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %0
%7 = insertelement <4 x float> %__W, float %6, i64 0
ret <4 x float> %7
}

declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1

define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float 0.000000e+00
%7 = insertelement <4 x float> %__A, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__Y, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %2
%7 = insertelement <4 x float> %__Y, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%.rhs.i = extractelement <4 x float> %__B, i64 0
%2 = fsub float -0.000000e+00, %.rhs.i
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsub_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__A, i64 0
%.rhs = extractelement <4 x float> %__B, i64 0
%2 = fsub float -0.000000e+00, %.rhs
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %0
%7 = insertelement <4 x float> %__W, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%.rhs.i = extractelement <4 x float> %__C, i64 0
%2 = fsub float -0.000000e+00, %.rhs.i
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsub_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%1 = extractelement <4 x float> %__B, i64 0
%.rhs = extractelement <4 x float> %__C, i64 0
%2 = fsub float -0.000000e+00, %.rhs
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float 0.000000e+00
%7 = insertelement <4 x float> %__A, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%.rhs.i = extractelement <4 x float> %__Y, i64 0
%2 = fsub float -0.000000e+00, %.rhs.i
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__Y, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%1 = extractelement <4 x float> %__X, i64 0
%.rhs = extractelement <4 x float> %__Y, i64 0
%2 = fsub float -0.000000e+00, %.rhs
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %.rhs
%7 = insertelement <4 x float> %__Y, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs.i = extractelement <4 x float> %__A, i64 0
%1 = fsub float -0.000000e+00, %.rhs.i
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs = extractelement <4 x float> %__A, i64 0
%1 = fsub float -0.000000e+00, %.rhs
%2 = extractelement <4 x float> %__B, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %0
%7 = insertelement <4 x float> %__W, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%.rhs.i = extractelement <4 x float> %__B, i64 0
%1 = fsub float -0.000000e+00, %.rhs.i
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%.rhs = extractelement <4 x float> %__B, i64 0
%1 = fsub float -0.000000e+00, %.rhs
%2 = extractelement <4 x float> %__C, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float 0.000000e+00
%7 = insertelement <4 x float> %__A, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmadd_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs.i = extractelement <4 x float> %__X, i64 0
%1 = fsub float -0.000000e+00, %.rhs.i
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <4 x float> %__Y, i32 0
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3
%vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs = extractelement <4 x float> %__X, i64 0
%1 = fsub float -0.000000e+00, %.rhs
%2 = extractelement <4 x float> %__Y, i64 0
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %2
%7 = insertelement <4 x float> %__Y, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmsub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs.i = extractelement <4 x float> %__A, i64 0
%1 = fsub float -0.000000e+00, %.rhs.i
%.rhs7.i = extractelement <4 x float> %__B, i64 0
%2 = fsub float -0.000000e+00, %.rhs7.i
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext2.i = extractelement <4 x float> %__W, i32 0
%cond.i = select i1 %tobool.i, float %vecext2.i, float %3
%vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmsub_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs = extractelement <4 x float> %__A, i64 0
%1 = fsub float -0.000000e+00, %.rhs
%.rhs2 = extractelement <4 x float> %__B, i64 0
%2 = fsub float -0.000000e+00, %.rhs2
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %0
%7 = insertelement <4 x float> %__W, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmsub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%.rhs.i = extractelement <4 x float> %__B, i64 0
%1 = fsub float -0.000000e+00, %.rhs.i
%.rhs5.i = extractelement <4 x float> %__C, i64 0
%2 = fsub float -0.000000e+00, %.rhs5.i
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
%vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__A, i64 0
%.rhs = extractelement <4 x float> %__B, i64 0
%1 = fsub float -0.000000e+00, %.rhs
%.rhs2 = extractelement <4 x float> %__C, i64 0
%2 = fsub float -0.000000e+00, %.rhs2
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float 0.000000e+00
%7 = insertelement <4 x float> %__A, float %6, i64 0
ret <4 x float> %7
}

define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmsub_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs.i = extractelement <4 x float> %__X, i64 0
%1 = fsub float -0.000000e+00, %.rhs.i
%.rhs7.i = extractelement <4 x float> %__Y, i64 0
%2 = fsub float -0.000000e+00, %.rhs7.i
%3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext2.i = extractelement <4 x float> %__Y, i32 0
%cond.i = select i1 %tobool.i, float %vecext2.i, float %3
%vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <4 x float> %__W, i64 0
%.rhs = extractelement <4 x float> %__X, i64 0
%1 = fsub float -0.000000e+00, %.rhs
%.rhs1 = extractelement <4 x float> %__Y, i64 0
%2 = fsub float -0.000000e+00, %.rhs1
%3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, float %3, float %.rhs1
%7 = insertelement <4 x float> %__Y, float %6, i64 0
ret <4 x float> %7
}

define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmadd_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__A, i64 0
%2 = extractelement <2 x double> %__B, i64 0
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <2 x double> %__W, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %3
%vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmadd_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmadd_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__A, i64 0
%2 = extractelement <2 x double> %__B, i64 0
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %0
%7 = insertelement <2 x double> %__W, double %6, i64 0
ret <2 x double> %7
}

declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1

define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%1 = extractelement <2 x double> %__B, i64 0
%2 = extractelement <2 x double> %__C, i64 0
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmadd_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmadd_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%1 = extractelement <2 x double> %__B, i64 0
%2 = extractelement <2 x double> %__C, i64 0
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double 0.000000e+00
%7 = insertelement <2 x double> %__A, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__X, i64 0
%2 = extractelement <2 x double> %__Y, i64 0
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <2 x double> %__Y, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %3
%vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmadd_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmadd_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__X, i64 0
%2 = extractelement <2 x double> %__Y, i64 0
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %2
%7 = insertelement <2 x double> %__Y, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__A, i64 0
%.rhs.i = extractelement <2 x double> %__B, i64 0
%2 = fsub double -0.000000e+00, %.rhs.i
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <2 x double> %__W, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %3
%vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fmsub_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fmsub_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__A, i64 0
%.rhs = extractelement <2 x double> %__B, i64 0
%2 = fsub double -0.000000e+00, %.rhs
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %0
%7 = insertelement <2 x double> %__W, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%1 = extractelement <2 x double> %__B, i64 0
%.rhs.i = extractelement <2 x double> %__C, i64 0
%2 = fsub double -0.000000e+00, %.rhs.i
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fmsub_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fmsub_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%1 = extractelement <2 x double> %__B, i64 0
%.rhs = extractelement <2 x double> %__C, i64 0
%2 = fsub double -0.000000e+00, %.rhs
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double 0.000000e+00
%7 = insertelement <2 x double> %__A, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__X, i64 0
%.rhs.i = extractelement <2 x double> %__Y, i64 0
%2 = fsub double -0.000000e+00, %.rhs.i
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <2 x double> %__Y, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %3
%vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fmsub_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fmsub_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%1 = extractelement <2 x double> %__X, i64 0
%.rhs = extractelement <2 x double> %__Y, i64 0
%2 = fsub double -0.000000e+00, %.rhs
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %.rhs
%7 = insertelement <2 x double> %__Y, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs.i = extractelement <2 x double> %__A, i64 0
%1 = fsub double -0.000000e+00, %.rhs.i
%2 = extractelement <2 x double> %__B, i64 0
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <2 x double> %__W, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %3
%vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmadd_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmadd_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs = extractelement <2 x double> %__A, i64 0
%1 = fsub double -0.000000e+00, %.rhs
%2 = extractelement <2 x double> %__B, i64 0
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %0
%7 = insertelement <2 x double> %__W, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmadd_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%.rhs.i = extractelement <2 x double> %__B, i64 0
%1 = fsub double -0.000000e+00, %.rhs.i
%2 = extractelement <2 x double> %__C, i64 0
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%.rhs = extractelement <2 x double> %__B, i64 0
%1 = fsub double -0.000000e+00, %.rhs
%2 = extractelement <2 x double> %__C, i64 0
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double 0.000000e+00
%7 = insertelement <2 x double> %__A, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmadd_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs.i = extractelement <2 x double> %__X, i64 0
%1 = fsub double -0.000000e+00, %.rhs.i
%2 = extractelement <2 x double> %__Y, i64 0
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext1.i = extractelement <2 x double> %__Y, i32 0
%cond.i = select i1 %tobool.i, double %vecext1.i, double %3
%vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs = extractelement <2 x double> %__X, i64 0
%1 = fsub double -0.000000e+00, %.rhs
%2 = extractelement <2 x double> %__Y, i64 0
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %2
%7 = insertelement <2 x double> %__Y, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmsub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs.i = extractelement <2 x double> %__A, i64 0
%1 = fsub double -0.000000e+00, %.rhs.i
%.rhs7.i = extractelement <2 x double> %__B, i64 0
%2 = fsub double -0.000000e+00, %.rhs7.i
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext2.i = extractelement <2 x double> %__W, i32 0
%cond.i = select i1 %tobool.i, double %vecext2.i, double %3
%vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; X86-LABEL: test_mm_mask_fnmsub_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_fnmsub_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs = extractelement <2 x double> %__A, i64 0
%1 = fsub double -0.000000e+00, %.rhs
%.rhs2 = extractelement <2 x double> %__B, i64 0
%2 = fsub double -0.000000e+00, %.rhs2
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %0
%7 = insertelement <2 x double> %__W, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmsub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%.rhs.i = extractelement <2 x double> %__B, i64 0
%1 = fsub double -0.000000e+00, %.rhs.i
%.rhs5.i = extractelement <2 x double> %__C, i64 0
%2 = fsub double -0.000000e+00, %.rhs5.i
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
%vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__A, i64 0
%.rhs = extractelement <2 x double> %__B, i64 0
%1 = fsub double -0.000000e+00, %.rhs
%.rhs2 = extractelement <2 x double> %__C, i64 0
%2 = fsub double -0.000000e+00, %.rhs2
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double 0.000000e+00
%7 = insertelement <2 x double> %__A, double %6, i64 0
ret <2 x double> %7
}

define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmsub_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs.i = extractelement <2 x double> %__X, i64 0
%1 = fsub double -0.000000e+00, %.rhs.i
%.rhs7.i = extractelement <2 x double> %__Y, i64 0
%2 = fsub double -0.000000e+00, %.rhs7.i
%3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
%4 = and i8 %__U, 1
%tobool.i = icmp eq i8 %4, 0
%vecext2.i = extractelement <2 x double> %__Y, i32 0
%cond.i = select i1 %tobool.i, double %vecext2.i, double %3
%vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X86-NEXT: vmovapd %xmm2, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
entry:
%0 = extractelement <2 x double> %__W, i64 0
%.rhs = extractelement <2 x double> %__X, i64 0
%1 = fsub double -0.000000e+00, %.rhs
%.rhs1 = extractelement <2 x double> %__Y, i64 0
%2 = fsub double -0.000000e+00, %.rhs1
%3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
%4 = bitcast i8 %__U to <8 x i1>
%5 = extractelement <8 x i1> %4, i64 0
%6 = select i1 %5, double %3, double %.rhs1
%7 = insertelement <2 x double> %__Y, double %6, i64 0
ret <2 x double> %7
}

define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
ret <8 x i64> %2
}

define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
ret <8 x double> %2
}

define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i16 %__U to <16 x i1>
%3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i32*
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}

define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
ret <16 x float> %2
}

define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
ret void
}

define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
ret void
}

define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompressps %zmm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i16 %__U to <16 x i1>
tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
ret void
}

define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressd %zmm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i16 %__U to <16 x i1>
tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
ret void
}

define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_add_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = add <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%add7.i = add <2 x i64> %shuffle6.i, %add4.i
%vecext.i = extractelement <2 x i64> %add7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_mul_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpsrlq $32, %ymm0, %ymm2
; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; X86-NEXT: vpsrlq $32, %ymm1, %ymm3
; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT: vpsllq $32, %ymm2, %ymm2
; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT: vpsllq $32, %ymm2, %ymm2
; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
%vecext.i = extractelement <2 x i64> %mul7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_or_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_or_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or.i = or <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%or7.i = or <2 x i64> %shuffle6.i, %or4.i
%vecext.i = extractelement <2 x i64> %or7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = add <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%add7.i = add <2 x i64> %shuffle6.i, %add4.i
%vecext.i = extractelement <2 x i64> %add7.i, i32 0
ret i64 %vecext.i
}

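; AVX512F alone has no packed 64-bit multiply (vpmullq requires AVX512DQ), so
; each v4i64/v2i64 mul below is expanded into vpmuludq/shift/add sequences.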
define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT: vpsrlq $32, %ymm0, %ymm3
; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT: vpsllq $32, %ymm2, %ymm2
; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT: vpsllq $32, %xmm2, %xmm2
; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpsrlq $32, %ymm1, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT: vpsllq $32, %ymm2, %ymm2
; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT: vpsllq $32, %xmm2, %xmm2
; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
%vecext.i = extractelement <2 x i64> %mul7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and.i = and <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%and7.i = and <2 x i64> %shuffle6.i, %and4.i
%vecext.i = extractelement <2 x i64> %and7.i, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or.i = or <4 x i64> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%or7.i = or <2 x i64> %shuffle6.i, %or4.i
%vecext.i = extractelement <2 x i64> %or7.i, i32 0
ret i64 %vecext.i
}

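; An i32 result is returned in eax on both targets, so the 32-bit reductions
; below share a single common CHECK block instead of split X86/X64 blocks.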
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_add_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%add.i = add <8 x i32> %0, %1
%2 = bitcast <8 x i32> %add.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%add5.i = add <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = add <4 x i32> %shuffle.i, %add5.i
%shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add8.i = add <4 x i32> %shuffle7.i, %add6.i
%vecext.i = extractelement <4 x i32> %add8.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_mul_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%mul.i = mul <8 x i32> %0, %1
%2 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%3 = bitcast <2 x i64> %extract3.i to <4 x i32>
%extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%4 = bitcast <2 x i64> %extract4.i to <4 x i32>
%mul5.i = mul <4 x i32> %3, %4
%shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
%shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
%vecext.i = extractelement <4 x i32> %mul8.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_or_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or25.i = or <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or526.i = or <2 x i64> %extract3.i, %extract4.i
%or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or6.i = or <4 x i32> %shuffle.i, %or5.i
%shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or8.i = or <4 x i32> %shuffle7.i, %or6.i
%vecext.i = extractelement <4 x i32> %or8.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_and_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and25.i = and <4 x i64> %extract.i, %extract2.i
%extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and526.i = and <2 x i64> %extract3.i, %extract4.i
%and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and6.i = and <4 x i32> %shuffle.i, %and5.i
%shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and8.i = and <4 x i32> %shuffle7.i, %and6.i
%vecext.i = extractelement <4 x i32> %and8.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%add.i = add <8 x i32> %4, %5
%6 = bitcast <8 x i32> %add.i to <4 x i64>
%extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract4.i to <4 x i32>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract5.i to <4 x i32>
%add6.i = add <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = add <4 x i32> %shuffle.i, %add6.i
%shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%add9.i = add <4 x i32> %shuffle8.i, %add7.i
%vecext.i = extractelement <4 x i32> %add9.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%mul.i = mul <8 x i32> %4, %5
%6 = bitcast <8 x i32> %mul.i to <4 x i64>
%extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%7 = bitcast <2 x i64> %extract5.i to <4 x i32>
%extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%8 = bitcast <2 x i64> %extract6.i to <4 x i32>
%mul7.i = mul <4 x i32> %7, %8
%shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
%shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
%vecext.i = extractelement <4 x i32> %mul10.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%and28.i = and <4 x i64> %extract.i, %extract4.i
%extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%and729.i = and <2 x i64> %extract5.i, %extract6.i
%and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%and8.i = and <4 x i32> %shuffle.i, %and7.i
%shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%and10.i = and <4 x i32> %shuffle9.i, %and8.i
%vecext.i = extractelement <4 x i32> %and10.i, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%or27.i = or <4 x i64> %extract.i, %extract3.i
%extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%or628.i = or <2 x i64> %extract4.i, %extract5.i
%or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
%shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%or7.i = or <4 x i32> %shuffle.i, %or6.i
%shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%or9.i = or <4 x i32> %shuffle8.i, %or7.i
%vecext.i = extractelement <4 x i32> %or9.i, i32 0
ret i32 %vecext.i
}

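; On i386 a floating-point result is returned in st(0), so the X86 versions of
; the FP reductions below spill the scalar to the stack and reload it with
; fldl/flds, realigning the frame where an 8-byte slot is needed.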
define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}

define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}

define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%add.i = fadd <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add5.i = fadd <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add6.i = fadd <4 x float> %add5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add8.i = fadd <4 x float> %add6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %add8.i, i32 0
ret float %vecext.i
}

define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%mul.i = fmul <8 x float> %1, %2
%extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul5.i = fmul <4 x float> %extract3.i, %extract4.i
%shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
%shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
%vecext.i = extractelement <4 x float> %mul8.i, i32 0
ret float %vecext.i
}

define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%add7.i = fadd <2 x double> %add4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %add7.i, i32 0
ret double %vecext.i
}

define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
%shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
%shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
%shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
%vecext.i = extractelement <2 x double> %mul7.i, i32 0
ret double %vecext.i
}

define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_add_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract3.i to <8 x float>
%add.i = fadd <8 x float> %3, %4
%extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%add6.i = fadd <4 x float> %extract4.i, %extract5.i
%shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%add7.i = fadd <4 x float> %add6.i, %shuffle.i
%shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%add9.i = fadd <4 x float> %add7.i, %shuffle8.i
%vecext.i = extractelement <4 x float> %add9.i, i32 0
ret float %vecext.i
}

define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%mul.i = fmul <8 x float> %3, %4
%extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%mul7.i = fmul <4 x float> %extract5.i, %extract6.i
%shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
%shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
%vecext.i = extractelement <4 x float> %mul10.i, i32 0
ret float %vecext.i
}

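; The integer min/max reductions below stay in 512-bit registers, pairing zmm
; shuffles (vshufi64x2/vpermq/vpshufd) with vpmaxsq/vpmaxuq/vpminsq/vpminuq;
; the FP versions go through the vmaxpd/vminpd intrinsics instead.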
define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp slt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp sgt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ult <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ugt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ugt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}

define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
%extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
%shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %2, i32 0
ret double %vecext.i
}

define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp sgt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp slt <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp slt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%0 = icmp ugt <8 x i64> %shuffle.i, %__W
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
%shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%2 = icmp ult <8 x i64> %1, %shuffle1.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%4 = icmp ult <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%vecext.i = extractelement <8 x i64> %5, i32 0
ret i64 %vecext.i
}

define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
%extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
%shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %2, i32 0
ret double %vecext.i
}

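; Masked reductions first blend an identity element into the masked-off lanes;
; for signed i64 max that identity is INT64_MIN.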
define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp sgt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp sgt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp sgt <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp ugt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp ugt <8 x i64> %3, %shuffle2.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
%shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp ugt <8 x i64> %5, %shuffle4.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

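; For the masked FP max the identity is -Inf, broadcast and blended under the
; mask before the reduction tree runs.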
define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
%extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
%extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
%shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
%vecext.i = extractelement <2 x double> %4, i32 0
ret double %vecext.i
}

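; Signed i64 min uses INT64_MAX as its identity; on X86 the constant-pool load
; is printed as 32-bit pairs (low word 4294967295, high word 2147483647).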
define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp slt <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp slt <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp slt <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

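; Unsigned i64 min uses all-ones (UINT64_MAX) as its identity, materialized
; with vpternlogd $255 rather than a constant-pool load.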
define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vpextrd $1, %xmm0, %edx
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
%shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = icmp ult <8 x i64> %1, %shuffle.i
%3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
%shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
%4 = icmp ult <8 x i64> %3, %shuffle3.i
%5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
%shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
%6 = icmp ult <8 x i64> %5, %shuffle5.i
%7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
%vecext.i = extractelement <8 x i64> %7, i32 0
ret i64 %vecext.i
}

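; The masked FP min mirrors the max case with a +Inf identity.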
define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: .cfi_def_cfa_register %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movb 8(%ebp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: .cfi_def_cfa %esp, 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8 %__M to <8 x i1>
%1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
%extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
%extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
%shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
%vecext.i = extractelement <2 x double> %4, i32 0
ret double %vecext.i
}

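; 32-bit reductions narrow zmm -> ymm -> xmm and finish with vpshufd lane
; swaps; the result fits in %eax on both targets, so one CHECK prefix suffices.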
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp sgt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp sgt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp sgt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp sgt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp ugt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp ugt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp ugt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp ugt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

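; The ps reductions bitcast through <8 x double> so the same <4 x i32>/<4 x
; double>-shaped extract shuffles can split the vector before the packed
; max/min intrinsic calls.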
define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
%extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
%shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
%shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
%vecext.i = extractelement <4 x float> %6, i32 0
ret float %vecext.i
}

define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp slt <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp slt <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp slt <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp slt <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
%extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%0 = bitcast <4 x i64> %extract.i to <8 x i32>
%1 = bitcast <4 x i64> %extract2.i to <8 x i32>
%2 = icmp ult <8 x i32> %0, %1
%3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
%4 = bitcast <8 x i32> %3 to <4 x i64>
%extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%5 = bitcast <2 x i64> %extract4.i to <4 x i32>
%6 = bitcast <2 x i64> %extract5.i to <4 x i32>
%7 = icmp ult <4 x i32> %5, %6
%8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
%shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%9 = icmp ult <4 x i32> %8, %shuffle.i
%10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
%shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%11 = icmp ult <4 x i32> %10, %shuffle8.i
%12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
%vecext.i = extractelement <4 x i32> %12, i32 0
ret i32 %vecext.i
}

define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT: vminps %ymm1, %ymm0, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <16 x float> %__W to <8 x double>
%extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%1 = bitcast <4 x double> %extract.i to <8 x float>
%extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = bitcast <4 x double> %extract2.i to <8 x float>
%3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
%extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
%shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
%shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
%vecext.i = extractelement <4 x float> %6, i32 0
ret float %vecext.i
}

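; The masked 32-bit reductions blend per-lane identities (INT32_MIN, 0,
; INT32_MAX, all-ones, and -Inf/+Inf for ps) under the i16 mask, then reduce
; exactly as above.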
define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp sgt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp sgt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp sgt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp sgt <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract3.i to <8 x i32>
%6 = icmp ugt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract5.i to <4 x i32>
%10 = bitcast <2 x i64> %extract6.i to <4 x i32>
%11 = icmp ugt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp ugt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp ugt <4 x i32> %14, %shuffle9.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
%extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
%shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
%shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
%vecext.i = extractelement <4 x float> %8, i32 0
ret float %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp slt <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp slt <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp slt <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp slt <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i16 %__M to <16 x i1>
%2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%3 = bitcast <16 x i32> %2 to <8 x i64>
%extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x i64> %extract.i to <8 x i32>
%5 = bitcast <4 x i64> %extract4.i to <8 x i32>
%6 = icmp ult <8 x i32> %4, %5
%7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
%8 = bitcast <8 x i32> %7 to <4 x i64>
%extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
%extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
%9 = bitcast <2 x i64> %extract6.i to <4 x i32>
%10 = bitcast <2 x i64> %extract7.i to <4 x i32>
%11 = icmp ult <4 x i32> %9, %10
%12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
%shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%13 = icmp ult <4 x i32> %12, %shuffle.i
%14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
%shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%15 = icmp ult <4 x i32> %14, %shuffle10.i
%16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
%vecext.i = extractelement <4 x i32> %16, i32 0
ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT: vminps %ymm0, %ymm1, %ymm0
; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_reduce_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT: vminps %ymm0, %ymm1, %ymm0
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i16 %__M to <16 x i1>
%1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
%2 = bitcast <16 x float> %1 to <8 x double>
%extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = bitcast <4 x double> %extract.i to <8 x float>
%extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%4 = bitcast <4 x double> %extract4.i to <8 x float>
%5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
%extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
%shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
%7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
%shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
%vecext.i = extractelement <4 x float> %8, i32 0
ret float %vecext.i
}

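; The remaining tests cover the plain masked/maskz max/min intrinsics: a
; single vmaxpd/vminpd (or ps) plus a mask select, with no reduction tree.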
define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

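; The *_round variants pass rounding mode 4 (_MM_FROUND_CUR_DIRECTION), so
; they lower to the same instruction with no embedded rounding.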
define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)

define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_max_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
ret <8 x double> %0
}

define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)

define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_max_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_max_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_max_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
ret <16 x float> %0
}

9039 define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9040 ; X86-LABEL: test_mm512_mask_min_pd:
9041 ; X86: # %bb.0: # %entry
9042 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9043 ; X86-NEXT: kmovw %eax, %k1
9044 ; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9047 ; X64-LABEL: test_mm512_mask_min_pd:
9048 ; X64: # %bb.0: # %entry
9049 ; X64-NEXT: kmovw %edi, %k1
9050 ; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9053 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9054 %1 = bitcast i8 %__U to <8 x i1>
9055 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_min_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_min_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
ret <8 x double> %0
}
define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}
define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_min_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_min_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_min_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
ret <16 x float> %0
}
define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
; CHECK-LABEL: test_mm512_sqrt_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
ret <8 x double> %0
}
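; With the default FP environment the sqrt tests go through the generic
; llvm.sqrt.* intrinsic rather than a target-specific one; it selects directly
; to vsqrtpd/vsqrtps.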
define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}
define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}
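; The sqrt_round tests pass i32 8, i.e. _MM_FROUND_TO_NEAREST_INT |
; _MM_FROUND_NO_EXC, to the target intrinsic, which selects the embedded
; round-to-nearest / suppress-all-exceptions form printed as {rn-sae}.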
define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
ret <8 x double> %2
}

declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)
define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
ret <8 x double> %2
}

define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
ret <8 x double> %0
}
define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
; CHECK-LABEL: test_mm512_sqrt_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
ret <16 x float> %0
}
define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}
define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}
define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
ret <16 x float> %2
}

declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)
define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
%1 = bitcast i16 %__U to <16 x i1>
%2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
ret <16 x float> %2
}

define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
; CHECK-LABEL: test_mm512_sqrt_round_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
ret <16 x float> %0
}
define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_rol_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprold $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <16 x i32> %1 to <8 x i64>
ret <8 x i64> %2
}
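; Constant rotates are represented with the funnel-shift intrinsics: a funnel
; shift whose two value operands are the same vector is a rotate, so
; llvm.fshl(x, x, 5) is a rotate-left by 5 and is matched to vprold/vprolq.
; As a scalar illustration (hypothetical, not part of the checked tests):
;   %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 5)
; rotates %x left by 5 bits.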
define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i64> %__W to <16 x i32>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}
define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}
define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_rol_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
ret <8 x i64> %0
}
define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}
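; The variable (rolv/rorv) forms use the same funnel-shift intrinsics with a
; per-element amount; funnel-shift amounts are taken modulo the element width,
; matching the vprolvd/vprolvq semantics.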
define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <8 x i64> %__W to <16 x i32>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}
define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}
define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
ret <8 x i64> %0
}
define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprord $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <16 x i32> %1 to <8 x i64>
ret <8 x i64> %2
}
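; Rotate-right mirrors rotate-left: llvm.fshr with both value operands equal
; is a rotate right by the given amount, matched to vprord/vprorq (and to
; vprorvd/vprorvq for the variable forms below).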
define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast <8 x i64> %__W to <16 x i32>
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}
define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
%2 = bitcast i16 %__U to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}
define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorq $5, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
ret <8 x i64> %0
}
define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}
define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast <8 x i64> %__W to <16 x i32>
%4 = bitcast i16 %__U to <16 x i1>
%5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
%6 = bitcast <16 x i32> %5 to <8 x i64>
ret <8 x i64> %6
}
define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast <8 x i64> %__B to <16 x i32>
%2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
%3 = bitcast i16 %__U to <16 x i1>
%4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
%5 = bitcast <16 x i32> %4 to <8 x i64>
ret <8 x i64> %5
}
define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
ret <8 x i64> %0
}
define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
%1 = bitcast i8 %__U to <8 x i1>
%2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
ret <8 x i64> %2
}
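; Intrinsic declarations follow. Those not used by the tests above (fma,
; expandload, compressstore, and the 128/256-bit max/min variants) are
; presumably referenced by tests earlier in the file.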
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)

declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)