1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64
5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
8 define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
9 ; X86-LABEL: test_mm512_kunpackb:
10 ; X86: # %bb.0: # %entry
11 ; X86-NEXT: pushl %ebp
12 ; X86-NEXT: .cfi_def_cfa_offset 8
13 ; X86-NEXT: .cfi_offset %ebp, -8
14 ; X86-NEXT: movl %esp, %ebp
15 ; X86-NEXT: .cfi_def_cfa_register %ebp
16 ; X86-NEXT: andl $-64, %esp
17 ; X86-NEXT: subl $64, %esp
18 ; X86-NEXT: vmovdqa64 136(%ebp), %zmm3
19 ; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
20 ; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
21 ; X86-NEXT: kunpckbw %k0, %k1, %k1
22 ; X86-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
23 ; X86-NEXT: kmovw %k0, %eax
24 ; X86-NEXT: movzwl %ax, %eax
25 ; X86-NEXT: movl %ebp, %esp
27 ; X86-NEXT: .cfi_def_cfa %esp, 4
28 ; X86-NEXT: vzeroupper
31 ; X64-LABEL: test_mm512_kunpackb:
32 ; X64: # %bb.0: # %entry
33 ; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
34 ; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
35 ; X64-NEXT: kunpckbw %k0, %k1, %k1
36 ; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
37 ; X64-NEXT: kmovw %k0, %eax
38 ; X64-NEXT: movzwl %ax, %eax
39 ; X64-NEXT: vzeroupper
42 %0 = bitcast <8 x i64> %__E to <16 x i32>
43 %1 = bitcast <8 x i64> %__F to <16 x i32>
44 %2 = bitcast <8 x i64> %__A to <16 x i32>
45 %3 = bitcast <8 x i64> %__B to <16 x i32>
46 %4 = icmp ne <16 x i32> %2, %3
47 %5 = bitcast <8 x i64> %__C to <16 x i32>
48 %6 = bitcast <8 x i64> %__D to <16 x i32>
49 %7 = icmp ne <16 x i32> %5, %6
50 %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
51 %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
52 %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
53 %11 = icmp ne <16 x i32> %0, %1
54 %12 = and <16 x i1> %11, %10
55 %13 = bitcast <16 x i1> %12 to i16
59 define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
60 ; X86-LABEL: test_mm512_kortestc:
61 ; X86: # %bb.0: # %entry
62 ; X86-NEXT: pushl %ebp
63 ; X86-NEXT: .cfi_def_cfa_offset 8
64 ; X86-NEXT: .cfi_offset %ebp, -8
65 ; X86-NEXT: movl %esp, %ebp
66 ; X86-NEXT: .cfi_def_cfa_register %ebp
67 ; X86-NEXT: andl $-64, %esp
68 ; X86-NEXT: subl $64, %esp
69 ; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
70 ; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
71 ; X86-NEXT: korw %k0, %k1, %k0
72 ; X86-NEXT: kmovw %k0, %eax
73 ; X86-NEXT: cmpw $-1, %ax
75 ; X86-NEXT: andb $1, %al
76 ; X86-NEXT: movzbl %al, %eax
77 ; X86-NEXT: movl %ebp, %esp
79 ; X86-NEXT: .cfi_def_cfa %esp, 4
80 ; X86-NEXT: vzeroupper
83 ; X64-LABEL: test_mm512_kortestc:
84 ; X64: # %bb.0: # %entry
85 ; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
86 ; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
87 ; X64-NEXT: korw %k0, %k1, %k0
88 ; X64-NEXT: kmovw %k0, %eax
89 ; X64-NEXT: cmpw $-1, %ax
91 ; X64-NEXT: andb $1, %al
92 ; X64-NEXT: movzbl %al, %eax
93 ; X64-NEXT: vzeroupper
96 %0 = bitcast <8 x i64> %__A to <16 x i32>
97 %1 = bitcast <8 x i64> %__B to <16 x i32>
98 %2 = icmp ne <16 x i32> %0, %1
99 %3 = bitcast <8 x i64> %__C to <16 x i32>
100 %4 = bitcast <8 x i64> %__D to <16 x i32>
101 %5 = icmp ne <16 x i32> %3, %4
102 %6 = or <16 x i1> %5, %2 %7 = bitcast <16 x i1> %6 to i16
103 %8 = icmp eq i16 %7, -1
104 %9 = zext i1 %8 to i32
108 define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
109 ; X86-LABEL: test_mm512_kortestz:
110 ; X86: # %bb.0: # %entry
111 ; X86-NEXT: pushl %ebp
112 ; X86-NEXT: .cfi_def_cfa_offset 8
113 ; X86-NEXT: .cfi_offset %ebp, -8
114 ; X86-NEXT: movl %esp, %ebp
115 ; X86-NEXT: .cfi_def_cfa_register %ebp
116 ; X86-NEXT: andl $-64, %esp
117 ; X86-NEXT: subl $64, %esp
118 ; X86-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
119 ; X86-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
120 ; X86-NEXT: korw %k0, %k1, %k0
121 ; X86-NEXT: kmovw %k0, %eax
122 ; X86-NEXT: cmpw $0, %ax
124 ; X86-NEXT: andb $1, %al
125 ; X86-NEXT: movzbl %al, %eax
126 ; X86-NEXT: movl %ebp, %esp
127 ; X86-NEXT: popl %ebp
128 ; X86-NEXT: .cfi_def_cfa %esp, 4
129 ; X86-NEXT: vzeroupper
132 ; X64-LABEL: test_mm512_kortestz:
133 ; X64: # %bb.0: # %entry
134 ; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
135 ; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
136 ; X64-NEXT: korw %k0, %k1, %k0
137 ; X64-NEXT: kmovw %k0, %eax
138 ; X64-NEXT: cmpw $0, %ax
140 ; X64-NEXT: andb $1, %al
141 ; X64-NEXT: movzbl %al, %eax
142 ; X64-NEXT: vzeroupper
145 %0 = bitcast <8 x i64> %__A to <16 x i32>
146 %1 = bitcast <8 x i64> %__B to <16 x i32>
147 %2 = icmp ne <16 x i32> %0, %1
148 %3 = bitcast <8 x i64> %__C to <16 x i32>
149 %4 = bitcast <8 x i64> %__D to <16 x i32>
150 %5 = icmp ne <16 x i32> %3, %4
151 %6 = or <16 x i1> %5, %2
152 %7 = bitcast <16 x i1> %6 to i16
153 %8 = icmp eq i16 %7, 0
154 %9 = zext i1 %8 to i32
158 define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
159 ; CHECK-LABEL: test_mm512_shuffle_f32x4:
160 ; CHECK: # %bb.0: # %entry
161 ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
162 ; CHECK-NEXT: ret{{[l|q]}}
164 %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
165 ret <16 x float> %shuffle
169 define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
170 ; X86-LABEL: test_mm512_mask_shuffle_f32x4:
171 ; X86: # %bb.0: # %entry
172 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
173 ; X86-NEXT: kmovw %eax, %k1
174 ; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
177 ; X64-LABEL: test_mm512_mask_shuffle_f32x4:
178 ; X64: # %bb.0: # %entry
179 ; X64-NEXT: kmovw %edi, %k1
180 ; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
183 %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
184 %0 = bitcast i16 %__U to <16 x i1>
185 %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
189 define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
190 ; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
191 ; X86: # %bb.0: # %entry
192 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
193 ; X86-NEXT: kmovw %eax, %k1
194 ; X86-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
197 ; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
198 ; X64: # %bb.0: # %entry
199 ; X64-NEXT: kmovw %edi, %k1
200 ; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
203 %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
204 %0 = bitcast i16 %__U to <16 x i1>
205 %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
209 define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
210 ; CHECK-LABEL: test_mm512_shuffle_f64x2:
211 ; CHECK: # %bb.0: # %entry
212 ; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
213 ; CHECK-NEXT: ret{{[l|q]}}
215 %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
216 ret <8 x double> %shuffle
219 define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
220 ; X86-LABEL: test_mm512_mask_shuffle_f64x2:
221 ; X86: # %bb.0: # %entry
222 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
223 ; X86-NEXT: kmovw %eax, %k1
224 ; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
227 ; X64-LABEL: test_mm512_mask_shuffle_f64x2:
228 ; X64: # %bb.0: # %entry
229 ; X64-NEXT: kmovw %edi, %k1
230 ; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
233 %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
234 %0 = bitcast i8 %__U to <8 x i1>
235 %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
239 define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
240 ; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
241 ; X86: # %bb.0: # %entry
242 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
243 ; X86-NEXT: kmovw %eax, %k1
244 ; X86-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
247 ; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
248 ; X64: # %bb.0: # %entry
249 ; X64-NEXT: kmovw %edi, %k1
250 ; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
253 %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
254 %0 = bitcast i8 %__U to <8 x i1>
255 %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
259 define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
260 ; CHECK-LABEL: test_mm512_shuffle_i32x4:
261 ; CHECK: # %bb.0: # %entry
262 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
263 ; CHECK-NEXT: ret{{[l|q]}}
265 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
266 ret <8 x i64> %shuffle
269 define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
270 ; X86-LABEL: test_mm512_mask_shuffle_i32x4:
271 ; X86: # %bb.0: # %entry
272 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
273 ; X86-NEXT: kmovw %eax, %k1
274 ; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
277 ; X64-LABEL: test_mm512_mask_shuffle_i32x4:
278 ; X64: # %bb.0: # %entry
279 ; X64-NEXT: kmovw %edi, %k1
280 ; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
283 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
284 %0 = bitcast <8 x i64> %shuffle to <16 x i32>
285 %1 = bitcast <8 x i64> %__W to <16 x i32>
286 %2 = bitcast i16 %__U to <16 x i1>
287 %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
288 %4 = bitcast <16 x i32> %3 to <8 x i64>
292 define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
293 ; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
294 ; X86: # %bb.0: # %entry
295 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
296 ; X86-NEXT: kmovw %eax, %k1
297 ; X86-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
300 ; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
301 ; X64: # %bb.0: # %entry
302 ; X64-NEXT: kmovw %edi, %k1
303 ; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
306 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
307 %0 = bitcast <8 x i64> %shuffle to <16 x i32>
308 %1 = bitcast i16 %__U to <16 x i1>
309 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
310 %3 = bitcast <16 x i32> %2 to <8 x i64>
314 define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
315 ; CHECK-LABEL: test_mm512_shuffle_i64x2:
316 ; CHECK: # %bb.0: # %entry
317 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
318 ; CHECK-NEXT: ret{{[l|q]}}
320 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
321 ret <8 x i64> %shuffle
324 define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
325 ; X86-LABEL: test_mm512_mask_shuffle_i64x2:
326 ; X86: # %bb.0: # %entry
327 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
328 ; X86-NEXT: kmovw %eax, %k1
329 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
332 ; X64-LABEL: test_mm512_mask_shuffle_i64x2:
333 ; X64: # %bb.0: # %entry
334 ; X64-NEXT: kmovw %edi, %k1
335 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
338 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
339 %0 = bitcast i8 %__U to <8 x i1>
340 %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
344 define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
345 ; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
346 ; X86: # %bb.0: # %entry
347 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
348 ; X86-NEXT: kmovw %eax, %k1
349 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
352 ; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
353 ; X64: # %bb.0: # %entry
354 ; X64-NEXT: kmovw %edi, %k1
355 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
358 %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
359 %0 = bitcast i8 %__U to <8 x i1>
360 %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
365 define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
366 ; CHECK-LABEL: test_mm512_testn_epi32_mask:
367 ; CHECK: # %bb.0: # %entry
368 ; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
369 ; CHECK-NEXT: kmovw %k0, %eax
370 ; CHECK-NEXT: movzwl %ax, %eax
371 ; CHECK-NEXT: vzeroupper
372 ; CHECK-NEXT: ret{{[l|q]}}
374 %and1.i.i = and <8 x i64> %__B, %__A
375 %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
376 %1 = icmp eq <16 x i32> %0, zeroinitializer
377 %2 = bitcast <16 x i1> %1 to i16
381 define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
382 ; X86-LABEL: test_mm512_mask_testn_epi32_mask:
383 ; X86: # %bb.0: # %entry
384 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
385 ; X86-NEXT: kmovw %eax, %k1
386 ; X86-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
387 ; X86-NEXT: kmovw %k0, %eax
388 ; X86-NEXT: movzwl %ax, %eax
389 ; X86-NEXT: vzeroupper
392 ; X64-LABEL: test_mm512_mask_testn_epi32_mask:
393 ; X64: # %bb.0: # %entry
394 ; X64-NEXT: kmovw %edi, %k1
395 ; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
396 ; X64-NEXT: kmovw %k0, %eax
397 ; X64-NEXT: movzwl %ax, %eax
398 ; X64-NEXT: vzeroupper
401 %and1.i.i = and <8 x i64> %__B, %__A
402 %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
403 %1 = icmp eq <16 x i32> %0, zeroinitializer
404 %2 = bitcast i16 %__U to <16 x i1>
405 %3 = and <16 x i1> %1, %2
406 %4 = bitcast <16 x i1> %3 to i16
410 define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
411 ; CHECK-LABEL: test_mm512_testn_epi64_mask:
412 ; CHECK: # %bb.0: # %entry
413 ; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
414 ; CHECK-NEXT: kmovw %k0, %eax
415 ; CHECK-NEXT: movzbl %al, %eax
416 ; CHECK-NEXT: vzeroupper
417 ; CHECK-NEXT: ret{{[l|q]}}
419 %and1.i.i = and <8 x i64> %__B, %__A
420 %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
421 %1 = bitcast <8 x i1> %0 to i8
425 define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
426 ; X86-LABEL: test_mm512_mask_testn_epi64_mask:
427 ; X86: # %bb.0: # %entry
428 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
429 ; X86-NEXT: kmovw %eax, %k1
430 ; X86-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
431 ; X86-NEXT: kmovw %k0, %eax
432 ; X86-NEXT: movzbl %al, %eax
433 ; X86-NEXT: vzeroupper
436 ; X64-LABEL: test_mm512_mask_testn_epi64_mask:
437 ; X64: # %bb.0: # %entry
438 ; X64-NEXT: kmovw %edi, %k1
439 ; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
440 ; X64-NEXT: kmovw %k0, %eax
441 ; X64-NEXT: movzbl %al, %eax
442 ; X64-NEXT: vzeroupper
445 %and1.i.i = and <8 x i64> %__B, %__A
446 %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
447 %1 = bitcast i8 %__U to <8 x i1>
448 %2 = and <8 x i1> %0, %1
449 %3 = bitcast <8 x i1> %2 to i8
453 define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
454 ; X86-LABEL: test_mm512_mask_test_epi32_mask:
455 ; X86: # %bb.0: # %entry
456 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
457 ; X86-NEXT: kmovw %eax, %k1
458 ; X86-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
459 ; X86-NEXT: kmovw %k0, %eax
460 ; X86-NEXT: movzwl %ax, %eax
461 ; X86-NEXT: vzeroupper
464 ; X64-LABEL: test_mm512_mask_test_epi32_mask:
465 ; X64: # %bb.0: # %entry
466 ; X64-NEXT: kmovw %edi, %k1
467 ; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
468 ; X64-NEXT: kmovw %k0, %eax
469 ; X64-NEXT: movzwl %ax, %eax
470 ; X64-NEXT: vzeroupper
473 %and1.i.i = and <8 x i64> %__B, %__A
474 %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
475 %1 = icmp ne <16 x i32> %0, zeroinitializer
476 %2 = bitcast i16 %__U to <16 x i1>
477 %3 = and <16 x i1> %1, %2
478 %4 = bitcast <16 x i1> %3 to i16
482 define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
483 ; X86-LABEL: test_mm512_mask_test_epi64_mask:
484 ; X86: # %bb.0: # %entry
485 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
486 ; X86-NEXT: kmovw %eax, %k1
487 ; X86-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
488 ; X86-NEXT: kmovw %k0, %eax
489 ; X86-NEXT: movzbl %al, %eax
490 ; X86-NEXT: vzeroupper
493 ; X64-LABEL: test_mm512_mask_test_epi64_mask:
494 ; X64: # %bb.0: # %entry
495 ; X64-NEXT: kmovw %edi, %k1
496 ; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
497 ; X64-NEXT: kmovw %k0, %eax
498 ; X64-NEXT: movzbl %al, %eax
499 ; X64-NEXT: vzeroupper
502 %and1.i.i = and <8 x i64> %__B, %__A
503 %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
504 %1 = bitcast i8 %__U to <8 x i1>
505 %2 = and <8 x i1> %0, %1
506 %3 = bitcast <8 x i1> %2 to i8
510 define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
511 ; X86-LABEL: test_mm512_mask_set1_epi32:
512 ; X86: # %bb.0: # %entry
513 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
514 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
515 ; X86-NEXT: kmovw %ecx, %k1
516 ; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
519 ; X64-LABEL: test_mm512_mask_set1_epi32:
520 ; X64: # %bb.0: # %entry
521 ; X64-NEXT: kmovw %edi, %k1
522 ; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
525 %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
526 %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
527 %0 = bitcast <8 x i64> %__O to <16 x i32>
528 %1 = bitcast i16 %__M to <16 x i1>
529 %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
530 %3 = bitcast <16 x i32> %2 to <8 x i64>
534 define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
535 ; X86-LABEL: test_mm512_maskz_set1_epi32:
536 ; X86: # %bb.0: # %entry
537 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
538 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
539 ; X86-NEXT: kmovw %ecx, %k1
540 ; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
543 ; X64-LABEL: test_mm512_maskz_set1_epi32:
544 ; X64: # %bb.0: # %entry
545 ; X64-NEXT: kmovw %edi, %k1
546 ; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
549 %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
550 %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
551 %0 = bitcast i16 %__M to <16 x i1>
552 %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
553 %2 = bitcast <16 x i32> %1 to <8 x i64>
557 define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
558 ; X86-LABEL: test_mm512_mask_set1_epi64:
559 ; X86: # %bb.0: # %entry
560 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
561 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
562 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
563 ; X86-NEXT: kmovw %eax, %k1
564 ; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
567 ; X64-LABEL: test_mm512_mask_set1_epi64:
568 ; X64: # %bb.0: # %entry
569 ; X64-NEXT: kmovw %edi, %k1
570 ; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
573 %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
574 %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
575 %0 = bitcast i8 %__M to <8 x i1>
576 %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
580 define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
581 ; X86-LABEL: test_mm512_maskz_set1_epi64:
582 ; X86: # %bb.0: # %entry
583 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
584 ; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
585 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
586 ; X86-NEXT: kmovw %eax, %k1
587 ; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
590 ; X64-LABEL: test_mm512_maskz_set1_epi64:
591 ; X64: # %bb.0: # %entry
592 ; X64-NEXT: kmovw %edi, %k1
593 ; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
596 %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
597 %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
598 %0 = bitcast i8 %__M to <8 x i1>
599 %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
604 define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
605 ; CHECK-LABEL: test_mm512_broadcastd_epi32:
607 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
608 ; CHECK-NEXT: ret{{[l|q]}}
609 %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
610 %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
611 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
615 define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
616 ; X86-LABEL: test_mm512_mask_broadcastd_epi32:
618 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
619 ; X86-NEXT: kmovw %eax, %k1
620 ; X86-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
623 ; X64-LABEL: test_mm512_mask_broadcastd_epi32:
625 ; X64-NEXT: kmovw %edi, %k1
626 ; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
628 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
629 %arg1 = bitcast i16 %a1 to <16 x i1>
630 %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
631 %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
632 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
633 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
637 define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
638 ; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
640 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
641 ; X86-NEXT: kmovw %eax, %k1
642 ; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
645 ; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
647 ; X64-NEXT: kmovw %edi, %k1
648 ; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
650 %arg0 = bitcast i16 %a0 to <16 x i1>
651 %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
652 %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
653 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
654 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
658 define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
659 ; CHECK-LABEL: test_mm512_broadcastq_epi64:
661 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
662 ; CHECK-NEXT: ret{{[l|q]}}
663 %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
667 define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
668 ; X86-LABEL: test_mm512_mask_broadcastq_epi64:
670 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
671 ; X86-NEXT: kmovw %eax, %k1
672 ; X86-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
675 ; X64-LABEL: test_mm512_mask_broadcastq_epi64:
677 ; X64-NEXT: kmovw %edi, %k1
678 ; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
680 %arg1 = bitcast i8 %a1 to <8 x i1>
681 %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
682 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
686 define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
687 ; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
689 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
690 ; X86-NEXT: kmovw %eax, %k1
691 ; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
694 ; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
696 ; X64-NEXT: kmovw %edi, %k1
697 ; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
699 %arg0 = bitcast i8 %a0 to <8 x i1>
700 %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
701 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
705 define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
706 ; CHECK-LABEL: test_mm512_broadcastsd_pd:
708 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
709 ; CHECK-NEXT: ret{{[l|q]}}
710 %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
711 ret <8 x double> %res
714 define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
715 ; X86-LABEL: test_mm512_mask_broadcastsd_pd:
717 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
718 ; X86-NEXT: kmovw %eax, %k1
719 ; X86-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
722 ; X64-LABEL: test_mm512_mask_broadcastsd_pd:
724 ; X64-NEXT: kmovw %edi, %k1
725 ; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
727 %arg1 = bitcast i8 %a1 to <8 x i1>
728 %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
729 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
730 ret <8 x double> %res1
733 define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
734 ; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
736 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
737 ; X86-NEXT: kmovw %eax, %k1
738 ; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
741 ; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
743 ; X64-NEXT: kmovw %edi, %k1
744 ; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
746 %arg0 = bitcast i8 %a0 to <8 x i1>
747 %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
748 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
749 ret <8 x double> %res1
752 define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
753 ; CHECK-LABEL: test_mm512_broadcastss_ps:
755 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
756 ; CHECK-NEXT: ret{{[l|q]}}
757 %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
758 ret <16 x float> %res
761 define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
762 ; X86-LABEL: test_mm512_mask_broadcastss_ps:
764 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
765 ; X86-NEXT: kmovw %eax, %k1
766 ; X86-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
769 ; X64-LABEL: test_mm512_mask_broadcastss_ps:
771 ; X64-NEXT: kmovw %edi, %k1
772 ; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
774 %arg1 = bitcast i16 %a1 to <16 x i1>
775 %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
776 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
777 ret <16 x float> %res1
780 define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
781 ; X86-LABEL: test_mm512_maskz_broadcastss_ps:
783 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
784 ; X86-NEXT: kmovw %eax, %k1
785 ; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
788 ; X64-LABEL: test_mm512_maskz_broadcastss_ps:
790 ; X64-NEXT: kmovw %edi, %k1
791 ; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
793 %arg0 = bitcast i16 %a0 to <16 x i1>
794 %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
795 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
796 ret <16 x float> %res1
799 define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
800 ; CHECK-LABEL: test_mm512_movedup_pd:
802 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
803 ; CHECK-NEXT: ret{{[l|q]}}
804 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
805 ret <8 x double> %res
808 define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
809 ; X86-LABEL: test_mm512_mask_movedup_pd:
811 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
812 ; X86-NEXT: kmovw %eax, %k1
813 ; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
816 ; X64-LABEL: test_mm512_mask_movedup_pd:
818 ; X64-NEXT: kmovw %edi, %k1
819 ; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
821 %arg1 = bitcast i8 %a1 to <8 x i1>
822 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
823 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
824 ret <8 x double> %res1
827 define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
828 ; X86-LABEL: test_mm512_maskz_movedup_pd:
830 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
831 ; X86-NEXT: kmovw %eax, %k1
832 ; X86-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
835 ; X64-LABEL: test_mm512_maskz_movedup_pd:
837 ; X64-NEXT: kmovw %edi, %k1
838 ; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
840 %arg0 = bitcast i8 %a0 to <8 x i1>
841 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
842 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
843 ret <8 x double> %res1
846 define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
847 ; CHECK-LABEL: test_mm512_movehdup_ps:
849 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
850 ; CHECK-NEXT: ret{{[l|q]}}
851 %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
852 ret <16 x float> %res
855 define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
856 ; X86-LABEL: test_mm512_mask_movehdup_ps:
858 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
859 ; X86-NEXT: kmovw %eax, %k1
860 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
863 ; X64-LABEL: test_mm512_mask_movehdup_ps:
865 ; X64-NEXT: kmovw %edi, %k1
866 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
868 %arg1 = bitcast i16 %a1 to <16 x i1>
869 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
870 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
871 ret <16 x float> %res1
874 define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
875 ; X86-LABEL: test_mm512_maskz_movehdup_ps:
877 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
878 ; X86-NEXT: kmovw %eax, %k1
879 ; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
882 ; X64-LABEL: test_mm512_maskz_movehdup_ps:
884 ; X64-NEXT: kmovw %edi, %k1
885 ; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
887 %arg0 = bitcast i16 %a0 to <16 x i1>
888 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
889 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
890 ret <16 x float> %res1
893 define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
894 ; CHECK-LABEL: test_mm512_moveldup_ps:
896 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
897 ; CHECK-NEXT: ret{{[l|q]}}
898 %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
899 ret <16 x float> %res
902 define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
903 ; X86-LABEL: test_mm512_mask_moveldup_ps:
905 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
906 ; X86-NEXT: kmovw %eax, %k1
907 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
910 ; X64-LABEL: test_mm512_mask_moveldup_ps:
912 ; X64-NEXT: kmovw %edi, %k1
913 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
915 %arg1 = bitcast i16 %a1 to <16 x i1>
916 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
917 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
918 ret <16 x float> %res1
921 define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
922 ; X86-LABEL: test_mm512_maskz_moveldup_ps:
924 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
925 ; X86-NEXT: kmovw %eax, %k1
926 ; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
929 ; X64-LABEL: test_mm512_maskz_moveldup_ps:
931 ; X64-NEXT: kmovw %edi, %k1
932 ; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
934 %arg0 = bitcast i16 %a0 to <16 x i1>
935 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
936 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
937 ret <16 x float> %res1
940 define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
941 ; CHECK-LABEL: test_mm512_permute_pd:
943 ; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
944 ; CHECK-NEXT: ret{{[l|q]}}
945 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
946 ret <8 x double> %res
949 define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
950 ; X86-LABEL: test_mm512_mask_permute_pd:
952 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
953 ; X86-NEXT: kmovw %eax, %k1
954 ; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
957 ; X64-LABEL: test_mm512_mask_permute_pd:
959 ; X64-NEXT: kmovw %edi, %k1
960 ; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
962 %arg1 = bitcast i8 %a1 to <8 x i1>
963 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
964 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
965 ret <8 x double> %res1
968 define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
969 ; X86-LABEL: test_mm512_maskz_permute_pd:
971 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
972 ; X86-NEXT: kmovw %eax, %k1
973 ; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
976 ; X64-LABEL: test_mm512_maskz_permute_pd:
978 ; X64-NEXT: kmovw %edi, %k1
979 ; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
981 %arg0 = bitcast i8 %a0 to <8 x i1>
982 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
983 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
984 ret <8 x double> %res1
987 define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
988 ; CHECK-LABEL: test_mm512_permute_ps:
990 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
991 ; CHECK-NEXT: ret{{[l|q]}}
992 %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
993 ret <16 x float> %res
996 define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
997 ; X86-LABEL: test_mm512_mask_permute_ps:
999 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1000 ; X86-NEXT: kmovw %eax, %k1
1001 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1004 ; X64-LABEL: test_mm512_mask_permute_ps:
1006 ; X64-NEXT: kmovw %edi, %k1
1007 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1009 %arg1 = bitcast i16 %a1 to <16 x i1>
1010 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
1011 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
1012 ret <16 x float> %res1
1015 define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
1016 ; X86-LABEL: test_mm512_maskz_permute_ps:
1018 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1019 ; X86-NEXT: kmovw %eax, %k1
1020 ; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1023 ; X64-LABEL: test_mm512_maskz_permute_ps:
1025 ; X64-NEXT: kmovw %edi, %k1
1026 ; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
1028 %arg0 = bitcast i16 %a0 to <16 x i1>
1029 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
1030 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1031 ret <16 x float> %res1
1034 define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
1035 ; CHECK-LABEL: test_mm512_permutex_epi64:
1037 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
1038 ; CHECK-NEXT: ret{{[l|q]}}
1039 %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1043 define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
1044 ; X86-LABEL: test_mm512_mask_permutex_epi64:
1046 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1047 ; X86-NEXT: kmovw %eax, %k1
1048 ; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1051 ; X64-LABEL: test_mm512_mask_permutex_epi64:
1053 ; X64-NEXT: kmovw %edi, %k1
1054 ; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1056 %arg1 = bitcast i8 %a1 to <8 x i1>
1057 %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1058 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
1062 define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
1063 ; X86-LABEL: test_mm512_maskz_permutex_epi64:
1065 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1066 ; X86-NEXT: kmovw %eax, %k1
1067 ; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1070 ; X64-LABEL: test_mm512_maskz_permutex_epi64:
1072 ; X64-NEXT: kmovw %edi, %k1
1073 ; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1075 %arg0 = bitcast i8 %a0 to <8 x i1>
1076 %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1077 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
1081 define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
1082 ; CHECK-LABEL: test_mm512_permutex_pd:
1084 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
1085 ; CHECK-NEXT: ret{{[l|q]}}
1086 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1087 ret <8 x double> %res
1090 define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
1091 ; X86-LABEL: test_mm512_mask_permutex_pd:
1093 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1094 ; X86-NEXT: kmovw %eax, %k1
1095 ; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1098 ; X64-LABEL: test_mm512_mask_permutex_pd:
1100 ; X64-NEXT: kmovw %edi, %k1
1101 ; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
1103 %arg1 = bitcast i8 %a1 to <8 x i1>
1104 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1105 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1106 ret <8 x double> %res1
1109 define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
1110 ; X86-LABEL: test_mm512_maskz_permutex_pd:
1112 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1113 ; X86-NEXT: kmovw %eax, %k1
1114 ; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1117 ; X64-LABEL: test_mm512_maskz_permutex_pd:
1119 ; X64-NEXT: kmovw %edi, %k1
1120 ; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
1122 %arg0 = bitcast i8 %a0 to <8 x i1>
1123 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
1124 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1125 ret <8 x double> %res1
1128 define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
1129 ; CHECK-LABEL: test_mm512_shuffle_epi32:
1131 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1132 ; CHECK-NEXT: ret{{[l|q]}}
1133 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1134 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1135 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
1139 define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
1140 ; X86-LABEL: test_mm512_mask_shuffle_epi32:
1142 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1143 ; X86-NEXT: kmovw %eax, %k1
1144 ; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1147 ; X64-LABEL: test_mm512_mask_shuffle_epi32:
1149 ; X64-NEXT: kmovw %edi, %k1
1150 ; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1152 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1153 %arg1 = bitcast i16 %a1 to <16 x i1>
1154 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1155 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1156 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1157 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1161 define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
1162 ; X86-LABEL: test_mm512_maskz_shuffle_epi32:
1164 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1165 ; X86-NEXT: kmovw %eax, %k1
1166 ; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1169 ; X64-LABEL: test_mm512_maskz_shuffle_epi32:
1171 ; X64-NEXT: kmovw %edi, %k1
1172 ; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
1174 %arg0 = bitcast i16 %a0 to <16 x i1>
1175 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1176 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
1177 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1178 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1182 define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
1183 ; CHECK-LABEL: test_mm512_shuffle_pd:
1185 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1186 ; CHECK-NEXT: ret{{[l|q]}}
1187 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1188 ret <8 x double> %res
1191 define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1192 ; X86-LABEL: test_mm512_mask_shuffle_pd:
1194 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1195 ; X86-NEXT: kmovw %eax, %k1
1196 ; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1199 ; X64-LABEL: test_mm512_mask_shuffle_pd:
1201 ; X64-NEXT: kmovw %edi, %k1
1202 ; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1204 %arg1 = bitcast i8 %a1 to <8 x i1>
1205 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1206 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1207 ret <8 x double> %res1
1210 define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1211 ; X86-LABEL: test_mm512_maskz_shuffle_pd:
1213 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1214 ; X86-NEXT: kmovw %eax, %k1
1215 ; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1218 ; X64-LABEL: test_mm512_maskz_shuffle_pd:
1220 ; X64-NEXT: kmovw %edi, %k1
1221 ; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1223 %arg0 = bitcast i8 %a0 to <8 x i1>
1224 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
1225 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1226 ret <8 x double> %res1
1229 define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
1230 ; CHECK-LABEL: test_mm512_unpackhi_epi32:
1232 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1233 ; CHECK-NEXT: ret{{[l|q]}}
1234 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1235 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1236 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1237 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
1241 define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1242 ; X86-LABEL: test_mm512_mask_unpackhi_epi32:
1244 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1245 ; X86-NEXT: kmovw %eax, %k1
1246 ; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1249 ; X64-LABEL: test_mm512_mask_unpackhi_epi32:
1251 ; X64-NEXT: kmovw %edi, %k1
1252 ; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1254 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1255 %arg1 = bitcast i16 %a1 to <16 x i1>
1256 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1257 %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
1258 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1259 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1260 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1264 define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1265 ; X86-LABEL: test_mm512_maskz_unpackhi_epi32:
1267 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1268 ; X86-NEXT: kmovw %eax, %k1
1269 ; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1272 ; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
1274 ; X64-NEXT: kmovw %edi, %k1
1275 ; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1277 %arg0 = bitcast i16 %a0 to <16 x i1>
1278 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1279 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1280 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1281 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1282 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1286 define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
1287 ; CHECK-LABEL: test_mm512_unpackhi_epi64:
1289 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1290 ; CHECK-NEXT: ret{{[l|q]}}
1291 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1295 define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1296 ; X86-LABEL: test_mm512_mask_unpackhi_epi64:
1298 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1299 ; X86-NEXT: kmovw %eax, %k1
1300 ; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1303 ; X64-LABEL: test_mm512_mask_unpackhi_epi64:
1305 ; X64-NEXT: kmovw %edi, %k1
1306 ; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1308 %arg1 = bitcast i8 %a1 to <8 x i1>
1309 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1310 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
1314 define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1315 ; X86-LABEL: test_mm512_maskz_unpackhi_epi64:
1317 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1318 ; X86-NEXT: kmovw %eax, %k1
1319 ; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1322 ; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
1324 ; X64-NEXT: kmovw %edi, %k1
1325 ; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1327 %arg0 = bitcast i8 %a0 to <8 x i1>
1328 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1329 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
1333 define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
1334 ; CHECK-LABEL: test_mm512_unpackhi_pd:
1336 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1337 ; CHECK-NEXT: ret{{[l|q]}}
1338 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1339 ret <8 x double> %res
1342 define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1343 ; X86-LABEL: test_mm512_mask_unpackhi_pd:
1345 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1346 ; X86-NEXT: kmovw %eax, %k1
1347 ; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1350 ; X64-LABEL: test_mm512_mask_unpackhi_pd:
1352 ; X64-NEXT: kmovw %edi, %k1
1353 ; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
1355 %arg1 = bitcast i8 %a1 to <8 x i1>
1356 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1357 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1358 ret <8 x double> %res1
1361 define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1362 ; X86-LABEL: test_mm512_maskz_unpackhi_pd:
1364 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1365 ; X86-NEXT: kmovw %eax, %k1
1366 ; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1369 ; X64-LABEL: test_mm512_maskz_unpackhi_pd:
1371 ; X64-NEXT: kmovw %edi, %k1
1372 ; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
1374 %arg0 = bitcast i8 %a0 to <8 x i1>
1375 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
1376 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1377 ret <8 x double> %res1
1380 define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
1381 ; CHECK-LABEL: test_mm512_unpackhi_ps:
1383 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1384 ; CHECK-NEXT: ret{{[l|q]}}
1385 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1386 ret <16 x float> %res
1389 define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
1390 ; X86-LABEL: test_mm512_mask_unpackhi_ps:
1392 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1393 ; X86-NEXT: kmovw %eax, %k1
1394 ; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1397 ; X64-LABEL: test_mm512_mask_unpackhi_ps:
1399 ; X64-NEXT: kmovw %edi, %k1
1400 ; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
1402 %arg1 = bitcast i16 %a1 to <16 x i1>
1403 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1404 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
1405 ret <16 x float> %res1
1408 define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
1409 ; X86-LABEL: test_mm512_maskz_unpackhi_ps:
1411 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1412 ; X86-NEXT: kmovw %eax, %k1
1413 ; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1416 ; X64-LABEL: test_mm512_maskz_unpackhi_ps:
1418 ; X64-NEXT: kmovw %edi, %k1
1419 ; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
1421 %arg0 = bitcast i16 %a0 to <16 x i1>
1422 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
1423 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1424 ret <16 x float> %res1
1427 define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
1428 ; CHECK-LABEL: test_mm512_unpacklo_epi32:
1430 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1431 ; CHECK-NEXT: ret{{[l|q]}}
1432 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1433 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1434 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1435 %res1 = bitcast <16 x i32> %res0 to <8 x i64>
1439 define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1440 ; X86-LABEL: test_mm512_mask_unpacklo_epi32:
1442 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1443 ; X86-NEXT: kmovw %eax, %k1
1444 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1447 ; X64-LABEL: test_mm512_mask_unpacklo_epi32:
1449 ; X64-NEXT: kmovw %edi, %k1
1450 ; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1452 %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
1453 %arg1 = bitcast i16 %a1 to <16 x i1>
1454 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1455 %arg3 = bitcast <8 x i64> %a3 to <16 x i32>
1456 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1457 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
1458 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1462 define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1463 ; X86-LABEL: test_mm512_maskz_unpacklo_epi32:
1465 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1466 ; X86-NEXT: kmovw %eax, %k1
1467 ; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1470 ; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
1472 ; X64-NEXT: kmovw %edi, %k1
1473 ; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1475 %arg0 = bitcast i16 %a0 to <16 x i1>
1476 %arg1 = bitcast <8 x i64> %a1 to <16 x i32>
1477 %arg2 = bitcast <8 x i64> %a2 to <16 x i32>
1478 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1479 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
1480 %res2 = bitcast <16 x i32> %res1 to <8 x i64>
1484 define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
1485 ; CHECK-LABEL: test_mm512_unpacklo_epi64:
1487 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1488 ; CHECK-NEXT: ret{{[l|q]}}
1489 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1493 define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
1494 ; X86-LABEL: test_mm512_mask_unpacklo_epi64:
1496 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1497 ; X86-NEXT: kmovw %eax, %k1
1498 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1501 ; X64-LABEL: test_mm512_mask_unpacklo_epi64:
1503 ; X64-NEXT: kmovw %edi, %k1
1504 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1506 %arg1 = bitcast i8 %a1 to <8 x i1>
1507 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1508 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
1512 define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
1513 ; X86-LABEL: test_mm512_maskz_unpacklo_epi64:
1515 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1516 ; X86-NEXT: kmovw %eax, %k1
1517 ; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1520 ; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
1522 ; X64-NEXT: kmovw %edi, %k1
1523 ; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1525 %arg0 = bitcast i8 %a0 to <8 x i1>
1526 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1527 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
1531 define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
1532 ; CHECK-LABEL: test_mm512_unpacklo_pd:
1534 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1535 ; CHECK-NEXT: ret{{[l|q]}}
1536 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1537 ret <8 x double> %res
1540 define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
1541 ; X86-LABEL: test_mm512_mask_unpacklo_pd:
1543 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1544 ; X86-NEXT: kmovw %eax, %k1
1545 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1548 ; X64-LABEL: test_mm512_mask_unpacklo_pd:
1550 ; X64-NEXT: kmovw %edi, %k1
1551 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
1553 %arg1 = bitcast i8 %a1 to <8 x i1>
1554 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1555 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
1556 ret <8 x double> %res1
1559 define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
1560 ; X86-LABEL: test_mm512_maskz_unpacklo_pd:
1562 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1563 ; X86-NEXT: kmovw %eax, %k1
1564 ; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1567 ; X64-LABEL: test_mm512_maskz_unpacklo_pd:
1569 ; X64-NEXT: kmovw %edi, %k1
1570 ; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
1572 %arg0 = bitcast i8 %a0 to <8 x i1>
1573 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
1574 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
1575 ret <8 x double> %res1
1578 define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
1579 ; CHECK-LABEL: test_mm512_unpacklo_ps:
1581 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1582 ; CHECK-NEXT: ret{{[l|q]}}
1583 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1584 ret <16 x float> %res
1587 define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
1588 ; X86-LABEL: test_mm512_mask_unpacklo_ps:
1590 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1591 ; X86-NEXT: kmovw %eax, %k1
1592 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1595 ; X64-LABEL: test_mm512_mask_unpacklo_ps:
1597 ; X64-NEXT: kmovw %edi, %k1
1598 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
1600 %arg1 = bitcast i16 %a1 to <16 x i1>
1601 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1602 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
1603 ret <16 x float> %res1
1606 define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
1607 ; X86-LABEL: test_mm512_maskz_unpacklo_ps:
1609 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
1610 ; X86-NEXT: kmovw %eax, %k1
1611 ; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1614 ; X64-LABEL: test_mm512_maskz_unpacklo_ps:
1616 ; X64-NEXT: kmovw %edi, %k1
1617 ; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
1619 %arg0 = bitcast i16 %a0 to <16 x i1>
1620 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
1621 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
1622 ret <16 x float> %res1
1625 define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
1626 ; CHECK-LABEL: test_mm512_zextpd128_pd512:
1628 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1629 ; CHECK-NEXT: ret{{[l|q]}}
1630 %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1631 ret <8 x double> %res
1634 define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
1635 ; CHECK-LABEL: test_mm512_zextpd256_pd512:
1637 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1638 ; CHECK-NEXT: ret{{[l|q]}}
1639 %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1640 ret <8 x double> %res
1643 define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
1644 ; CHECK-LABEL: test_mm512_zextps128_ps512:
1646 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1647 ; CHECK-NEXT: ret{{[l|q]}}
1648 %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
1649 ret <16 x float> %res
1652 define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
1653 ; CHECK-LABEL: test_mm512_zextps256_ps512:
1655 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1656 ; CHECK-NEXT: ret{{[l|q]}}
1657 %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1658 ret <16 x float> %res
1661 define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
1662 ; CHECK-LABEL: test_mm512_zextsi128_si512:
1664 ; CHECK-NEXT: vmovaps %xmm0, %xmm0
1665 ; CHECK-NEXT: ret{{[l|q]}}
1666 %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
1670 define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
1671 ; CHECK-LABEL: test_mm512_zextsi256_si512:
1673 ; CHECK-NEXT: vmovaps %ymm0, %ymm0
1674 ; CHECK-NEXT: ret{{[l|q]}}
1675 %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1679 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1680 ; CHECK-LABEL: test_mm512_mul_epi32:
1682 ; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
1683 ; CHECK-NEXT: ret{{[l|q]}}
1684 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1685 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1686 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1687 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1688 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1
1692 define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1693 ; X86-LABEL: test_mm512_maskz_mul_epi32:
1694 ; X86: # %bb.0: # %entry
1695 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1696 ; X86-NEXT: kmovw %eax, %k1
1697 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1700 ; X64-LABEL: test_mm512_maskz_mul_epi32:
1701 ; X64: # %bb.0: # %entry
1702 ; X64-NEXT: kmovw %edi, %k1
1703 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z}
1706 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1707 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1708 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1709 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1710 %4 = mul nsw <8 x i64> %3, %1
1711 %5 = bitcast i8 %__k to <8 x i1>
1712 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
1716 define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1717 ; X86-LABEL: test_mm512_mask_mul_epi32:
1718 ; X86: # %bb.0: # %entry
1719 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1720 ; X86-NEXT: kmovw %eax, %k1
1721 ; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1722 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1725 ; X64-LABEL: test_mm512_mask_mul_epi32:
1726 ; X64: # %bb.0: # %entry
1727 ; X64-NEXT: kmovw %edi, %k1
1728 ; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1}
1729 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1732 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1733 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1734 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1735 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
1736 %4 = mul nsw <8 x i64> %3, %1
1737 %5 = bitcast i8 %__k to <8 x i1>
1738 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src
1742 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
1743 ; CHECK-LABEL: test_mm512_mul_epu32:
1745 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
1746 ; CHECK-NEXT: ret{{[l|q]}}
1747 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1748 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1749 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp
1753 define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind {
1754 ; X86-LABEL: test_mm512_maskz_mul_epu32:
1755 ; X86: # %bb.0: # %entry
1756 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1757 ; X86-NEXT: kmovw %eax, %k1
1758 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1761 ; X64-LABEL: test_mm512_maskz_mul_epu32:
1762 ; X64: # %bb.0: # %entry
1763 ; X64-NEXT: kmovw %edi, %k1
1764 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z}
1767 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1768 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1769 %2 = mul nuw <8 x i64> %1, %0
1770 %3 = bitcast i8 %__k to <8 x i1>
1771 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
1775 define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind {
1776 ; X86-LABEL: test_mm512_mask_mul_epu32:
1777 ; X86: # %bb.0: # %entry
1778 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1779 ; X86-NEXT: kmovw %eax, %k1
1780 ; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1781 ; X86-NEXT: vmovdqa64 %zmm2, %zmm0
1784 ; X64-LABEL: test_mm512_mask_mul_epu32:
1785 ; X64: # %bb.0: # %entry
1786 ; X64-NEXT: kmovw %edi, %k1
1787 ; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1}
1788 ; X64-NEXT: vmovdqa64 %zmm2, %zmm0
1791 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1792 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1793 %2 = mul nuw <8 x i64> %1, %0
1794 %3 = bitcast i8 %__k to <8 x i1>
1795 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src
1799 define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind {
1800 ; X86-LABEL: test_mm512_set1_epi8:
1801 ; X86: # %bb.0: # %entry
1802 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1803 ; X86-NEXT: vmovd %eax, %xmm0
1804 ; X86-NEXT: vpbroadcastb %xmm0, %ymm0
1805 ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1808 ; X64-LABEL: test_mm512_set1_epi8:
1809 ; X64: # %bb.0: # %entry
1810 ; X64-NEXT: vmovd %edi, %xmm0
1811 ; X64-NEXT: vpbroadcastb %xmm0, %ymm0
1812 ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
1815 %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
1816 %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
1817 %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
1821 define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
1822 ; X86-LABEL: test_mm_cvtu32_sd:
1823 ; X86: # %bb.0: # %entry
1824 ; X86-NEXT: vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
1827 ; X64-LABEL: test_mm_cvtu32_sd:
1828 ; X64: # %bb.0: # %entry
1829 ; X64-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
1832 %conv.i = uitofp i32 %__B to double
1833 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1834 ret <2 x double> %vecins.i
1837 define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
1838 ; X86-LABEL: test_mm_cvtu64_sd:
1839 ; X86: # %bb.0: # %entry
1840 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1841 ; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1842 ; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
1843 ; X86-NEXT: vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
1844 ; X86-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
1845 ; X86-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1848 ; X64-LABEL: test_mm_cvtu64_sd:
1849 ; X64: # %bb.0: # %entry
1850 ; X64-NEXT: vcvtusi2sdq %rdi, %xmm0, %xmm0
1853 %conv.i = uitofp i64 %__B to double
1854 %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
1855 ret <2 x double> %vecins.i
1858 define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
1859 ; X86-LABEL: test_mm_cvtu32_ss:
1860 ; X86: # %bb.0: # %entry
1861 ; X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
1864 ; X64-LABEL: test_mm_cvtu32_ss:
1865 ; X64: # %bb.0: # %entry
1866 ; X64-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
1869 %conv.i = uitofp i32 %__B to float
1870 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1871 ret <4 x float> %vecins.i
1874 define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
1875 ; X86-LABEL: test_mm_cvtu64_ss:
1876 ; X86: # %bb.0: # %entry
1877 ; X86-NEXT: pushl %ebp
1878 ; X86-NEXT: .cfi_def_cfa_offset 8
1879 ; X86-NEXT: .cfi_offset %ebp, -8
1880 ; X86-NEXT: movl %esp, %ebp
1881 ; X86-NEXT: .cfi_def_cfa_register %ebp
1882 ; X86-NEXT: andl $-8, %esp
1883 ; X86-NEXT: subl $16, %esp
1884 ; X86-NEXT: movl 12(%ebp), %eax
1885 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1886 ; X86-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
1887 ; X86-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
1888 ; X86-NEXT: xorl %ecx, %ecx
1889 ; X86-NEXT: testl %eax, %eax
1890 ; X86-NEXT: setns %cl
1891 ; X86-NEXT: fildll {{[0-9]+}}(%esp)
1892 ; X86-NEXT: fadds {{\.LCPI.*}}(,%ecx,4)
1893 ; X86-NEXT: fstps {{[0-9]+}}(%esp)
1894 ; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1895 ; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1896 ; X86-NEXT: movl %ebp, %esp
1897 ; X86-NEXT: popl %ebp
1898 ; X86-NEXT: .cfi_def_cfa %esp, 4
1901 ; X64-LABEL: test_mm_cvtu64_ss:
1902 ; X64: # %bb.0: # %entry
1903 ; X64-NEXT: vcvtusi2ssq %rdi, %xmm0, %xmm0
1906 %conv.i = uitofp i64 %__B to float
1907 %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
1908 ret <4 x float> %vecins.i
1911 define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
1912 ; CHECK-LABEL: test_mm512_cvtps_pd:
1913 ; CHECK: # %bb.0: # %entry
1914 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1915 ; CHECK-NEXT: ret{{[l|q]}}
1917 %conv.i = fpext <8 x float> %__A to <8 x double>
1918 ret <8 x double> %conv.i
1921 define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) {
1922 ; CHECK-LABEL: test_mm512_cvtpslo_pd:
1923 ; CHECK: # %bb.0: # %entry
1924 ; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0
1925 ; CHECK-NEXT: ret{{[l|q]}}
1927 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1928 %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
1929 ret <8 x double> %conv.i.i
1932 define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) {
1933 ; X86-LABEL: test_mm512_mask_cvtps_pd:
1934 ; X86: # %bb.0: # %entry
1935 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1936 ; X86-NEXT: kmovw %eax, %k1
1937 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1940 ; X64-LABEL: test_mm512_mask_cvtps_pd:
1941 ; X64: # %bb.0: # %entry
1942 ; X64-NEXT: kmovw %edi, %k1
1943 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1946 %conv.i.i = fpext <8 x float> %__A to <8 x double>
1947 %0 = bitcast i8 %__U to <8 x i1>
1948 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W
1952 define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) {
1953 ; X86-LABEL: test_mm512_mask_cvtpslo_pd:
1954 ; X86: # %bb.0: # %entry
1955 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1956 ; X86-NEXT: kmovw %eax, %k1
1957 ; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1960 ; X64-LABEL: test_mm512_mask_cvtpslo_pd:
1961 ; X64: # %bb.0: # %entry
1962 ; X64-NEXT: kmovw %edi, %k1
1963 ; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1}
1966 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1967 %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double>
1968 %0 = bitcast i8 %__U to <8 x i1>
1969 %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W
1973 define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) {
1974 ; X86-LABEL: test_mm512_maskz_cvtps_pd:
1975 ; X86: # %bb.0: # %entry
1976 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
1977 ; X86-NEXT: kmovw %eax, %k1
1978 ; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
1981 ; X64-LABEL: test_mm512_maskz_cvtps_pd:
1982 ; X64: # %bb.0: # %entry
1983 ; X64-NEXT: kmovw %edi, %k1
1984 ; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z}
1987 %conv.i.i = fpext <8 x float> %__A to <8 x double>
1988 %0 = bitcast i8 %__U to <8 x i1>
1989 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer
1993 define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) {
1994 ; CHECK-LABEL: test_mm512_cvtepi32_epi8:
1995 ; CHECK: # %bb.0: # %entry
1996 ; CHECK-NEXT: vpmovdb %zmm0, %xmm0
1997 ; CHECK-NEXT: vzeroupper
1998 ; CHECK-NEXT: ret{{[l|q]}}
2000 %0 = bitcast <8 x i64> %__A to <16 x i32>
2001 %conv.i = trunc <16 x i32> %0 to <16 x i8>
2002 %1 = bitcast <16 x i8> %conv.i to <2 x i64>
2006 define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) {
2007 ; X86-LABEL: test_mm512_mask_cvtepi32_epi8:
2008 ; X86: # %bb.0: # %entry
2009 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2010 ; X86-NEXT: kmovw %eax, %k1
2011 ; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
2012 ; X86-NEXT: vzeroupper
2015 ; X64-LABEL: test_mm512_mask_cvtepi32_epi8:
2016 ; X64: # %bb.0: # %entry
2017 ; X64-NEXT: kmovw %edi, %k1
2018 ; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1}
2019 ; X64-NEXT: vzeroupper
2022 %0 = bitcast <8 x i64> %__A to <16 x i32>
2023 %1 = bitcast <2 x i64> %__O to <16 x i8>
2024 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M)
2025 %3 = bitcast <16 x i8> %2 to <2 x i64>
2029 define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) {
2030 ; X86-LABEL: test_mm512_maskz_cvtepi32_epi8:
2031 ; X86: # %bb.0: # %entry
2032 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2033 ; X86-NEXT: kmovw %eax, %k1
2034 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2035 ; X86-NEXT: vzeroupper
2038 ; X64-LABEL: test_mm512_maskz_cvtepi32_epi8:
2039 ; X64: # %bb.0: # %entry
2040 ; X64-NEXT: kmovw %edi, %k1
2041 ; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
2042 ; X64-NEXT: vzeroupper
2045 %0 = bitcast <8 x i64> %__A to <16 x i32>
2046 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M)
2047 %2 = bitcast <16 x i8> %1 to <2 x i64>
2051 define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) {
2052 ; CHECK-LABEL: test_mm512_cvtepi64_epi32:
2053 ; CHECK: # %bb.0: # %entry
2054 ; CHECK-NEXT: vpmovqd %zmm0, %ymm0
2055 ; CHECK-NEXT: ret{{[l|q]}}
2057 %conv.i = trunc <8 x i64> %__A to <8 x i32>
2058 %0 = bitcast <8 x i32> %conv.i to <4 x i64>
2062 define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2063 ; X86-LABEL: test_mm512_mask_cvtepi64_epi32:
2064 ; X86: # %bb.0: # %entry
2065 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2066 ; X86-NEXT: kmovw %eax, %k1
2067 ; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2070 ; X64-LABEL: test_mm512_mask_cvtepi64_epi32:
2071 ; X64: # %bb.0: # %entry
2072 ; X64-NEXT: kmovw %edi, %k1
2073 ; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1}
2076 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2077 %0 = bitcast <4 x i64> %__O to <8 x i32>
2078 %1 = bitcast i8 %__M to <8 x i1>
2079 %2 = select <8 x i1> %1, <8 x i32> %conv.i.i, <8 x i32> %0
2080 %3 = bitcast <8 x i32> %2 to <4 x i64>
2084 define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) {
2085 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi32:
2086 ; X86: # %bb.0: # %entry
2087 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2088 ; X86-NEXT: kmovw %eax, %k1
2089 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2092 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi32:
2093 ; X64: # %bb.0: # %entry
2094 ; X64-NEXT: kmovw %edi, %k1
2095 ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z}
2098 %conv.i.i = trunc <8 x i64> %__A to <8 x i32>
2099 %0 = bitcast i8 %__M to <8 x i1>
2100 %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer
2101 %2 = bitcast <8 x i32> %1 to <4 x i64>
2105 define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) {
2106 ; CHECK-LABEL: test_mm512_cvtepi64_epi16:
2107 ; CHECK: # %bb.0: # %entry
2108 ; CHECK-NEXT: vpmovqw %zmm0, %xmm0
2109 ; CHECK-NEXT: vzeroupper
2110 ; CHECK-NEXT: ret{{[l|q]}}
2112 %conv.i = trunc <8 x i64> %__A to <8 x i16>
2113 %0 = bitcast <8 x i16> %conv.i to <2 x i64>
2117 define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) {
2118 ; X86-LABEL: test_mm512_mask_cvtepi64_epi16:
2119 ; X86: # %bb.0: # %entry
2120 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2121 ; X86-NEXT: kmovw %eax, %k1
2122 ; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2123 ; X86-NEXT: vzeroupper
2126 ; X64-LABEL: test_mm512_mask_cvtepi64_epi16:
2127 ; X64: # %bb.0: # %entry
2128 ; X64-NEXT: kmovw %edi, %k1
2129 ; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1}
2130 ; X64-NEXT: vzeroupper
2133 %0 = bitcast <2 x i64> %__O to <8 x i16>
2134 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M)
2135 %2 = bitcast <8 x i16> %1 to <2 x i64>
2139 define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
2140 ; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
2141 ; X86: # %bb.0: # %entry
2142 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2143 ; X86-NEXT: kmovw %eax, %k1
2144 ; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2145 ; X86-NEXT: vzeroupper
2148 ; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
2149 ; X64: # %bb.0: # %entry
2150 ; X64-NEXT: kmovw %edi, %k1
2151 ; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z}
2152 ; X64-NEXT: vzeroupper
2155 %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
2156 %1 = bitcast <8 x i16> %0 to <2 x i64>
2160 declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
2161 declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
2163 define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2164 ; CHECK-LABEL: test_mm512_ternarylogic_epi32:
2165 ; CHECK: # %bb.0: # %entry
2166 ; CHECK-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0
2167 ; CHECK-NEXT: ret{{[l|q]}}
2169 %0 = bitcast <8 x i64> %__A to <16 x i32>
2170 %1 = bitcast <8 x i64> %__B to <16 x i32>
2171 %2 = bitcast <8 x i64> %__C to <16 x i32>
2172 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2173 %4 = bitcast <16 x i32> %3 to <8 x i64>
2177 declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
2179 define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2180 ; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
2181 ; X86: # %bb.0: # %entry
2182 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2183 ; X86-NEXT: kmovw %eax, %k1
2184 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2187 ; X64-LABEL: test_mm512_mask_ternarylogic_epi32:
2188 ; X64: # %bb.0: # %entry
2189 ; X64-NEXT: kmovw %edi, %k1
2190 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
2193 %0 = bitcast <8 x i64> %__A to <16 x i32>
2194 %1 = bitcast <8 x i64> %__B to <16 x i32>
2195 %2 = bitcast <8 x i64> %__C to <16 x i32>
2196 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2197 %4 = bitcast i16 %__U to <16 x i1>
2198 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2199 %6 = bitcast <16 x i32> %5 to <8 x i64>
2203 define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2204 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi32:
2205 ; X86: # %bb.0: # %entry
2206 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2207 ; X86-NEXT: kmovw %eax, %k1
2208 ; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2211 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi32:
2212 ; X64: # %bb.0: # %entry
2213 ; X64-NEXT: kmovw %edi, %k1
2214 ; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2217 %0 = bitcast <8 x i64> %__A to <16 x i32>
2218 %1 = bitcast <8 x i64> %__B to <16 x i32>
2219 %2 = bitcast <8 x i64> %__C to <16 x i32>
2220 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
2221 %4 = bitcast i16 %__U to <16 x i1>
2222 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2223 %6 = bitcast <16 x i32> %5 to <8 x i64>
2227 define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2228 ; CHECK-LABEL: test_mm512_ternarylogic_epi64:
2229 ; CHECK: # %bb.0: # %entry
2230 ; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0
2231 ; CHECK-NEXT: ret{{[l|q]}}
2233 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2237 declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1
2239 define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
2240 ; X86-LABEL: test_mm512_mask_ternarylogic_epi64:
2241 ; X86: # %bb.0: # %entry
2242 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2243 ; X86-NEXT: kmovw %eax, %k1
2244 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2247 ; X64-LABEL: test_mm512_mask_ternarylogic_epi64:
2248 ; X64: # %bb.0: # %entry
2249 ; X64-NEXT: kmovw %edi, %k1
2250 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1}
2253 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2254 %1 = bitcast i8 %__U to <8 x i1>
2255 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2259 define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
2260 ; X86-LABEL: test_mm512_maskz_ternarylogic_epi64:
2261 ; X86: # %bb.0: # %entry
2262 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2263 ; X86-NEXT: kmovw %eax, %k1
2264 ; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2267 ; X64-LABEL: test_mm512_maskz_ternarylogic_epi64:
2268 ; X64: # %bb.0: # %entry
2269 ; X64-NEXT: kmovw %edi, %k1
2270 ; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z}
2273 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4)
2274 %1 = bitcast i8 %__U to <8 x i1>
2275 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2279 declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
2281 define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
2282 ; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
2283 ; X86: # %bb.0: # %entry
2284 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2285 ; X86-NEXT: kmovw %eax, %k1
2286 ; X86-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2287 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2290 ; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
2291 ; X64: # %bb.0: # %entry
2292 ; X64-NEXT: kmovw %edi, %k1
2293 ; X64-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
2294 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2297 %0 = bitcast <8 x i64> %__A to <16 x i32>
2298 %1 = bitcast <8 x i64> %__I to <16 x i32>
2299 %2 = bitcast <8 x i64> %__B to <16 x i32>
2300 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2301 %4 = bitcast i16 %__U to <16 x i1>
2302 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
2303 %6 = bitcast <16 x i32> %5 to <8 x i64>
2307 declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
2309 define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
2310 ; X86-LABEL: test_mm512_mask2_permutex2var_pd:
2311 ; X86: # %bb.0: # %entry
2312 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2313 ; X86-NEXT: kmovw %eax, %k1
2314 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2315 ; X86-NEXT: vmovapd %zmm1, %zmm0
2318 ; X64-LABEL: test_mm512_mask2_permutex2var_pd:
2319 ; X64: # %bb.0: # %entry
2320 ; X64-NEXT: kmovw %edi, %k1
2321 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
2322 ; X64-NEXT: vmovapd %zmm1, %zmm0
2325 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2326 %1 = bitcast <8 x i64> %__I to <8 x double>
2327 %2 = bitcast i8 %__U to <8 x i1>
2328 %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
2332 declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
2334 define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) {
2335 ; X86-LABEL: test_mm512_mask2_permutex2var_ps:
2336 ; X86: # %bb.0: # %entry
2337 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2338 ; X86-NEXT: kmovw %eax, %k1
2339 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2340 ; X86-NEXT: vmovaps %zmm1, %zmm0
2343 ; X64-LABEL: test_mm512_mask2_permutex2var_ps:
2344 ; X64: # %bb.0: # %entry
2345 ; X64-NEXT: kmovw %edi, %k1
2346 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
2347 ; X64-NEXT: vmovaps %zmm1, %zmm0
2350 %0 = bitcast <8 x i64> %__I to <16 x i32>
2351 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2352 %2 = bitcast <8 x i64> %__I to <16 x float>
2353 %3 = bitcast i16 %__U to <16 x i1>
2354 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
2358 declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
2360 define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) {
2361 ; X86-LABEL: test_mm512_mask2_permutex2var_epi64:
2362 ; X86: # %bb.0: # %entry
2363 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2364 ; X86-NEXT: kmovw %eax, %k1
2365 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2366 ; X86-NEXT: vmovdqa64 %zmm1, %zmm0
2369 ; X64-LABEL: test_mm512_mask2_permutex2var_epi64:
2370 ; X64: # %bb.0: # %entry
2371 ; X64-NEXT: kmovw %edi, %k1
2372 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
2373 ; X64-NEXT: vmovdqa64 %zmm1, %zmm0
2376 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2377 %1 = bitcast i8 %__U to <8 x i1>
2378 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__I
2382 define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2383 ; CHECK-LABEL: test_mm512_permutex2var_epi32:
2384 ; CHECK: # %bb.0: # %entry
2385 ; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
2386 ; CHECK-NEXT: ret{{[l|q]}}
2388 %0 = bitcast <8 x i64> %__A to <16 x i32>
2389 %1 = bitcast <8 x i64> %__I to <16 x i32>
2390 %2 = bitcast <8 x i64> %__B to <16 x i32>
2391 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2392 %4 = bitcast <16 x i32> %3 to <8 x i64>
2396 define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2397 ; X86-LABEL: test_mm512_maskz_permutex2var_epi32:
2398 ; X86: # %bb.0: # %entry
2399 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2400 ; X86-NEXT: kmovw %eax, %k1
2401 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2404 ; X64-LABEL: test_mm512_maskz_permutex2var_epi32:
2405 ; X64: # %bb.0: # %entry
2406 ; X64-NEXT: kmovw %edi, %k1
2407 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z}
2410 %0 = bitcast <8 x i64> %__A to <16 x i32>
2411 %1 = bitcast <8 x i64> %__I to <16 x i32>
2412 %2 = bitcast <8 x i64> %__B to <16 x i32>
2413 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2414 %4 = bitcast i16 %__U to <16 x i1>
2415 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer
2416 %6 = bitcast <16 x i32> %5 to <8 x i64>
2420 define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2421 ; X86-LABEL: test_mm512_mask_permutex2var_epi32:
2422 ; X86: # %bb.0: # %entry
2423 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2424 ; X86-NEXT: kmovw %eax, %k1
2425 ; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2428 ; X64-LABEL: test_mm512_mask_permutex2var_epi32:
2429 ; X64: # %bb.0: # %entry
2430 ; X64-NEXT: kmovw %edi, %k1
2431 ; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1}
2434 %0 = bitcast <8 x i64> %__A to <16 x i32>
2435 %1 = bitcast <8 x i64> %__I to <16 x i32>
2436 %2 = bitcast <8 x i64> %__B to <16 x i32>
2437 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
2438 %4 = bitcast i16 %__U to <16 x i1>
2439 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0
2440 %6 = bitcast <16 x i32> %5 to <8 x i64>
2444 define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2445 ; CHECK-LABEL: test_mm512_permutex2var_pd:
2446 ; CHECK: # %bb.0: # %entry
2447 ; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
2448 ; CHECK-NEXT: ret{{[l|q]}}
2450 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2454 define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) {
2455 ; X86-LABEL: test_mm512_mask_permutex2var_pd:
2456 ; X86: # %bb.0: # %entry
2457 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2458 ; X86-NEXT: kmovw %eax, %k1
2459 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2462 ; X64-LABEL: test_mm512_mask_permutex2var_pd:
2463 ; X64: # %bb.0: # %entry
2464 ; X64-NEXT: kmovw %edi, %k1
2465 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1}
2468 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2469 %1 = bitcast i8 %__U to <8 x i1>
2470 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
2474 define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) {
2475 ; X86-LABEL: test_mm512_maskz_permutex2var_pd:
2476 ; X86: # %bb.0: # %entry
2477 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2478 ; X86-NEXT: kmovw %eax, %k1
2479 ; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2482 ; X64-LABEL: test_mm512_maskz_permutex2var_pd:
2483 ; X64: # %bb.0: # %entry
2484 ; X64-NEXT: kmovw %edi, %k1
2485 ; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z}
2488 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
2489 %1 = bitcast i8 %__U to <8 x i1>
2490 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
2494 define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2495 ; CHECK-LABEL: test_mm512_permutex2var_ps:
2496 ; CHECK: # %bb.0: # %entry
2497 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
2498 ; CHECK-NEXT: ret{{[l|q]}}
2500 %0 = bitcast <8 x i64> %__I to <16 x i32>
2501 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2505 define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) {
2506 ; X86-LABEL: test_mm512_mask_permutex2var_ps:
2507 ; X86: # %bb.0: # %entry
2508 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2509 ; X86-NEXT: kmovw %eax, %k1
2510 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2513 ; X64-LABEL: test_mm512_mask_permutex2var_ps:
2514 ; X64: # %bb.0: # %entry
2515 ; X64-NEXT: kmovw %edi, %k1
2516 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1}
2519 %0 = bitcast <8 x i64> %__I to <16 x i32>
2520 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2521 %2 = bitcast i16 %__U to <16 x i1>
2522 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A
2526 define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) {
2527 ; X86-LABEL: test_mm512_maskz_permutex2var_ps:
2528 ; X86: # %bb.0: # %entry
2529 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
2530 ; X86-NEXT: kmovw %eax, %k1
2531 ; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2534 ; X64-LABEL: test_mm512_maskz_permutex2var_ps:
2535 ; X64: # %bb.0: # %entry
2536 ; X64-NEXT: kmovw %edi, %k1
2537 ; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
2540 %0 = bitcast <8 x i64> %__I to <16 x i32>
2541 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B)
2542 %2 = bitcast i16 %__U to <16 x i1>
2543 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
2547 define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2548 ; CHECK-LABEL: test_mm512_permutex2var_epi64:
2549 ; CHECK: # %bb.0: # %entry
2550 ; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
2551 ; CHECK-NEXT: ret{{[l|q]}}
2553 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2557 define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) {
2558 ; X86-LABEL: test_mm512_mask_permutex2var_epi64:
2559 ; X86: # %bb.0: # %entry
2560 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2561 ; X86-NEXT: kmovw %eax, %k1
2562 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2565 ; X64-LABEL: test_mm512_mask_permutex2var_epi64:
2566 ; X64: # %bb.0: # %entry
2567 ; X64-NEXT: kmovw %edi, %k1
2568 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1}
2571 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2572 %1 = bitcast i8 %__U to <8 x i1>
2573 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A
2577 define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
2578 ; X86-LABEL: test_mm512_maskz_permutex2var_epi64:
2579 ; X86: # %bb.0: # %entry
2580 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2581 ; X86-NEXT: kmovw %eax, %k1
2582 ; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2585 ; X64-LABEL: test_mm512_maskz_permutex2var_epi64:
2586 ; X64: # %bb.0: # %entry
2587 ; X64-NEXT: kmovw %edi, %k1
2588 ; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z}
2591 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B)
2592 %1 = bitcast i8 %__U to <8 x i1>
2593 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
2596 define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2597 ; X86-LABEL: test_mm_mask_add_ss:
2598 ; X86: # %bb.0: # %entry
2599 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2600 ; X86-NEXT: kmovw %eax, %k1
2601 ; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2604 ; X64-LABEL: test_mm_mask_add_ss:
2605 ; X64: # %bb.0: # %entry
2606 ; X64-NEXT: kmovw %edi, %k1
2607 ; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
2610 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2611 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2612 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2614 %tobool.i = icmp eq i8 %0, 0
2615 %vecext1.i = extractelement <4 x float> %__W, i32 0
2616 %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i
2617 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2618 ret <4 x float> %vecins.i
2621 define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2622 ; X86-LABEL: test_mm_maskz_add_ss:
2623 ; X86: # %bb.0: # %entry
2624 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2625 ; X86-NEXT: kmovw %eax, %k1
2626 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2629 ; X64-LABEL: test_mm_maskz_add_ss:
2630 ; X64: # %bb.0: # %entry
2631 ; X64-NEXT: kmovw %edi, %k1
2632 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
2635 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2636 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2637 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i
2639 %tobool.i = icmp eq i8 %0, 0
2640 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i
2641 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2642 ret <4 x float> %vecins.i
2645 define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2646 ; X86-LABEL: test_mm_mask_add_sd:
2647 ; X86: # %bb.0: # %entry
2648 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2649 ; X86-NEXT: kmovw %eax, %k1
2650 ; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2653 ; X64-LABEL: test_mm_mask_add_sd:
2654 ; X64: # %bb.0: # %entry
2655 ; X64-NEXT: kmovw %edi, %k1
2656 ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1}
2659 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2660 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2661 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2663 %tobool.i = icmp eq i8 %0, 0
2664 %vecext1.i = extractelement <2 x double> %__W, i32 0
2665 %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i
2666 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2667 ret <2 x double> %vecins.i
2670 define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2671 ; X86-LABEL: test_mm_maskz_add_sd:
2672 ; X86: # %bb.0: # %entry
2673 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2674 ; X86-NEXT: kmovw %eax, %k1
2675 ; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2678 ; X64-LABEL: test_mm_maskz_add_sd:
2679 ; X64: # %bb.0: # %entry
2680 ; X64-NEXT: kmovw %edi, %k1
2681 ; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2684 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2685 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2686 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i
2688 %tobool.i = icmp eq i8 %0, 0
2689 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i
2690 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2691 ret <2 x double> %vecins.i
2694 define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2695 ; X86-LABEL: test_mm_mask_sub_ss:
2696 ; X86: # %bb.0: # %entry
2697 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2698 ; X86-NEXT: kmovw %eax, %k1
2699 ; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2702 ; X64-LABEL: test_mm_mask_sub_ss:
2703 ; X64: # %bb.0: # %entry
2704 ; X64-NEXT: kmovw %edi, %k1
2705 ; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
2708 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2709 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2710 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2712 %tobool.i = icmp eq i8 %0, 0
2713 %vecext1.i = extractelement <4 x float> %__W, i32 0
2714 %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i
2715 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2716 ret <4 x float> %vecins.i
2719 define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2720 ; X86-LABEL: test_mm_maskz_sub_ss:
2721 ; X86: # %bb.0: # %entry
2722 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2723 ; X86-NEXT: kmovw %eax, %k1
2724 ; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2727 ; X64-LABEL: test_mm_maskz_sub_ss:
2728 ; X64: # %bb.0: # %entry
2729 ; X64-NEXT: kmovw %edi, %k1
2730 ; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
2733 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2734 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2735 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i
2737 %tobool.i = icmp eq i8 %0, 0
2738 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i
2739 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2740 ret <4 x float> %vecins.i
2743 define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2744 ; X86-LABEL: test_mm_mask_sub_sd:
2745 ; X86: # %bb.0: # %entry
2746 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2747 ; X86-NEXT: kmovw %eax, %k1
2748 ; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2751 ; X64-LABEL: test_mm_mask_sub_sd:
2752 ; X64: # %bb.0: # %entry
2753 ; X64-NEXT: kmovw %edi, %k1
2754 ; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1}
2757 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2758 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2759 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2761 %tobool.i = icmp eq i8 %0, 0
2762 %vecext1.i = extractelement <2 x double> %__W, i32 0
2763 %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i
2764 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2765 ret <2 x double> %vecins.i
2768 define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2769 ; X86-LABEL: test_mm_maskz_sub_sd:
2770 ; X86: # %bb.0: # %entry
2771 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2772 ; X86-NEXT: kmovw %eax, %k1
2773 ; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2776 ; X64-LABEL: test_mm_maskz_sub_sd:
2777 ; X64: # %bb.0: # %entry
2778 ; X64-NEXT: kmovw %edi, %k1
2779 ; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2782 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2783 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2784 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i
2786 %tobool.i = icmp eq i8 %0, 0
2787 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i
2788 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2789 ret <2 x double> %vecins.i
2792 define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2793 ; X86-LABEL: test_mm_mask_mul_ss:
2794 ; X86: # %bb.0: # %entry
2795 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2796 ; X86-NEXT: kmovw %eax, %k1
2797 ; X86-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2800 ; X64-LABEL: test_mm_mask_mul_ss:
2801 ; X64: # %bb.0: # %entry
2802 ; X64-NEXT: kmovw %edi, %k1
2803 ; X64-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
2806 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2807 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2808 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2810 %tobool.i = icmp eq i8 %0, 0
2811 %vecext1.i = extractelement <4 x float> %__W, i32 0
2812 %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i
2813 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2814 ret <4 x float> %vecins.i
2817 define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2818 ; X86-LABEL: test_mm_maskz_mul_ss:
2819 ; X86: # %bb.0: # %entry
2820 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2821 ; X86-NEXT: kmovw %eax, %k1
2822 ; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2825 ; X64-LABEL: test_mm_maskz_mul_ss:
2826 ; X64: # %bb.0: # %entry
2827 ; X64-NEXT: kmovw %edi, %k1
2828 ; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
2831 %vecext.i.i = extractelement <4 x float> %__B, i32 0
2832 %vecext1.i.i = extractelement <4 x float> %__A, i32 0
2833 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i
2835 %tobool.i = icmp eq i8 %0, 0
2836 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i
2837 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
2838 ret <4 x float> %vecins.i
2841 define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2842 ; X86-LABEL: test_mm_mask_mul_sd:
2843 ; X86: # %bb.0: # %entry
2844 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2845 ; X86-NEXT: kmovw %eax, %k1
2846 ; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2849 ; X64-LABEL: test_mm_mask_mul_sd:
2850 ; X64: # %bb.0: # %entry
2851 ; X64-NEXT: kmovw %edi, %k1
2852 ; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1}
2855 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2856 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2857 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2859 %tobool.i = icmp eq i8 %0, 0
2860 %vecext1.i = extractelement <2 x double> %__W, i32 0
2861 %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i
2862 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2863 ret <2 x double> %vecins.i
2866 define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2867 ; X86-LABEL: test_mm_maskz_mul_sd:
2868 ; X86: # %bb.0: # %entry
2869 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2870 ; X86-NEXT: kmovw %eax, %k1
2871 ; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2874 ; X64-LABEL: test_mm_maskz_mul_sd:
2875 ; X64: # %bb.0: # %entry
2876 ; X64-NEXT: kmovw %edi, %k1
2877 ; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2880 %vecext.i.i = extractelement <2 x double> %__B, i32 0
2881 %vecext1.i.i = extractelement <2 x double> %__A, i32 0
2882 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i
2884 %tobool.i = icmp eq i8 %0, 0
2885 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i
2886 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
2887 ret <2 x double> %vecins.i
2890 define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2891 ; X86-LABEL: test_mm_mask_div_ss:
2892 ; X86: # %bb.0: # %entry
2893 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2894 ; X86-NEXT: kmovw %eax, %k1
2895 ; X86-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1}
2898 ; X64-LABEL: test_mm_mask_div_ss:
2899 ; X64: # %bb.0: # %entry
2900 ; X64-NEXT: kmovw %edi, %k1
2901 ; X64-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1}
2904 %0 = extractelement <4 x float> %__A, i64 0
2905 %1 = extractelement <4 x float> %__B, i64 0
2906 %2 = extractelement <4 x float> %__W, i64 0
2907 %3 = fdiv float %0, %1
2908 %4 = bitcast i8 %__U to <8 x i1>
2909 %5 = extractelement <8 x i1> %4, i64 0
2910 %6 = select i1 %5, float %3, float %2
2911 %7 = insertelement <4 x float> %__A, float %6, i64 0
2915 define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
2916 ; X86-LABEL: test_mm_maskz_div_ss:
2917 ; X86: # %bb.0: # %entry
2918 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2919 ; X86-NEXT: kmovw %eax, %k1
2920 ; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
2923 ; X64-LABEL: test_mm_maskz_div_ss:
2924 ; X64: # %bb.0: # %entry
2925 ; X64-NEXT: kmovw %edi, %k1
2926 ; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z}
2929 %0 = extractelement <4 x float> %__A, i64 0
2930 %1 = extractelement <4 x float> %__B, i64 0
2931 %2 = fdiv float %0, %1
2932 %3 = bitcast i8 %__U to <8 x i1>
2933 %4 = extractelement <8 x i1> %3, i64 0
2934 %5 = select i1 %4, float %2, float 0.000000e+00
2935 %6 = insertelement <4 x float> %__A, float %5, i64 0
2939 define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2940 ; X86-LABEL: test_mm_mask_div_sd:
2941 ; X86: # %bb.0: # %entry
2942 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2943 ; X86-NEXT: kmovw %eax, %k1
2944 ; X86-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1}
2947 ; X64-LABEL: test_mm_mask_div_sd:
2948 ; X64: # %bb.0: # %entry
2949 ; X64-NEXT: kmovw %edi, %k1
2950 ; X64-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1}
2953 %0 = extractelement <2 x double> %__A, i64 0
2954 %1 = extractelement <2 x double> %__B, i64 0
2955 %2 = extractelement <2 x double> %__W, i64 0
2956 %3 = fdiv double %0, %1
2957 %4 = bitcast i8 %__U to <8 x i1>
2958 %5 = extractelement <8 x i1> %4, i64 0
2959 %6 = select i1 %5, double %3, double %2
2960 %7 = insertelement <2 x double> %__A, double %6, i64 0
2964 define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
2965 ; X86-LABEL: test_mm_maskz_div_sd:
2966 ; X86: # %bb.0: # %entry
2967 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
2968 ; X86-NEXT: kmovw %eax, %k1
2969 ; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2972 ; X64-LABEL: test_mm_maskz_div_sd:
2973 ; X64: # %bb.0: # %entry
2974 ; X64-NEXT: kmovw %edi, %k1
2975 ; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z}
2978 %0 = extractelement <2 x double> %__A, i64 0
2979 %1 = extractelement <2 x double> %__B, i64 0
2980 %2 = fdiv double %0, %1
2981 %3 = bitcast i8 %__U to <8 x i1>
2982 %4 = extractelement <8 x i1> %3, i64 0
2983 %5 = select i1 %4, double %2, double 0.000000e+00
2984 %6 = insertelement <2 x double> %__A, double %5, i64 0
2989 define <8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
2990 ; CHECK-LABEL: test_mm512_fmadd_round_pd:
2991 ; CHECK: # %bb.0: # %entry
2992 ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
2993 ; CHECK-NEXT: ret{{[l|q]}}
2995 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
2999 declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
3001 define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3002 ; X86-LABEL: test_mm512_mask_fmadd_round_pd:
3003 ; X86: # %bb.0: # %entry
3004 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3005 ; X86-NEXT: kmovw %eax, %k1
3006 ; X86-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3009 ; X64-LABEL: test_mm512_mask_fmadd_round_pd:
3010 ; X64: # %bb.0: # %entry
3011 ; X64-NEXT: kmovw %edi, %k1
3012 ; X64-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3015 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3016 %1 = bitcast i8 %__U to <8 x i1>
3017 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3021 define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3022 ; X86-LABEL: test_mm512_mask3_fmadd_round_pd:
3023 ; X86: # %bb.0: # %entry
3024 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3025 ; X86-NEXT: kmovw %eax, %k1
3026 ; X86-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3027 ; X86-NEXT: vmovapd %zmm2, %zmm0
3030 ; X64-LABEL: test_mm512_mask3_fmadd_round_pd:
3031 ; X64: # %bb.0: # %entry
3032 ; X64-NEXT: kmovw %edi, %k1
3033 ; X64-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3034 ; X64-NEXT: vmovapd %zmm2, %zmm0
3037 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3038 %1 = bitcast i8 %__U to <8 x i1>
3039 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3043 define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3044 ; X86-LABEL: test_mm512_maskz_fmadd_round_pd:
3045 ; X86: # %bb.0: # %entry
3046 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3047 ; X86-NEXT: kmovw %eax, %k1
3048 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3051 ; X64-LABEL: test_mm512_maskz_fmadd_round_pd:
3052 ; X64: # %bb.0: # %entry
3053 ; X64-NEXT: kmovw %edi, %k1
3054 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3057 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3058 %1 = bitcast i8 %__U to <8 x i1>
3059 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3063 define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3064 ; X86-LABEL: test_mm512_fmsub_round_pd:
3065 ; X86: # %bb.0: # %entry
3066 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
3067 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3070 ; X64-LABEL: test_mm512_fmsub_round_pd:
3071 ; X64: # %bb.0: # %entry
3072 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3073 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3076 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3077 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3081 define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3082 ; X86-LABEL: test_mm512_mask_fmsub_round_pd:
3083 ; X86: # %bb.0: # %entry
3084 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3085 ; X86-NEXT: kmovw %eax, %k1
3086 ; X86-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3089 ; X64-LABEL: test_mm512_mask_fmsub_round_pd:
3090 ; X64: # %bb.0: # %entry
3091 ; X64-NEXT: kmovw %edi, %k1
3092 ; X64-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3095 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3096 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3097 %1 = bitcast i8 %__U to <8 x i1>
3098 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3102 define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3103 ; X86-LABEL: test_mm512_maskz_fmsub_round_pd:
3104 ; X86: # %bb.0: # %entry
3105 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3106 ; X86-NEXT: kmovw %eax, %k1
3107 ; X86-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3110 ; X64-LABEL: test_mm512_maskz_fmsub_round_pd:
3111 ; X64: # %bb.0: # %entry
3112 ; X64-NEXT: kmovw %edi, %k1
3113 ; X64-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3116 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3117 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
3118 %1 = bitcast i8 %__U to <8 x i1>
3119 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3123 define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3124 ; X86-LABEL: test_mm512_fnmadd_round_pd:
3125 ; X86: # %bb.0: # %entry
3126 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
3127 ; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3130 ; X64-LABEL: test_mm512_fnmadd_round_pd:
3131 ; X64: # %bb.0: # %entry
3132 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
3133 ; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3136 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3137 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3141 define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3142 ; X86-LABEL: test_mm512_mask3_fnmadd_round_pd:
3143 ; X86: # %bb.0: # %entry
3144 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3145 ; X86-NEXT: kmovw %eax, %k1
3146 ; X86-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3147 ; X86-NEXT: vmovapd %zmm2, %zmm0
3150 ; X64-LABEL: test_mm512_mask3_fnmadd_round_pd:
3151 ; X64: # %bb.0: # %entry
3152 ; X64-NEXT: kmovw %edi, %k1
3153 ; X64-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3154 ; X64-NEXT: vmovapd %zmm2, %zmm0
3157 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3158 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3159 %1 = bitcast i8 %__U to <8 x i1>
3160 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3164 define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3165 ; X86-LABEL: test_mm512_maskz_fnmadd_round_pd:
3166 ; X86: # %bb.0: # %entry
3167 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3168 ; X86-NEXT: kmovw %eax, %k1
3169 ; X86-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3172 ; X64-LABEL: test_mm512_maskz_fnmadd_round_pd:
3173 ; X64: # %bb.0: # %entry
3174 ; X64-NEXT: kmovw %edi, %k1
3175 ; X64-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3178 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3179 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
3180 %1 = bitcast i8 %__U to <8 x i1>
3181 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3185 define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3186 ; CHECK-LABEL: test_mm512_fnmsub_round_pd:
3187 ; CHECK: # %bb.0: # %entry
3188 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3189 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3190 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3191 ; CHECK-NEXT: vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
3192 ; CHECK-NEXT: ret{{[l|q]}}
3194 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3195 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3196 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
3200 define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3201 ; X86-LABEL: test_mm512_maskz_fnmsub_round_pd:
3202 ; X86: # %bb.0: # %entry
3203 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3204 ; X86-NEXT: kmovw %eax, %k1
3205 ; X86-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3208 ; X64-LABEL: test_mm512_maskz_fnmsub_round_pd:
3209 ; X64: # %bb.0: # %entry
3210 ; X64-NEXT: kmovw %edi, %k1
3211 ; X64-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3214 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3215 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3216 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8)
3217 %1 = bitcast i8 %__U to <8 x i1>
3218 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3222 define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3223 ; CHECK-LABEL: test_mm512_fmadd_pd:
3224 ; CHECK: # %bb.0: # %entry
3225 ; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3226 ; CHECK-NEXT: ret{{[l|q]}}
3228 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3232 define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3233 ; X86-LABEL: test_mm512_mask_fmadd_pd:
3234 ; X86: # %bb.0: # %entry
3235 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3236 ; X86-NEXT: kmovw %eax, %k1
3237 ; X86-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3240 ; X64-LABEL: test_mm512_mask_fmadd_pd:
3241 ; X64: # %bb.0: # %entry
3242 ; X64-NEXT: kmovw %edi, %k1
3243 ; X64-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3246 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3247 %1 = bitcast i8 %__U to <8 x i1>
3248 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3252 define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3253 ; X86-LABEL: test_mm512_mask3_fmadd_pd:
3254 ; X86: # %bb.0: # %entry
3255 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3256 ; X86-NEXT: kmovw %eax, %k1
3257 ; X86-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3258 ; X86-NEXT: vmovapd %zmm2, %zmm0
3261 ; X64-LABEL: test_mm512_mask3_fmadd_pd:
3262 ; X64: # %bb.0: # %entry
3263 ; X64-NEXT: kmovw %edi, %k1
3264 ; X64-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3265 ; X64-NEXT: vmovapd %zmm2, %zmm0
3268 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3269 %1 = bitcast i8 %__U to <8 x i1>
3270 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3274 define <8 x double> @test_mm512_maskz_fmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3275 ; X86-LABEL: test_mm512_maskz_fmadd_pd:
3276 ; X86: # %bb.0: # %entry
3277 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3278 ; X86-NEXT: kmovw %eax, %k1
3279 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3282 ; X64-LABEL: test_mm512_maskz_fmadd_pd:
3283 ; X64: # %bb.0: # %entry
3284 ; X64-NEXT: kmovw %edi, %k1
3285 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3288 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
3289 %1 = bitcast i8 %__U to <8 x i1>
3290 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3294 define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3295 ; X86-LABEL: test_mm512_fmsub_pd:
3296 ; X86: # %bb.0: # %entry
3297 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
3298 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3301 ; X64-LABEL: test_mm512_fmsub_pd:
3302 ; X64: # %bb.0: # %entry
3303 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
3304 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3307 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3308 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3312 define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3313 ; X86-LABEL: test_mm512_mask_fmsub_pd:
3314 ; X86: # %bb.0: # %entry
3315 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3316 ; X86-NEXT: kmovw %eax, %k1
3317 ; X86-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3320 ; X64-LABEL: test_mm512_mask_fmsub_pd:
3321 ; X64: # %bb.0: # %entry
3322 ; X64-NEXT: kmovw %edi, %k1
3323 ; X64-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3326 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3327 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3328 %1 = bitcast i8 %__U to <8 x i1>
3329 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3333 define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3334 ; X86-LABEL: test_mm512_maskz_fmsub_pd:
3335 ; X86: # %bb.0: # %entry
3336 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3337 ; X86-NEXT: kmovw %eax, %k1
3338 ; X86-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3341 ; X64-LABEL: test_mm512_maskz_fmsub_pd:
3342 ; X64: # %bb.0: # %entry
3343 ; X64-NEXT: kmovw %edi, %k1
3344 ; X64-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3347 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3348 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
3349 %1 = bitcast i8 %__U to <8 x i1>
3350 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3354 define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3355 ; X86-LABEL: test_mm512_fnmadd_pd:
3356 ; X86: # %bb.0: # %entry
3357 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
3358 ; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3361 ; X64-LABEL: test_mm512_fnmadd_pd:
3362 ; X64: # %bb.0: # %entry
3363 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
3364 ; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3367 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3368 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3372 define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3373 ; X86-LABEL: test_mm512_mask3_fnmadd_pd:
3374 ; X86: # %bb.0: # %entry
3375 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3376 ; X86-NEXT: kmovw %eax, %k1
3377 ; X86-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3378 ; X86-NEXT: vmovapd %zmm2, %zmm0
3381 ; X64-LABEL: test_mm512_mask3_fnmadd_pd:
3382 ; X64: # %bb.0: # %entry
3383 ; X64-NEXT: kmovw %edi, %k1
3384 ; X64-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3385 ; X64-NEXT: vmovapd %zmm2, %zmm0
3388 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3389 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3390 %1 = bitcast i8 %__U to <8 x i1>
3391 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3395 define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3396 ; X86-LABEL: test_mm512_maskz_fnmadd_pd:
3397 ; X86: # %bb.0: # %entry
3398 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3399 ; X86-NEXT: kmovw %eax, %k1
3400 ; X86-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3403 ; X64-LABEL: test_mm512_maskz_fnmadd_pd:
3404 ; X64: # %bb.0: # %entry
3405 ; X64-NEXT: kmovw %edi, %k1
3406 ; X64-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3409 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3410 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
3411 %1 = bitcast i8 %__U to <8 x i1>
3412 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3416 define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3417 ; CHECK-LABEL: test_mm512_fnmsub_pd:
3418 ; CHECK: # %bb.0: # %entry
3419 ; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3420 ; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4
3421 ; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0
3422 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
3423 ; CHECK-NEXT: ret{{[l|q]}}
3425 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3426 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3427 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
3431 define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3432 ; X86-LABEL: test_mm512_maskz_fnmsub_pd:
3433 ; X86: # %bb.0: # %entry
3434 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3435 ; X86-NEXT: kmovw %eax, %k1
3436 ; X86-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3439 ; X64-LABEL: test_mm512_maskz_fnmsub_pd:
3440 ; X64: # %bb.0: # %entry
3441 ; X64-NEXT: kmovw %edi, %k1
3442 ; X64-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3445 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
3446 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
3447 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10
3448 %1 = bitcast i8 %__U to <8 x i1>
3449 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3453 define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3454 ; CHECK-LABEL: test_mm512_fmadd_round_ps:
3455 ; CHECK: # %bb.0: # %entry
3456 ; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3457 ; CHECK-NEXT: ret{{[l|q]}}
3459 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3463 declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
3465 define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3466 ; X86-LABEL: test_mm512_mask_fmadd_round_ps:
3467 ; X86: # %bb.0: # %entry
3468 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3469 ; X86-NEXT: kmovw %eax, %k1
3470 ; X86-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3473 ; X64-LABEL: test_mm512_mask_fmadd_round_ps:
3474 ; X64: # %bb.0: # %entry
3475 ; X64-NEXT: kmovw %edi, %k1
3476 ; X64-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3479 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3480 %1 = bitcast i16 %__U to <16 x i1>
3481 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3485 define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3486 ; X86-LABEL: test_mm512_mask3_fmadd_round_ps:
3487 ; X86: # %bb.0: # %entry
3488 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3489 ; X86-NEXT: kmovw %eax, %k1
3490 ; X86-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3491 ; X86-NEXT: vmovaps %zmm2, %zmm0
3494 ; X64-LABEL: test_mm512_mask3_fmadd_round_ps:
3495 ; X64: # %bb.0: # %entry
3496 ; X64-NEXT: kmovw %edi, %k1
3497 ; X64-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3498 ; X64-NEXT: vmovaps %zmm2, %zmm0
3501 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3502 %1 = bitcast i16 %__U to <16 x i1>
3503 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3507 define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3508 ; X86-LABEL: test_mm512_maskz_fmadd_round_ps:
3509 ; X86: # %bb.0: # %entry
3510 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3511 ; X86-NEXT: kmovw %eax, %k1
3512 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3515 ; X64-LABEL: test_mm512_maskz_fmadd_round_ps:
3516 ; X64: # %bb.0: # %entry
3517 ; X64-NEXT: kmovw %edi, %k1
3518 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3521 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
3522 %1 = bitcast i16 %__U to <16 x i1>
3523 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3527 define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3528 ; X86-LABEL: test_mm512_fmsub_round_ps:
3529 ; X86: # %bb.0: # %entry
3530 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
3531 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3534 ; X64-LABEL: test_mm512_fmsub_round_ps:
3535 ; X64: # %bb.0: # %entry
3536 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
3537 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3540 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3541 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3545 define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3546 ; X86-LABEL: test_mm512_mask_fmsub_round_ps:
3547 ; X86: # %bb.0: # %entry
3548 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3549 ; X86-NEXT: kmovw %eax, %k1
3550 ; X86-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3553 ; X64-LABEL: test_mm512_mask_fmsub_round_ps:
3554 ; X64: # %bb.0: # %entry
3555 ; X64-NEXT: kmovw %edi, %k1
3556 ; X64-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3559 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3560 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3561 %1 = bitcast i16 %__U to <16 x i1>
3562 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3566 define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3567 ; X86-LABEL: test_mm512_maskz_fmsub_round_ps:
3568 ; X86: # %bb.0: # %entry
3569 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3570 ; X86-NEXT: kmovw %eax, %k1
3571 ; X86-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3574 ; X64-LABEL: test_mm512_maskz_fmsub_round_ps:
3575 ; X64: # %bb.0: # %entry
3576 ; X64-NEXT: kmovw %edi, %k1
3577 ; X64-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3580 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3581 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
3582 %1 = bitcast i16 %__U to <16 x i1>
3583 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3587 define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3588 ; X86-LABEL: test_mm512_fnmadd_round_ps:
3589 ; X86: # %bb.0: # %entry
3590 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
3591 ; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3594 ; X64-LABEL: test_mm512_fnmadd_round_ps:
3595 ; X64: # %bb.0: # %entry
3596 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
3597 ; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
3600 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3601 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3605 define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3606 ; X86-LABEL: test_mm512_mask3_fnmadd_round_ps:
3607 ; X86: # %bb.0: # %entry
3608 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3609 ; X86-NEXT: kmovw %eax, %k1
3610 ; X86-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3611 ; X86-NEXT: vmovaps %zmm2, %zmm0
3614 ; X64-LABEL: test_mm512_mask3_fnmadd_round_ps:
3615 ; X64: # %bb.0: # %entry
3616 ; X64-NEXT: kmovw %edi, %k1
3617 ; X64-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3618 ; X64-NEXT: vmovaps %zmm2, %zmm0
3621 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3622 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3623 %1 = bitcast i16 %__U to <16 x i1>
3624 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3628 define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3629 ; X86-LABEL: test_mm512_maskz_fnmadd_round_ps:
3630 ; X86: # %bb.0: # %entry
3631 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3632 ; X86-NEXT: kmovw %eax, %k1
3633 ; X86-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3636 ; X64-LABEL: test_mm512_maskz_fnmadd_round_ps:
3637 ; X64: # %bb.0: # %entry
3638 ; X64-NEXT: kmovw %edi, %k1
3639 ; X64-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3642 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3643 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
3644 %1 = bitcast i16 %__U to <16 x i1>
3645 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3649 define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3650 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
3651 ; CHECK: # %bb.0: # %entry
3652 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3653 ; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4
3654 ; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0
3655 ; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
3656 ; CHECK-NEXT: ret{{[l|q]}}
3658 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3659 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3660 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
3664 define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3665 ; X86-LABEL: test_mm512_maskz_fnmsub_round_ps:
3666 ; X86: # %bb.0: # %entry
3667 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3668 ; X86-NEXT: kmovw %eax, %k1
3669 ; X86-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3672 ; X64-LABEL: test_mm512_maskz_fnmsub_round_ps:
3673 ; X64: # %bb.0: # %entry
3674 ; X64-NEXT: kmovw %edi, %k1
3675 ; X64-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3678 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3679 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3680 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8)
3681 %1 = bitcast i16 %__U to <16 x i1>
3682 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3686 define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3687 ; CHECK-LABEL: test_mm512_fmadd_ps:
3688 ; CHECK: # %bb.0: # %entry
3689 ; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3690 ; CHECK-NEXT: ret{{[l|q]}}
3692 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3696 define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3697 ; X86-LABEL: test_mm512_mask_fmadd_ps:
3698 ; X86: # %bb.0: # %entry
3699 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3700 ; X86-NEXT: kmovw %eax, %k1
3701 ; X86-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3704 ; X64-LABEL: test_mm512_mask_fmadd_ps:
3705 ; X64: # %bb.0: # %entry
3706 ; X64-NEXT: kmovw %edi, %k1
3707 ; X64-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
3710 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3711 %1 = bitcast i16 %__U to <16 x i1>
3712 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3716 define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3717 ; X86-LABEL: test_mm512_mask3_fmadd_ps:
3718 ; X86: # %bb.0: # %entry
3719 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3720 ; X86-NEXT: kmovw %eax, %k1
3721 ; X86-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3722 ; X86-NEXT: vmovaps %zmm2, %zmm0
3725 ; X64-LABEL: test_mm512_mask3_fmadd_ps:
3726 ; X64: # %bb.0: # %entry
3727 ; X64-NEXT: kmovw %edi, %k1
3728 ; X64-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2
3729 ; X64-NEXT: vmovaps %zmm2, %zmm0
3732 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3733 %1 = bitcast i16 %__U to <16 x i1>
3734 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3738 define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3739 ; X86-LABEL: test_mm512_maskz_fmadd_ps:
3740 ; X86: # %bb.0: # %entry
3741 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3742 ; X86-NEXT: kmovw %eax, %k1
3743 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3746 ; X64-LABEL: test_mm512_maskz_fmadd_ps:
3747 ; X64: # %bb.0: # %entry
3748 ; X64-NEXT: kmovw %edi, %k1
3749 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3752 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
3753 %1 = bitcast i16 %__U to <16 x i1>
3754 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3758 define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3759 ; X86-LABEL: test_mm512_fmsub_ps:
3760 ; X86: # %bb.0: # %entry
3761 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
3762 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3765 ; X64-LABEL: test_mm512_fmsub_ps:
3766 ; X64: # %bb.0: # %entry
3767 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
3768 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3771 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3772 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3776 define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
3777 ; X86-LABEL: test_mm512_mask_fmsub_ps:
3778 ; X86: # %bb.0: # %entry
3779 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3780 ; X86-NEXT: kmovw %eax, %k1
3781 ; X86-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3784 ; X64-LABEL: test_mm512_mask_fmsub_ps:
3785 ; X64: # %bb.0: # %entry
3786 ; X64-NEXT: kmovw %edi, %k1
3787 ; X64-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2
3790 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3791 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3792 %1 = bitcast i16 %__U to <16 x i1>
3793 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
3797 define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3798 ; X86-LABEL: test_mm512_maskz_fmsub_ps:
3799 ; X86: # %bb.0: # %entry
3800 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3801 ; X86-NEXT: kmovw %eax, %k1
3802 ; X86-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3805 ; X64-LABEL: test_mm512_maskz_fmsub_ps:
3806 ; X64: # %bb.0: # %entry
3807 ; X64-NEXT: kmovw %edi, %k1
3808 ; X64-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
3811 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3812 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
3813 %1 = bitcast i16 %__U to <16 x i1>
3814 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3818 define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3819 ; X86-LABEL: test_mm512_fnmadd_ps:
3820 ; X86: # %bb.0: # %entry
3821 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
3822 ; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3825 ; X64-LABEL: test_mm512_fnmadd_ps:
3826 ; X64: # %bb.0: # %entry
3827 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
3828 ; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
3831 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3832 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3836 define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
3837 ; X86-LABEL: test_mm512_mask3_fnmadd_ps:
3838 ; X86: # %bb.0: # %entry
3839 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3840 ; X86-NEXT: kmovw %eax, %k1
3841 ; X86-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3842 ; X86-NEXT: vmovaps %zmm2, %zmm0
3845 ; X64-LABEL: test_mm512_mask3_fnmadd_ps:
3846 ; X64: # %bb.0: # %entry
3847 ; X64-NEXT: kmovw %edi, %k1
3848 ; X64-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2
3849 ; X64-NEXT: vmovaps %zmm2, %zmm0
3852 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3853 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3854 %1 = bitcast i16 %__U to <16 x i1>
3855 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
3859 define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3860 ; X86-LABEL: test_mm512_maskz_fnmadd_ps:
3861 ; X86: # %bb.0: # %entry
3862 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3863 ; X86-NEXT: kmovw %eax, %k1
3864 ; X86-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3867 ; X64-LABEL: test_mm512_maskz_fnmadd_ps:
3868 ; X64: # %bb.0: # %entry
3869 ; X64-NEXT: kmovw %edi, %k1
3870 ; X64-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
3873 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3874 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
3875 %1 = bitcast i16 %__U to <16 x i1>
3876 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3880 define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3881 ; CHECK-LABEL: test_mm512_fnmsub_ps:
3882 ; CHECK: # %bb.0: # %entry
3883 ; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
3884 ; CHECK-NEXT: vpxord %zmm3, %zmm0, %zmm4
3885 ; CHECK-NEXT: vpxord %zmm3, %zmm2, %zmm0
3886 ; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
3887 ; CHECK-NEXT: ret{{[l|q]}}
3889 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3890 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3891 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
3895 define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
3896 ; X86-LABEL: test_mm512_maskz_fnmsub_ps:
3897 ; X86: # %bb.0: # %entry
3898 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
3899 ; X86-NEXT: kmovw %eax, %k1
3900 ; X86-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3903 ; X64-LABEL: test_mm512_maskz_fnmsub_ps:
3904 ; X64: # %bb.0: # %entry
3905 ; X64-NEXT: kmovw %edi, %k1
3906 ; X64-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
3909 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
3910 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
3911 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10
3912 %1 = bitcast i16 %__U to <16 x i1>
3913 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
3917 define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3918 ; CHECK-LABEL: test_mm512_fmaddsub_round_pd:
3919 ; CHECK: # %bb.0: # %entry
3920 ; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3921 ; CHECK-NEXT: ret{{[l|q]}}
3923 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3927 declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1
3929 define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
3930 ; X86-LABEL: test_mm512_mask_fmaddsub_round_pd:
3931 ; X86: # %bb.0: # %entry
3932 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3933 ; X86-NEXT: kmovw %eax, %k1
3934 ; X86-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3937 ; X64-LABEL: test_mm512_mask_fmaddsub_round_pd:
3938 ; X64: # %bb.0: # %entry
3939 ; X64-NEXT: kmovw %edi, %k1
3940 ; X64-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
3943 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3944 %1 = bitcast i8 %__U to <8 x i1>
3945 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
3949 define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
3950 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd:
3951 ; X86: # %bb.0: # %entry
3952 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3953 ; X86-NEXT: kmovw %eax, %k1
3954 ; X86-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3955 ; X86-NEXT: vmovapd %zmm2, %zmm0
3958 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd:
3959 ; X64: # %bb.0: # %entry
3960 ; X64-NEXT: kmovw %edi, %k1
3961 ; X64-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
3962 ; X64-NEXT: vmovapd %zmm2, %zmm0
3965 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3966 %1 = bitcast i8 %__U to <8 x i1>
3967 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
3971 define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3972 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd:
3973 ; X86: # %bb.0: # %entry
3974 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
3975 ; X86-NEXT: kmovw %eax, %k1
3976 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3979 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd:
3980 ; X64: # %bb.0: # %entry
3981 ; X64-NEXT: kmovw %edi, %k1
3982 ; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
3985 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8)
3986 %1 = bitcast i8 %__U to <8 x i1>
3987 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
3991 define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
3992 ; X86-LABEL: test_mm512_fmsubadd_round_pd:
3993 ; X86: # %bb.0: # %entry
3994 ; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2
3995 ; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
3998 ; X64-LABEL: test_mm512_fmsubadd_round_pd:
3999 ; X64: # %bb.0: # %entry
4000 ; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2
4001 ; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
4004 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4005 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4009 define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4010 ; X86-LABEL: test_mm512_mask_fmsubadd_round_pd:
4011 ; X86: # %bb.0: # %entry
4012 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4013 ; X86-NEXT: kmovw %eax, %k1
4014 ; X86-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4017 ; X64-LABEL: test_mm512_mask_fmsubadd_round_pd:
4018 ; X64: # %bb.0: # %entry
4019 ; X64-NEXT: kmovw %edi, %k1
4020 ; X64-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4023 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4024 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4025 %1 = bitcast i8 %__U to <8 x i1>
4026 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4030 define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4031 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd:
4032 ; X86: # %bb.0: # %entry
4033 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4034 ; X86-NEXT: kmovw %eax, %k1
4035 ; X86-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4038 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd:
4039 ; X64: # %bb.0: # %entry
4040 ; X64-NEXT: kmovw %edi, %k1
4041 ; X64-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4044 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4045 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4046 %1 = bitcast i8 %__U to <8 x i1>
4047 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
4051 define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4052 ; CHECK-LABEL: test_mm512_fmaddsub_pd:
4053 ; CHECK: # %bb.0: # %entry
4054 ; CHECK-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4055 ; CHECK-NEXT: ret{{[l|q]}}
4057 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4058 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4059 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4060 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4064 define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4065 ; X86-LABEL: test_mm512_mask_fmaddsub_pd:
4066 ; X86: # %bb.0: # %entry
4067 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4068 ; X86-NEXT: kmovw %eax, %k1
4069 ; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4072 ; X64-LABEL: test_mm512_mask_fmaddsub_pd:
4073 ; X64: # %bb.0: # %entry
4074 ; X64-NEXT: kmovw %edi, %k1
4075 ; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4078 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4079 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4080 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4081 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4082 %4 = bitcast i8 %__U to <8 x i1>
4083 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A
4087 define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4088 ; X86-LABEL: test_mm512_mask3_fmaddsub_pd:
4089 ; X86: # %bb.0: # %entry
4090 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4091 ; X86-NEXT: kmovw %eax, %k1
4092 ; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4093 ; X86-NEXT: vmovapd %zmm2, %zmm0
4096 ; X64-LABEL: test_mm512_mask3_fmaddsub_pd:
4097 ; X64: # %bb.0: # %entry
4098 ; X64-NEXT: kmovw %edi, %k1
4099 ; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4100 ; X64-NEXT: vmovapd %zmm2, %zmm0
4103 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4104 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4105 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4106 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4107 %4 = bitcast i8 %__U to <8 x i1>
4108 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C
4112 define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4113 ; X86-LABEL: test_mm512_maskz_fmaddsub_pd:
4114 ; X86: # %bb.0: # %entry
4115 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4116 ; X86-NEXT: kmovw %eax, %k1
4117 ; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4120 ; X64-LABEL: test_mm512_maskz_fmaddsub_pd:
4121 ; X64: # %bb.0: # %entry
4122 ; X64-NEXT: kmovw %edi, %k1
4123 ; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4126 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4127 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4128 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10
4129 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4130 %4 = bitcast i8 %__U to <8 x i1>
4131 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer
4135 define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4136 ; CHECK-LABEL: test_mm512_fmsubadd_pd:
4137 ; CHECK: # %bb.0: # %entry
4138 ; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4139 ; CHECK-NEXT: ret{{[l|q]}}
4141 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4142 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4143 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4144 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4148 define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4149 ; X86-LABEL: test_mm512_mask_fmsubadd_pd:
4150 ; X86: # %bb.0: # %entry
4151 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4152 ; X86-NEXT: kmovw %eax, %k1
4153 ; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4156 ; X64-LABEL: test_mm512_mask_fmsubadd_pd:
4157 ; X64: # %bb.0: # %entry
4158 ; X64-NEXT: kmovw %edi, %k1
4159 ; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4162 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4163 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4164 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4165 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4166 %3 = bitcast i8 %__U to <8 x i1>
4167 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A
4171 define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
4172 ; X86-LABEL: test_mm512_maskz_fmsubadd_pd:
4173 ; X86: # %bb.0: # %entry
4174 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4175 ; X86-NEXT: kmovw %eax, %k1
4176 ; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4179 ; X64-LABEL: test_mm512_maskz_fmsubadd_pd:
4180 ; X64: # %bb.0: # %entry
4181 ; X64-NEXT: kmovw %edi, %k1
4182 ; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4185 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4186 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4187 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4188 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4189 %3 = bitcast i8 %__U to <8 x i1>
4190 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
4194 define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4195 ; CHECK-LABEL: test_mm512_fmaddsub_round_ps:
4196 ; CHECK: # %bb.0: # %entry
4197 ; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4198 ; CHECK-NEXT: ret{{[l|q]}}
4200 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4204 declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1
4206 define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4207 ; X86-LABEL: test_mm512_mask_fmaddsub_round_ps:
4208 ; X86: # %bb.0: # %entry
4209 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4210 ; X86-NEXT: kmovw %eax, %k1
4211 ; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4214 ; X64-LABEL: test_mm512_mask_fmaddsub_round_ps:
4215 ; X64: # %bb.0: # %entry
4216 ; X64-NEXT: kmovw %edi, %k1
4217 ; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4220 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4221 %1 = bitcast i16 %__U to <16 x i1>
4222 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4226 define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4227 ; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4228 ; X86: # %bb.0: # %entry
4229 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4230 ; X86-NEXT: kmovw %eax, %k1
4231 ; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4232 ; X86-NEXT: vmovaps %zmm2, %zmm0
4235 ; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps:
4236 ; X64: # %bb.0: # %entry
4237 ; X64-NEXT: kmovw %edi, %k1
4238 ; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4239 ; X64-NEXT: vmovaps %zmm2, %zmm0
4242 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4243 %1 = bitcast i16 %__U to <16 x i1>
4244 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4248 define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4249 ; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4250 ; X86: # %bb.0: # %entry
4251 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4252 ; X86-NEXT: kmovw %eax, %k1
4253 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4256 ; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps:
4257 ; X64: # %bb.0: # %entry
4258 ; X64-NEXT: kmovw %edi, %k1
4259 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4262 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8)
4263 %1 = bitcast i16 %__U to <16 x i1>
4264 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
4268 define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4269 ; X86-LABEL: test_mm512_fmsubadd_round_ps:
4270 ; X86: # %bb.0: # %entry
4271 ; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2
4272 ; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4275 ; X64-LABEL: test_mm512_fmsubadd_round_ps:
4276 ; X64: # %bb.0: # %entry
4277 ; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2
4278 ; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
4281 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4282 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4286 define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4287 ; X86-LABEL: test_mm512_mask_fmsubadd_round_ps:
4288 ; X86: # %bb.0: # %entry
4289 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4290 ; X86-NEXT: kmovw %eax, %k1
4291 ; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4294 ; X64-LABEL: test_mm512_mask_fmsubadd_round_ps:
4295 ; X64: # %bb.0: # %entry
4296 ; X64-NEXT: kmovw %edi, %k1
4297 ; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4300 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4301 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4302 %1 = bitcast i16 %__U to <16 x i1>
4303 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4307 define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4308 ; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4309 ; X86: # %bb.0: # %entry
4310 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4311 ; X86-NEXT: kmovw %eax, %k1
4312 ; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4315 ; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps:
4316 ; X64: # %bb.0: # %entry
4317 ; X64-NEXT: kmovw %edi, %k1
4318 ; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
4321 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4322 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4323 %1 = bitcast i16 %__U to <16 x i1>
4324 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
4328 define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4329 ; CHECK-LABEL: test_mm512_fmaddsub_ps:
4330 ; CHECK: # %bb.0: # %entry
4331 ; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4332 ; CHECK-NEXT: ret{{[l|q]}}
4334 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4335 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4336 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4337 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4341 define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4342 ; X86-LABEL: test_mm512_mask_fmaddsub_ps:
4343 ; X86: # %bb.0: # %entry
4344 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4345 ; X86-NEXT: kmovw %eax, %k1
4346 ; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4349 ; X64-LABEL: test_mm512_mask_fmaddsub_ps:
4350 ; X64: # %bb.0: # %entry
4351 ; X64-NEXT: kmovw %edi, %k1
4352 ; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2
4355 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4356 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4357 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4358 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4359 %4 = bitcast i16 %__U to <16 x i1>
4360 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A
4364 define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4365 ; X86-LABEL: test_mm512_mask3_fmaddsub_ps:
4366 ; X86: # %bb.0: # %entry
4367 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4368 ; X86-NEXT: kmovw %eax, %k1
4369 ; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4370 ; X86-NEXT: vmovaps %zmm2, %zmm0
4373 ; X64-LABEL: test_mm512_mask3_fmaddsub_ps:
4374 ; X64: # %bb.0: # %entry
4375 ; X64-NEXT: kmovw %edi, %k1
4376 ; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2
4377 ; X64-NEXT: vmovaps %zmm2, %zmm0
4380 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4381 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4382 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4383 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4384 %4 = bitcast i16 %__U to <16 x i1>
4385 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C
4389 define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4390 ; X86-LABEL: test_mm512_maskz_fmaddsub_ps:
4391 ; X86: # %bb.0: # %entry
4392 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4393 ; X86-NEXT: kmovw %eax, %k1
4394 ; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4397 ; X64-LABEL: test_mm512_maskz_fmaddsub_ps:
4398 ; X64: # %bb.0: # %entry
4399 ; X64-NEXT: kmovw %edi, %k1
4400 ; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2
4403 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4404 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4405 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10
4406 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4407 %4 = bitcast i16 %__U to <16 x i1>
4408 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer
4412 define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4413 ; CHECK-LABEL: test_mm512_fmsubadd_ps:
4414 ; CHECK: # %bb.0: # %entry
4415 ; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4416 ; CHECK-NEXT: ret{{[l|q]}}
4418 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4419 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4420 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4421 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4425 define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4426 ; X86-LABEL: test_mm512_mask_fmsubadd_ps:
4427 ; X86: # %bb.0: # %entry
4428 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4429 ; X86-NEXT: kmovw %eax, %k1
4430 ; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4433 ; X64-LABEL: test_mm512_mask_fmsubadd_ps:
4434 ; X64: # %bb.0: # %entry
4435 ; X64-NEXT: kmovw %edi, %k1
4436 ; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
4439 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4440 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4441 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4442 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4443 %3 = bitcast i16 %__U to <16 x i1>
4444 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A
4448 define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
4449 ; X86-LABEL: test_mm512_maskz_fmsubadd_ps:
4450 ; X86: # %bb.0: # %entry
4451 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4452 ; X86-NEXT: kmovw %eax, %k1
4453 ; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4456 ; X64-LABEL: test_mm512_maskz_fmsubadd_ps:
4457 ; X64: # %bb.0: # %entry
4458 ; X64-NEXT: kmovw %edi, %k1
4459 ; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2
4462 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4463 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4464 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4465 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4466 %3 = bitcast i16 %__U to <16 x i1>
4467 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
4471 define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4472 ; X86-LABEL: test_mm512_mask3_fmsub_round_pd:
4473 ; X86: # %bb.0: # %entry
4474 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4475 ; X86-NEXT: kmovw %eax, %k1
4476 ; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4477 ; X86-NEXT: vmovapd %zmm2, %zmm0
4480 ; X64-LABEL: test_mm512_mask3_fmsub_round_pd:
4481 ; X64: # %bb.0: # %entry
4482 ; X64-NEXT: kmovw %edi, %k1
4483 ; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4484 ; X64-NEXT: vmovapd %zmm2, %zmm0
4487 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4488 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4489 %1 = bitcast i8 %__U to <8 x i1>
4490 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4494 define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4495 ; X86-LABEL: test_mm512_mask3_fmsub_pd:
4496 ; X86: # %bb.0: # %entry
4497 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4498 ; X86-NEXT: kmovw %eax, %k1
4499 ; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4500 ; X86-NEXT: vmovapd %zmm2, %zmm0
4503 ; X64-LABEL: test_mm512_mask3_fmsub_pd:
4504 ; X64: # %bb.0: # %entry
4505 ; X64-NEXT: kmovw %edi, %k1
4506 ; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4507 ; X64-NEXT: vmovapd %zmm2, %zmm0
4510 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4511 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4512 %1 = bitcast i8 %__U to <8 x i1>
4513 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4517 define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4518 ; X86-LABEL: test_mm512_mask3_fmsub_round_ps:
4519 ; X86: # %bb.0: # %entry
4520 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4521 ; X86-NEXT: kmovw %eax, %k1
4522 ; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4523 ; X86-NEXT: vmovaps %zmm2, %zmm0
4526 ; X64-LABEL: test_mm512_mask3_fmsub_round_ps:
4527 ; X64: # %bb.0: # %entry
4528 ; X64-NEXT: kmovw %edi, %k1
4529 ; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4530 ; X64-NEXT: vmovaps %zmm2, %zmm0
4533 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4534 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4535 %1 = bitcast i16 %__U to <16 x i1>
4536 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4540 define <16 x float> @test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4541 ; X86-LABEL: test_mm512_mask3_fmsub_ps:
4542 ; X86: # %bb.0: # %entry
4543 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4544 ; X86-NEXT: kmovw %eax, %k1
4545 ; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4546 ; X86-NEXT: vmovaps %zmm2, %zmm0
4549 ; X64-LABEL: test_mm512_mask3_fmsub_ps:
4550 ; X64: # %bb.0: # %entry
4551 ; X64-NEXT: kmovw %edi, %k1
4552 ; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
4553 ; X64-NEXT: vmovaps %zmm2, %zmm0
4556 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4557 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4558 %1 = bitcast i16 %__U to <16 x i1>
4559 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4563 define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4564 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4565 ; X86: # %bb.0: # %entry
4566 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4567 ; X86-NEXT: kmovw %eax, %k1
4568 ; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4569 ; X86-NEXT: vmovapd %zmm2, %zmm0
4572 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd:
4573 ; X64: # %bb.0: # %entry
4574 ; X64-NEXT: kmovw %edi, %k1
4575 ; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4576 ; X64-NEXT: vmovapd %zmm2, %zmm0
4579 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4580 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8)
4581 %1 = bitcast i8 %__U to <8 x i1>
4582 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4586 define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4587 ; X86-LABEL: test_mm512_mask3_fmsubadd_pd:
4588 ; X86: # %bb.0: # %entry
4589 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4590 ; X86-NEXT: kmovw %eax, %k1
4591 ; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4592 ; X86-NEXT: vmovapd %zmm2, %zmm0
4595 ; X64-LABEL: test_mm512_mask3_fmsubadd_pd:
4596 ; X64: # %bb.0: # %entry
4597 ; X64-NEXT: kmovw %edi, %k1
4598 ; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4599 ; X64-NEXT: vmovapd %zmm2, %zmm0
4602 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4603 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10
4604 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10
4605 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
4606 %3 = bitcast i8 %__U to <8 x i1>
4607 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C
4611 define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4612 ; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4613 ; X86: # %bb.0: # %entry
4614 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4615 ; X86-NEXT: kmovw %eax, %k1
4616 ; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4617 ; X86-NEXT: vmovaps %zmm2, %zmm0
4620 ; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps:
4621 ; X64: # %bb.0: # %entry
4622 ; X64-NEXT: kmovw %edi, %k1
4623 ; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4624 ; X64-NEXT: vmovaps %zmm2, %zmm0
4627 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4628 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8)
4629 %1 = bitcast i16 %__U to <16 x i1>
4630 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4634 define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4635 ; X86-LABEL: test_mm512_mask3_fmsubadd_ps:
4636 ; X86: # %bb.0: # %entry
4637 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4638 ; X86-NEXT: kmovw %eax, %k1
4639 ; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4640 ; X86-NEXT: vmovaps %zmm2, %zmm0
4643 ; X64-LABEL: test_mm512_mask3_fmsubadd_ps:
4644 ; X64: # %bb.0: # %entry
4645 ; X64-NEXT: kmovw %edi, %k1
4646 ; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2
4647 ; X64-NEXT: vmovaps %zmm2, %zmm0
4650 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4651 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10
4652 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10
4653 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
4654 %3 = bitcast i16 %__U to <16 x i1>
4655 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C
4659 define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4660 ; X86-LABEL: test_mm512_mask_fnmadd_round_pd:
4661 ; X86: # %bb.0: # %entry
4662 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4663 ; X86-NEXT: kmovw %eax, %k1
4664 ; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4667 ; X64-LABEL: test_mm512_mask_fnmadd_round_pd:
4668 ; X64: # %bb.0: # %entry
4669 ; X64-NEXT: kmovw %edi, %k1
4670 ; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4673 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4674 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8)
4675 %1 = bitcast i8 %__U to <8 x i1>
4676 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4680 define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4681 ; X86-LABEL: test_mm512_mask_fnmadd_pd:
4682 ; X86: # %bb.0: # %entry
4683 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4684 ; X86-NEXT: kmovw %eax, %k1
4685 ; X86-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4688 ; X64-LABEL: test_mm512_mask_fnmadd_pd:
4689 ; X64: # %bb.0: # %entry
4690 ; X64-NEXT: kmovw %edi, %k1
4691 ; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4694 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A
4695 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10
4696 %1 = bitcast i8 %__U to <8 x i1>
4697 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4701 define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4702 ; X86-LABEL: test_mm512_mask_fnmadd_round_ps:
4703 ; X86: # %bb.0: # %entry
4704 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4705 ; X86-NEXT: kmovw %eax, %k1
4706 ; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4709 ; X64-LABEL: test_mm512_mask_fnmadd_round_ps:
4710 ; X64: # %bb.0: # %entry
4711 ; X64-NEXT: kmovw %edi, %k1
4712 ; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4715 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4716 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8)
4717 %1 = bitcast i16 %__U to <16 x i1>
4718 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4722 define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4723 ; X86-LABEL: test_mm512_mask_fnmadd_ps:
4724 ; X86: # %bb.0: # %entry
4725 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4726 ; X86-NEXT: kmovw %eax, %k1
4727 ; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4730 ; X64-LABEL: test_mm512_mask_fnmadd_ps:
4731 ; X64: # %bb.0: # %entry
4732 ; X64-NEXT: kmovw %edi, %k1
4733 ; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2
4736 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A
4737 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10
4738 %1 = bitcast i16 %__U to <16 x i1>
4739 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4743 define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4744 ; X86-LABEL: test_mm512_mask_fnmsub_round_pd:
4745 ; X86: # %bb.0: # %entry
4746 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4747 ; X86-NEXT: kmovw %eax, %k1
4748 ; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4751 ; X64-LABEL: test_mm512_mask_fnmsub_round_pd:
4752 ; X64: # %bb.0: # %entry
4753 ; X64-NEXT: kmovw %edi, %k1
4754 ; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4757 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4758 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4759 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4760 %1 = bitcast i8 %__U to <8 x i1>
4761 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4765 define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4766 ; X86-LABEL: test_mm512_mask3_fnmsub_round_pd:
4767 ; X86: # %bb.0: # %entry
4768 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4769 ; X86-NEXT: kmovw %eax, %k1
4770 ; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4771 ; X86-NEXT: vmovapd %zmm2, %zmm0
4774 ; X64-LABEL: test_mm512_mask3_fnmsub_round_pd:
4775 ; X64: # %bb.0: # %entry
4776 ; X64-NEXT: kmovw %edi, %k1
4777 ; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4778 ; X64-NEXT: vmovapd %zmm2, %zmm0
4781 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4782 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4783 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8)
4784 %1 = bitcast i8 %__U to <8 x i1>
4785 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4789 define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) {
4790 ; X86-LABEL: test_mm512_mask_fnmsub_pd:
4791 ; X86: # %bb.0: # %entry
4792 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4793 ; X86-NEXT: kmovw %eax, %k1
4794 ; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4797 ; X64-LABEL: test_mm512_mask_fnmsub_pd:
4798 ; X64: # %bb.0: # %entry
4799 ; X64-NEXT: kmovw %edi, %k1
4800 ; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4803 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4804 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4805 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4806 %1 = bitcast i8 %__U to <8 x i1>
4807 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A
4811 define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) {
4812 ; X86-LABEL: test_mm512_mask3_fnmsub_pd:
4813 ; X86: # %bb.0: # %entry
4814 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4815 ; X86-NEXT: kmovw %eax, %k1
4816 ; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4817 ; X86-NEXT: vmovapd %zmm2, %zmm0
4820 ; X64-LABEL: test_mm512_mask3_fnmsub_pd:
4821 ; X64: # %bb.0: # %entry
4822 ; X64-NEXT: kmovw %edi, %k1
4823 ; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4824 ; X64-NEXT: vmovapd %zmm2, %zmm0
4827 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B
4828 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C
4829 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10
4830 %1 = bitcast i8 %__U to <8 x i1>
4831 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C
4835 define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4836 ; X86-LABEL: test_mm512_mask_fnmsub_round_ps:
4837 ; X86: # %bb.0: # %entry
4838 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4839 ; X86-NEXT: kmovw %eax, %k1
4840 ; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4843 ; X64-LABEL: test_mm512_mask_fnmsub_round_ps:
4844 ; X64: # %bb.0: # %entry
4845 ; X64-NEXT: kmovw %edi, %k1
4846 ; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
4849 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4850 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4851 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4852 %1 = bitcast i16 %__U to <16 x i1>
4853 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4857 define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4858 ; X86-LABEL: test_mm512_mask3_fnmsub_round_ps:
4859 ; X86: # %bb.0: # %entry
4860 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4861 ; X86-NEXT: kmovw %eax, %k1
4862 ; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4863 ; X86-NEXT: vmovaps %zmm2, %zmm0
4866 ; X64-LABEL: test_mm512_mask3_fnmsub_round_ps:
4867 ; X64: # %bb.0: # %entry
4868 ; X64-NEXT: kmovw %edi, %k1
4869 ; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
4870 ; X64-NEXT: vmovaps %zmm2, %zmm0
4873 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4874 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4875 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8)
4876 %1 = bitcast i16 %__U to <16 x i1>
4877 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4881 define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) {
4882 ; X86-LABEL: test_mm512_mask_fnmsub_ps:
4883 ; X86: # %bb.0: # %entry
4884 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4885 ; X86-NEXT: kmovw %eax, %k1
4886 ; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4889 ; X64-LABEL: test_mm512_mask_fnmsub_ps:
4890 ; X64: # %bb.0: # %entry
4891 ; X64-NEXT: kmovw %edi, %k1
4892 ; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2
4895 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4896 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4897 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4898 %1 = bitcast i16 %__U to <16 x i1>
4899 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A
4903 define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) {
4904 ; X86-LABEL: test_mm512_mask3_fnmsub_ps:
4905 ; X86: # %bb.0: # %entry
4906 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
4907 ; X86-NEXT: kmovw %eax, %k1
4908 ; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4909 ; X86-NEXT: vmovaps %zmm2, %zmm0
4912 ; X64-LABEL: test_mm512_mask3_fnmsub_ps:
4913 ; X64: # %bb.0: # %entry
4914 ; X64-NEXT: kmovw %edi, %k1
4915 ; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2
4916 ; X64-NEXT: vmovaps %zmm2, %zmm0
4919 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B
4920 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C
4921 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10
4922 %1 = bitcast i16 %__U to <16 x i1>
4923 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C
4927 define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
4928 ; X86-LABEL: test_mm_mask_fmadd_ss:
4929 ; X86: # %bb.0: # %entry
4930 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4931 ; X86-NEXT: kmovw %eax, %k1
4932 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4935 ; X64-LABEL: test_mm_mask_fmadd_ss:
4936 ; X64: # %bb.0: # %entry
4937 ; X64-NEXT: kmovw %edi, %k1
4938 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4941 %0 = extractelement <4 x float> %__W, i64 0
4942 %1 = extractelement <4 x float> %__A, i64 0
4943 %2 = extractelement <4 x float> %__B, i64 0
4944 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
4946 %tobool.i = icmp eq i8 %4, 0
4947 %vecext1.i = extractelement <4 x float> %__W, i32 0
4948 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
4949 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
4950 ret <4 x float> %vecins.i
4953 define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
4954 ; X86-LABEL: test_mm_mask_fmadd_round_ss:
4955 ; X86: # %bb.0: # %entry
4956 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4957 ; X86-NEXT: kmovw %eax, %k1
4958 ; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
4961 ; X64-LABEL: test_mm_mask_fmadd_round_ss:
4962 ; X64: # %bb.0: # %entry
4963 ; X64-NEXT: kmovw %edi, %k1
4964 ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
4967 %0 = extractelement <4 x float> %__W, i64 0
4968 %1 = extractelement <4 x float> %__A, i64 0
4969 %2 = extractelement <4 x float> %__B, i64 0
4970 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
4971 %4 = bitcast i8 %__U to <8 x i1>
4972 %5 = extractelement <8 x i1> %4, i64 0
4973 %6 = select i1 %5, float %3, float %0
4974 %7 = insertelement <4 x float> %__W, float %6, i64 0
4978 declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1
4980 define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
4981 ; X86-LABEL: test_mm_maskz_fmadd_ss:
4982 ; X86: # %bb.0: # %entry
4983 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
4984 ; X86-NEXT: kmovw %eax, %k1
4985 ; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4988 ; X64-LABEL: test_mm_maskz_fmadd_ss:
4989 ; X64: # %bb.0: # %entry
4990 ; X64-NEXT: kmovw %edi, %k1
4991 ; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
4994 %0 = extractelement <4 x float> %__A, i64 0
4995 %1 = extractelement <4 x float> %__B, i64 0
4996 %2 = extractelement <4 x float> %__C, i64 0
4997 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
4999 %tobool.i = icmp eq i8 %4, 0
5000 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5001 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5002 ret <4 x float> %vecins.i
5005 define <4 x float> @test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5006 ; X86-LABEL: test_mm_maskz_fmadd_round_ss:
5007 ; X86: # %bb.0: # %entry
5008 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5009 ; X86-NEXT: kmovw %eax, %k1
5010 ; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5013 ; X64-LABEL: test_mm_maskz_fmadd_round_ss:
5014 ; X64: # %bb.0: # %entry
5015 ; X64-NEXT: kmovw %edi, %k1
5016 ; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5019 %0 = extractelement <4 x float> %__A, i64 0
5020 %1 = extractelement <4 x float> %__B, i64 0
5021 %2 = extractelement <4 x float> %__C, i64 0
5022 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5023 %4 = bitcast i8 %__U to <8 x i1>
5024 %5 = extractelement <8 x i1> %4, i64 0
5025 %6 = select i1 %5, float %3, float 0.000000e+00
5026 %7 = insertelement <4 x float> %__A, float %6, i64 0
5030 define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5031 ; X86-LABEL: test_mm_mask3_fmadd_ss:
5032 ; X86: # %bb.0: # %entry
5033 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5034 ; X86-NEXT: kmovw %eax, %k1
5035 ; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5036 ; X86-NEXT: vmovaps %xmm2, %xmm0
5039 ; X64-LABEL: test_mm_mask3_fmadd_ss:
5040 ; X64: # %bb.0: # %entry
5041 ; X64-NEXT: kmovw %edi, %k1
5042 ; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5043 ; X64-NEXT: vmovaps %xmm2, %xmm0
5046 %0 = extractelement <4 x float> %__W, i64 0
5047 %1 = extractelement <4 x float> %__X, i64 0
5048 %2 = extractelement <4 x float> %__Y, i64 0
5049 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5051 %tobool.i = icmp eq i8 %4, 0
5052 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5053 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5054 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5055 ret <4 x float> %vecins.i
5058 define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5059 ; X86-LABEL: test_mm_mask3_fmadd_round_ss:
5060 ; X86: # %bb.0: # %entry
5061 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5062 ; X86-NEXT: kmovw %eax, %k1
5063 ; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5064 ; X86-NEXT: vmovaps %xmm2, %xmm0
5067 ; X64-LABEL: test_mm_mask3_fmadd_round_ss:
5068 ; X64: # %bb.0: # %entry
5069 ; X64-NEXT: kmovw %edi, %k1
5070 ; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5071 ; X64-NEXT: vmovaps %xmm2, %xmm0
5074 %0 = extractelement <4 x float> %__W, i64 0
5075 %1 = extractelement <4 x float> %__X, i64 0
5076 %2 = extractelement <4 x float> %__Y, i64 0
5077 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5078 %4 = bitcast i8 %__U to <8 x i1>
5079 %5 = extractelement <8 x i1> %4, i64 0
5080 %6 = select i1 %5, float %3, float %2
5081 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5085 define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5086 ; X86-LABEL: test_mm_mask_fmsub_ss:
5087 ; X86: # %bb.0: # %entry
5088 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5089 ; X86-NEXT: kmovw %eax, %k1
5090 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5093 ; X64-LABEL: test_mm_mask_fmsub_ss:
5094 ; X64: # %bb.0: # %entry
5095 ; X64-NEXT: kmovw %edi, %k1
5096 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5099 %0 = extractelement <4 x float> %__W, i64 0
5100 %1 = extractelement <4 x float> %__A, i64 0
5101 %.rhs.i = extractelement <4 x float> %__B, i64 0
5102 %2 = fsub float -0.000000e+00, %.rhs.i
5103 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5105 %tobool.i = icmp eq i8 %4, 0
5106 %vecext1.i = extractelement <4 x float> %__W, i32 0
5107 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5108 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5109 ret <4 x float> %vecins.i
5112 define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5113 ; X86-LABEL: test_mm_mask_fmsub_round_ss:
5114 ; X86: # %bb.0: # %entry
5115 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5116 ; X86-NEXT: kmovw %eax, %k1
5117 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5120 ; X64-LABEL: test_mm_mask_fmsub_round_ss:
5121 ; X64: # %bb.0: # %entry
5122 ; X64-NEXT: kmovw %edi, %k1
5123 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5126 %0 = extractelement <4 x float> %__W, i64 0
5127 %1 = extractelement <4 x float> %__A, i64 0
5128 %.rhs = extractelement <4 x float> %__B, i64 0
5129 %2 = fsub float -0.000000e+00, %.rhs
5130 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5131 %4 = bitcast i8 %__U to <8 x i1>
5132 %5 = extractelement <8 x i1> %4, i64 0
5133 %6 = select i1 %5, float %3, float %0
5134 %7 = insertelement <4 x float> %__W, float %6, i64 0
5138 define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5139 ; X86-LABEL: test_mm_maskz_fmsub_ss:
5140 ; X86: # %bb.0: # %entry
5141 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5142 ; X86-NEXT: kmovw %eax, %k1
5143 ; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5146 ; X64-LABEL: test_mm_maskz_fmsub_ss:
5147 ; X64: # %bb.0: # %entry
5148 ; X64-NEXT: kmovw %edi, %k1
5149 ; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5152 %0 = extractelement <4 x float> %__A, i64 0
5153 %1 = extractelement <4 x float> %__B, i64 0
5154 %.rhs.i = extractelement <4 x float> %__C, i64 0
5155 %2 = fsub float -0.000000e+00, %.rhs.i
5156 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5158 %tobool.i = icmp eq i8 %4, 0
5159 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5160 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5161 ret <4 x float> %vecins.i
5164 define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5165 ; X86-LABEL: test_mm_maskz_fmsub_round_ss:
5166 ; X86: # %bb.0: # %entry
5167 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5168 ; X86-NEXT: kmovw %eax, %k1
5169 ; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5172 ; X64-LABEL: test_mm_maskz_fmsub_round_ss:
5173 ; X64: # %bb.0: # %entry
5174 ; X64-NEXT: kmovw %edi, %k1
5175 ; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5178 %0 = extractelement <4 x float> %__A, i64 0
5179 %1 = extractelement <4 x float> %__B, i64 0
5180 %.rhs = extractelement <4 x float> %__C, i64 0
5181 %2 = fsub float -0.000000e+00, %.rhs
5182 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5183 %4 = bitcast i8 %__U to <8 x i1>
5184 %5 = extractelement <8 x i1> %4, i64 0
5185 %6 = select i1 %5, float %3, float 0.000000e+00
5186 %7 = insertelement <4 x float> %__A, float %6, i64 0
5190 define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5191 ; X86-LABEL: test_mm_mask3_fmsub_ss:
5192 ; X86: # %bb.0: # %entry
5193 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5194 ; X86-NEXT: kmovw %eax, %k1
5195 ; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5196 ; X86-NEXT: vmovaps %xmm2, %xmm0
5199 ; X64-LABEL: test_mm_mask3_fmsub_ss:
5200 ; X64: # %bb.0: # %entry
5201 ; X64-NEXT: kmovw %edi, %k1
5202 ; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5203 ; X64-NEXT: vmovaps %xmm2, %xmm0
5206 %0 = extractelement <4 x float> %__W, i64 0
5207 %1 = extractelement <4 x float> %__X, i64 0
5208 %.rhs.i = extractelement <4 x float> %__Y, i64 0
5209 %2 = fsub float -0.000000e+00, %.rhs.i
5210 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5212 %tobool.i = icmp eq i8 %4, 0
5213 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5214 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5215 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5216 ret <4 x float> %vecins.i
5219 define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5220 ; X86-LABEL: test_mm_mask3_fmsub_round_ss:
5221 ; X86: # %bb.0: # %entry
5222 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5223 ; X86-NEXT: kmovw %eax, %k1
5224 ; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5225 ; X86-NEXT: vmovaps %xmm2, %xmm0
5228 ; X64-LABEL: test_mm_mask3_fmsub_round_ss:
5229 ; X64: # %bb.0: # %entry
5230 ; X64-NEXT: kmovw %edi, %k1
5231 ; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5232 ; X64-NEXT: vmovaps %xmm2, %xmm0
5235 %0 = extractelement <4 x float> %__W, i64 0
5236 %1 = extractelement <4 x float> %__X, i64 0
5237 %.rhs = extractelement <4 x float> %__Y, i64 0
5238 %2 = fsub float -0.000000e+00, %.rhs
5239 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5240 %4 = bitcast i8 %__U to <8 x i1>
5241 %5 = extractelement <8 x i1> %4, i64 0
5242 %6 = select i1 %5, float %3, float %.rhs
5243 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5247 define <4 x float> @test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5248 ; X86-LABEL: test_mm_mask_fnmadd_ss:
5249 ; X86: # %bb.0: # %entry
5250 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5251 ; X86-NEXT: kmovw %eax, %k1
5252 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5255 ; X64-LABEL: test_mm_mask_fnmadd_ss:
5256 ; X64: # %bb.0: # %entry
5257 ; X64-NEXT: kmovw %edi, %k1
5258 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5261 %0 = extractelement <4 x float> %__W, i64 0
5262 %.rhs.i = extractelement <4 x float> %__A, i64 0
5263 %1 = fsub float -0.000000e+00, %.rhs.i
5264 %2 = extractelement <4 x float> %__B, i64 0
5265 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5267 %tobool.i = icmp eq i8 %4, 0
5268 %vecext1.i = extractelement <4 x float> %__W, i32 0
5269 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5270 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5271 ret <4 x float> %vecins.i
5274 define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5275 ; X86-LABEL: test_mm_mask_fnmadd_round_ss:
5276 ; X86: # %bb.0: # %entry
5277 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5278 ; X86-NEXT: kmovw %eax, %k1
5279 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5282 ; X64-LABEL: test_mm_mask_fnmadd_round_ss:
5283 ; X64: # %bb.0: # %entry
5284 ; X64-NEXT: kmovw %edi, %k1
5285 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5288 %0 = extractelement <4 x float> %__W, i64 0
5289 %.rhs = extractelement <4 x float> %__A, i64 0
5290 %1 = fsub float -0.000000e+00, %.rhs
5291 %2 = extractelement <4 x float> %__B, i64 0
5292 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5293 %4 = bitcast i8 %__U to <8 x i1>
5294 %5 = extractelement <8 x i1> %4, i64 0
5295 %6 = select i1 %5, float %3, float %0
5296 %7 = insertelement <4 x float> %__W, float %6, i64 0
5300 define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5301 ; X86-LABEL: test_mm_maskz_fnmadd_ss:
5302 ; X86: # %bb.0: # %entry
5303 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5304 ; X86-NEXT: kmovw %eax, %k1
5305 ; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5308 ; X64-LABEL: test_mm_maskz_fnmadd_ss:
5309 ; X64: # %bb.0: # %entry
5310 ; X64-NEXT: kmovw %edi, %k1
5311 ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5314 %0 = extractelement <4 x float> %__A, i64 0
5315 %.rhs.i = extractelement <4 x float> %__B, i64 0
5316 %1 = fsub float -0.000000e+00, %.rhs.i
5317 %2 = extractelement <4 x float> %__C, i64 0
5318 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5320 %tobool.i = icmp eq i8 %4, 0
5321 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5322 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5323 ret <4 x float> %vecins.i
5326 define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5327 ; X86-LABEL: test_mm_maskz_fnmadd_round_ss:
5328 ; X86: # %bb.0: # %entry
5329 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5330 ; X86-NEXT: kmovw %eax, %k1
5331 ; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5334 ; X64-LABEL: test_mm_maskz_fnmadd_round_ss:
5335 ; X64: # %bb.0: # %entry
5336 ; X64-NEXT: kmovw %edi, %k1
5337 ; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5340 %0 = extractelement <4 x float> %__A, i64 0
5341 %.rhs = extractelement <4 x float> %__B, i64 0
5342 %1 = fsub float -0.000000e+00, %.rhs
5343 %2 = extractelement <4 x float> %__C, i64 0
5344 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5345 %4 = bitcast i8 %__U to <8 x i1>
5346 %5 = extractelement <8 x i1> %4, i64 0
5347 %6 = select i1 %5, float %3, float 0.000000e+00
5348 %7 = insertelement <4 x float> %__A, float %6, i64 0
5352 define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5353 ; X86-LABEL: test_mm_mask3_fnmadd_ss:
5354 ; X86: # %bb.0: # %entry
5355 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5356 ; X86-NEXT: kmovw %eax, %k1
5357 ; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5358 ; X86-NEXT: vmovaps %xmm2, %xmm0
5361 ; X64-LABEL: test_mm_mask3_fnmadd_ss:
5362 ; X64: # %bb.0: # %entry
5363 ; X64-NEXT: kmovw %edi, %k1
5364 ; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
5365 ; X64-NEXT: vmovaps %xmm2, %xmm0
5368 %0 = extractelement <4 x float> %__W, i64 0
5369 %.rhs.i = extractelement <4 x float> %__X, i64 0
5370 %1 = fsub float -0.000000e+00, %.rhs.i
5371 %2 = extractelement <4 x float> %__Y, i64 0
5372 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5374 %tobool.i = icmp eq i8 %4, 0
5375 %vecext1.i = extractelement <4 x float> %__Y, i32 0
5376 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3
5377 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5378 ret <4 x float> %vecins.i
5381 define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5382 ; X86-LABEL: test_mm_mask3_fnmadd_round_ss:
5383 ; X86: # %bb.0: # %entry
5384 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5385 ; X86-NEXT: kmovw %eax, %k1
5386 ; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5387 ; X86-NEXT: vmovaps %xmm2, %xmm0
5390 ; X64-LABEL: test_mm_mask3_fnmadd_round_ss:
5391 ; X64: # %bb.0: # %entry
5392 ; X64-NEXT: kmovw %edi, %k1
5393 ; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5394 ; X64-NEXT: vmovaps %xmm2, %xmm0
5397 %0 = extractelement <4 x float> %__W, i64 0
5398 %.rhs = extractelement <4 x float> %__X, i64 0
5399 %1 = fsub float -0.000000e+00, %.rhs
5400 %2 = extractelement <4 x float> %__Y, i64 0
5401 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5402 %4 = bitcast i8 %__U to <8 x i1>
5403 %5 = extractelement <8 x i1> %4, i64 0
5404 %6 = select i1 %5, float %3, float %2
5405 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5409 define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5410 ; X86-LABEL: test_mm_mask_fnmsub_ss:
5411 ; X86: # %bb.0: # %entry
5412 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5413 ; X86-NEXT: kmovw %eax, %k1
5414 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5417 ; X64-LABEL: test_mm_mask_fnmsub_ss:
5418 ; X64: # %bb.0: # %entry
5419 ; X64-NEXT: kmovw %edi, %k1
5420 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5423 %0 = extractelement <4 x float> %__W, i64 0
5424 %.rhs.i = extractelement <4 x float> %__A, i64 0
5425 %1 = fsub float -0.000000e+00, %.rhs.i
5426 %.rhs7.i = extractelement <4 x float> %__B, i64 0
5427 %2 = fsub float -0.000000e+00, %.rhs7.i
5428 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5430 %tobool.i = icmp eq i8 %4, 0
5431 %vecext2.i = extractelement <4 x float> %__W, i32 0
5432 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5433 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0
5434 ret <4 x float> %vecins.i
5437 define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
5438 ; X86-LABEL: test_mm_mask_fnmsub_round_ss:
5439 ; X86: # %bb.0: # %entry
5440 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5441 ; X86-NEXT: kmovw %eax, %k1
5442 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5445 ; X64-LABEL: test_mm_mask_fnmsub_round_ss:
5446 ; X64: # %bb.0: # %entry
5447 ; X64-NEXT: kmovw %edi, %k1
5448 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5451 %0 = extractelement <4 x float> %__W, i64 0
5452 %.rhs = extractelement <4 x float> %__A, i64 0
5453 %1 = fsub float -0.000000e+00, %.rhs
5454 %.rhs2 = extractelement <4 x float> %__B, i64 0
5455 %2 = fsub float -0.000000e+00, %.rhs2
5456 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5457 %4 = bitcast i8 %__U to <8 x i1>
5458 %5 = extractelement <8 x i1> %4, i64 0
5459 %6 = select i1 %5, float %3, float %0
5460 %7 = insertelement <4 x float> %__W, float %6, i64 0
5464 define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5465 ; X86-LABEL: test_mm_maskz_fnmsub_ss:
5466 ; X86: # %bb.0: # %entry
5467 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5468 ; X86-NEXT: kmovw %eax, %k1
5469 ; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5472 ; X64-LABEL: test_mm_maskz_fnmsub_ss:
5473 ; X64: # %bb.0: # %entry
5474 ; X64-NEXT: kmovw %edi, %k1
5475 ; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
5478 %0 = extractelement <4 x float> %__A, i64 0
5479 %.rhs.i = extractelement <4 x float> %__B, i64 0
5480 %1 = fsub float -0.000000e+00, %.rhs.i
5481 %.rhs5.i = extractelement <4 x float> %__C, i64 0
5482 %2 = fsub float -0.000000e+00, %.rhs5.i
5483 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5485 %tobool.i = icmp eq i8 %4, 0
5486 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3
5487 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
5488 ret <4 x float> %vecins.i
5491 define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) {
5492 ; X86-LABEL: test_mm_maskz_fnmsub_round_ss:
5493 ; X86: # %bb.0: # %entry
5494 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5495 ; X86-NEXT: kmovw %eax, %k1
5496 ; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5499 ; X64-LABEL: test_mm_maskz_fnmsub_round_ss:
5500 ; X64: # %bb.0: # %entry
5501 ; X64-NEXT: kmovw %edi, %k1
5502 ; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5505 %0 = extractelement <4 x float> %__A, i64 0
5506 %.rhs = extractelement <4 x float> %__B, i64 0
5507 %1 = fsub float -0.000000e+00, %.rhs
5508 %.rhs2 = extractelement <4 x float> %__C, i64 0
5509 %2 = fsub float -0.000000e+00, %.rhs2
5510 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5511 %4 = bitcast i8 %__U to <8 x i1>
5512 %5 = extractelement <8 x i1> %4, i64 0
5513 %6 = select i1 %5, float %3, float 0.000000e+00
5514 %7 = insertelement <4 x float> %__A, float %6, i64 0
5518 define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5519 ; X86-LABEL: test_mm_mask3_fnmsub_ss:
5520 ; X86: # %bb.0: # %entry
5521 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5522 ; X86-NEXT: kmovw %eax, %k1
5523 ; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5524 ; X86-NEXT: vmovaps %xmm2, %xmm0
5527 ; X64-LABEL: test_mm_mask3_fnmsub_ss:
5528 ; X64: # %bb.0: # %entry
5529 ; X64-NEXT: kmovw %edi, %k1
5530 ; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
5531 ; X64-NEXT: vmovaps %xmm2, %xmm0
5534 %0 = extractelement <4 x float> %__W, i64 0
5535 %.rhs.i = extractelement <4 x float> %__X, i64 0
5536 %1 = fsub float -0.000000e+00, %.rhs.i
5537 %.rhs7.i = extractelement <4 x float> %__Y, i64 0
5538 %2 = fsub float -0.000000e+00, %.rhs7.i
5539 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10
5541 %tobool.i = icmp eq i8 %4, 0
5542 %vecext2.i = extractelement <4 x float> %__Y, i32 0
5543 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3
5544 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0
5545 ret <4 x float> %vecins.i
5548 define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) {
5549 ; X86-LABEL: test_mm_mask3_fnmsub_round_ss:
5550 ; X86: # %bb.0: # %entry
5551 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5552 ; X86-NEXT: kmovw %eax, %k1
5553 ; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5554 ; X86-NEXT: vmovaps %xmm2, %xmm0
5557 ; X64-LABEL: test_mm_mask3_fnmsub_round_ss:
5558 ; X64: # %bb.0: # %entry
5559 ; X64-NEXT: kmovw %edi, %k1
5560 ; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5561 ; X64-NEXT: vmovaps %xmm2, %xmm0
5564 %0 = extractelement <4 x float> %__W, i64 0
5565 %.rhs = extractelement <4 x float> %__X, i64 0
5566 %1 = fsub float -0.000000e+00, %.rhs
5567 %.rhs1 = extractelement <4 x float> %__Y, i64 0
5568 %2 = fsub float -0.000000e+00, %.rhs1
5569 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8)
5570 %4 = bitcast i8 %__U to <8 x i1>
5571 %5 = extractelement <8 x i1> %4, i64 0
5572 %6 = select i1 %5, float %3, float %.rhs1
5573 %7 = insertelement <4 x float> %__Y, float %6, i64 0
5577 define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5578 ; X86-LABEL: test_mm_mask_fmadd_sd:
5579 ; X86: # %bb.0: # %entry
5580 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5581 ; X86-NEXT: kmovw %eax, %k1
5582 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5585 ; X64-LABEL: test_mm_mask_fmadd_sd:
5586 ; X64: # %bb.0: # %entry
5587 ; X64-NEXT: kmovw %edi, %k1
5588 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5591 %0 = extractelement <2 x double> %__W, i64 0
5592 %1 = extractelement <2 x double> %__A, i64 0
5593 %2 = extractelement <2 x double> %__B, i64 0
5594 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5596 %tobool.i = icmp eq i8 %4, 0
5597 %vecext1.i = extractelement <2 x double> %__W, i32 0
5598 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5599 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5600 ret <2 x double> %vecins.i
5603 define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5604 ; X86-LABEL: test_mm_mask_fmadd_round_sd:
5605 ; X86: # %bb.0: # %entry
5606 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5607 ; X86-NEXT: kmovw %eax, %k1
5608 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5611 ; X64-LABEL: test_mm_mask_fmadd_round_sd:
5612 ; X64: # %bb.0: # %entry
5613 ; X64-NEXT: kmovw %edi, %k1
5614 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5617 %0 = extractelement <2 x double> %__W, i64 0
5618 %1 = extractelement <2 x double> %__A, i64 0
5619 %2 = extractelement <2 x double> %__B, i64 0
5620 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5621 %4 = bitcast i8 %__U to <8 x i1>
5622 %5 = extractelement <8 x i1> %4, i64 0
5623 %6 = select i1 %5, double %3, double %0
5624 %7 = insertelement <2 x double> %__W, double %6, i64 0
5628 declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1
5630 define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5631 ; X86-LABEL: test_mm_maskz_fmadd_sd:
5632 ; X86: # %bb.0: # %entry
5633 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5634 ; X86-NEXT: kmovw %eax, %k1
5635 ; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5638 ; X64-LABEL: test_mm_maskz_fmadd_sd:
5639 ; X64: # %bb.0: # %entry
5640 ; X64-NEXT: kmovw %edi, %k1
5641 ; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
5644 %0 = extractelement <2 x double> %__A, i64 0
5645 %1 = extractelement <2 x double> %__B, i64 0
5646 %2 = extractelement <2 x double> %__C, i64 0
5647 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5649 %tobool.i = icmp eq i8 %4, 0
5650 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5651 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5652 ret <2 x double> %vecins.i
5655 define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5656 ; X86-LABEL: test_mm_maskz_fmadd_round_sd:
5657 ; X86: # %bb.0: # %entry
5658 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5659 ; X86-NEXT: kmovw %eax, %k1
5660 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5663 ; X64-LABEL: test_mm_maskz_fmadd_round_sd:
5664 ; X64: # %bb.0: # %entry
5665 ; X64-NEXT: kmovw %edi, %k1
5666 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5669 %0 = extractelement <2 x double> %__A, i64 0
5670 %1 = extractelement <2 x double> %__B, i64 0
5671 %2 = extractelement <2 x double> %__C, i64 0
5672 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5673 %4 = bitcast i8 %__U to <8 x i1>
5674 %5 = extractelement <8 x i1> %4, i64 0
5675 %6 = select i1 %5, double %3, double 0.000000e+00
5676 %7 = insertelement <2 x double> %__A, double %6, i64 0
5680 define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5681 ; X86-LABEL: test_mm_mask3_fmadd_sd:
5682 ; X86: # %bb.0: # %entry
5683 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5684 ; X86-NEXT: kmovw %eax, %k1
5685 ; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5686 ; X86-NEXT: vmovapd %xmm2, %xmm0
5689 ; X64-LABEL: test_mm_mask3_fmadd_sd:
5690 ; X64: # %bb.0: # %entry
5691 ; X64-NEXT: kmovw %edi, %k1
5692 ; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2
5693 ; X64-NEXT: vmovapd %xmm2, %xmm0
5696 %0 = extractelement <2 x double> %__W, i64 0
5697 %1 = extractelement <2 x double> %__X, i64 0
5698 %2 = extractelement <2 x double> %__Y, i64 0
5699 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5701 %tobool.i = icmp eq i8 %4, 0
5702 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5703 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5704 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5705 ret <2 x double> %vecins.i
5708 define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5709 ; X86-LABEL: test_mm_mask3_fmadd_round_sd:
5710 ; X86: # %bb.0: # %entry
5711 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5712 ; X86-NEXT: kmovw %eax, %k1
5713 ; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5714 ; X86-NEXT: vmovapd %xmm2, %xmm0
5717 ; X64-LABEL: test_mm_mask3_fmadd_round_sd:
5718 ; X64: # %bb.0: # %entry
5719 ; X64-NEXT: kmovw %edi, %k1
5720 ; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
5721 ; X64-NEXT: vmovapd %xmm2, %xmm0
5724 %0 = extractelement <2 x double> %__W, i64 0
5725 %1 = extractelement <2 x double> %__X, i64 0
5726 %2 = extractelement <2 x double> %__Y, i64 0
5727 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5728 %4 = bitcast i8 %__U to <8 x i1>
5729 %5 = extractelement <8 x i1> %4, i64 0
5730 %6 = select i1 %5, double %3, double %2
5731 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5735 define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5736 ; X86-LABEL: test_mm_mask_fmsub_sd:
5737 ; X86: # %bb.0: # %entry
5738 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5739 ; X86-NEXT: kmovw %eax, %k1
5740 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5743 ; X64-LABEL: test_mm_mask_fmsub_sd:
5744 ; X64: # %bb.0: # %entry
5745 ; X64-NEXT: kmovw %edi, %k1
5746 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5749 %0 = extractelement <2 x double> %__W, i64 0
5750 %1 = extractelement <2 x double> %__A, i64 0
5751 %.rhs.i = extractelement <2 x double> %__B, i64 0
5752 %2 = fsub double -0.000000e+00, %.rhs.i
5753 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5755 %tobool.i = icmp eq i8 %4, 0
5756 %vecext1.i = extractelement <2 x double> %__W, i32 0
5757 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5758 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5759 ret <2 x double> %vecins.i
5762 define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5763 ; X86-LABEL: test_mm_mask_fmsub_round_sd:
5764 ; X86: # %bb.0: # %entry
5765 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5766 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
5767 ; X86-NEXT: kmovw %eax, %k1
5768 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5771 ; X64-LABEL: test_mm_mask_fmsub_round_sd:
5772 ; X64: # %bb.0: # %entry
5773 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2
5774 ; X64-NEXT: kmovw %edi, %k1
5775 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5778 %0 = extractelement <2 x double> %__W, i64 0
5779 %1 = extractelement <2 x double> %__A, i64 0
5780 %.rhs = extractelement <2 x double> %__B, i64 0
5781 %2 = fsub double -0.000000e+00, %.rhs
5782 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5783 %4 = bitcast i8 %__U to <8 x i1>
5784 %5 = extractelement <8 x i1> %4, i64 0
5785 %6 = select i1 %5, double %3, double %0
5786 %7 = insertelement <2 x double> %__W, double %6, i64 0
5790 define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5791 ; X86-LABEL: test_mm_maskz_fmsub_sd:
5792 ; X86: # %bb.0: # %entry
5793 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5794 ; X86-NEXT: kmovw %eax, %k1
5795 ; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5798 ; X64-LABEL: test_mm_maskz_fmsub_sd:
5799 ; X64: # %bb.0: # %entry
5800 ; X64-NEXT: kmovw %edi, %k1
5801 ; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
5804 %0 = extractelement <2 x double> %__A, i64 0
5805 %1 = extractelement <2 x double> %__B, i64 0
5806 %.rhs.i = extractelement <2 x double> %__C, i64 0
5807 %2 = fsub double -0.000000e+00, %.rhs.i
5808 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5810 %tobool.i = icmp eq i8 %4, 0
5811 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5812 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5813 ret <2 x double> %vecins.i
5816 define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5817 ; X86-LABEL: test_mm_maskz_fmsub_round_sd:
5818 ; X86: # %bb.0: # %entry
5819 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5820 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm2
5821 ; X86-NEXT: kmovw %eax, %k1
5822 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5825 ; X64-LABEL: test_mm_maskz_fmsub_round_sd:
5826 ; X64: # %bb.0: # %entry
5827 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2
5828 ; X64-NEXT: kmovw %edi, %k1
5829 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5832 %0 = extractelement <2 x double> %__A, i64 0
5833 %1 = extractelement <2 x double> %__B, i64 0
5834 %.rhs = extractelement <2 x double> %__C, i64 0
5835 %2 = fsub double -0.000000e+00, %.rhs
5836 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5837 %4 = bitcast i8 %__U to <8 x i1>
5838 %5 = extractelement <8 x i1> %4, i64 0
5839 %6 = select i1 %5, double %3, double 0.000000e+00
5840 %7 = insertelement <2 x double> %__A, double %6, i64 0
5844 define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5845 ; X86-LABEL: test_mm_mask3_fmsub_sd:
5846 ; X86: # %bb.0: # %entry
5847 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5848 ; X86-NEXT: kmovw %eax, %k1
5849 ; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5850 ; X86-NEXT: vmovapd %xmm2, %xmm0
5853 ; X64-LABEL: test_mm_mask3_fmsub_sd:
5854 ; X64: # %bb.0: # %entry
5855 ; X64-NEXT: kmovw %edi, %k1
5856 ; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
5857 ; X64-NEXT: vmovapd %xmm2, %xmm0
5860 %0 = extractelement <2 x double> %__W, i64 0
5861 %1 = extractelement <2 x double> %__X, i64 0
5862 %.rhs.i = extractelement <2 x double> %__Y, i64 0
5863 %2 = fsub double -0.000000e+00, %.rhs.i
5864 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5866 %tobool.i = icmp eq i8 %4, 0
5867 %vecext1.i = extractelement <2 x double> %__Y, i32 0
5868 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5869 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
5870 ret <2 x double> %vecins.i
5873 define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
5874 ; X86-LABEL: test_mm_mask3_fmsub_round_sd:
5875 ; X86: # %bb.0: # %entry
5876 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5877 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm3
5878 ; X86-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1
5879 ; X86-NEXT: kmovw %eax, %k1
5880 ; X86-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
5881 ; X86-NEXT: vmovapd %xmm2, %xmm0
5884 ; X64-LABEL: test_mm_mask3_fmsub_round_sd:
5885 ; X64: # %bb.0: # %entry
5886 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm3
5887 ; X64-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1
5888 ; X64-NEXT: kmovw %edi, %k1
5889 ; X64-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
5890 ; X64-NEXT: vmovapd %xmm2, %xmm0
5893 %0 = extractelement <2 x double> %__W, i64 0
5894 %1 = extractelement <2 x double> %__X, i64 0
5895 %.rhs = extractelement <2 x double> %__Y, i64 0
5896 %2 = fsub double -0.000000e+00, %.rhs
5897 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5898 %4 = bitcast i8 %__U to <8 x i1>
5899 %5 = extractelement <8 x i1> %4, i64 0
5900 %6 = select i1 %5, double %3, double %.rhs
5901 %7 = insertelement <2 x double> %__Y, double %6, i64 0
5905 define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5906 ; X86-LABEL: test_mm_mask_fnmadd_sd:
5907 ; X86: # %bb.0: # %entry
5908 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5909 ; X86-NEXT: kmovw %eax, %k1
5910 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5913 ; X64-LABEL: test_mm_mask_fnmadd_sd:
5914 ; X64: # %bb.0: # %entry
5915 ; X64-NEXT: kmovw %edi, %k1
5916 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5919 %0 = extractelement <2 x double> %__W, i64 0
5920 %.rhs.i = extractelement <2 x double> %__A, i64 0
5921 %1 = fsub double -0.000000e+00, %.rhs.i
5922 %2 = extractelement <2 x double> %__B, i64 0
5923 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5925 %tobool.i = icmp eq i8 %4, 0
5926 %vecext1.i = extractelement <2 x double> %__W, i32 0
5927 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
5928 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
5929 ret <2 x double> %vecins.i
5932 define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
5933 ; X86-LABEL: test_mm_mask_fnmadd_round_sd:
5934 ; X86: # %bb.0: # %entry
5935 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5936 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
5937 ; X86-NEXT: kmovw %eax, %k1
5938 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5941 ; X64-LABEL: test_mm_mask_fnmadd_round_sd:
5942 ; X64: # %bb.0: # %entry
5943 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
5944 ; X64-NEXT: kmovw %edi, %k1
5945 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
5948 %0 = extractelement <2 x double> %__W, i64 0
5949 %.rhs = extractelement <2 x double> %__A, i64 0
5950 %1 = fsub double -0.000000e+00, %.rhs
5951 %2 = extractelement <2 x double> %__B, i64 0
5952 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
5953 %4 = bitcast i8 %__U to <8 x i1>
5954 %5 = extractelement <8 x i1> %4, i64 0
5955 %6 = select i1 %5, double %3, double %0
5956 %7 = insertelement <2 x double> %__W, double %6, i64 0
5960 define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5961 ; X86-LABEL: test_mm_maskz_fnmadd_sd:
5962 ; X86: # %bb.0: # %entry
5963 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5964 ; X86-NEXT: kmovw %eax, %k1
5965 ; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5968 ; X64-LABEL: test_mm_maskz_fnmadd_sd:
5969 ; X64: # %bb.0: # %entry
5970 ; X64-NEXT: kmovw %edi, %k1
5971 ; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
5974 %0 = extractelement <2 x double> %__A, i64 0
5975 %.rhs.i = extractelement <2 x double> %__B, i64 0
5976 %1 = fsub double -0.000000e+00, %.rhs.i
5977 %2 = extractelement <2 x double> %__C, i64 0
5978 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
5980 %tobool.i = icmp eq i8 %4, 0
5981 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
5982 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
5983 ret <2 x double> %vecins.i
5986 define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
5987 ; X86-LABEL: test_mm_maskz_fnmadd_round_sd:
5988 ; X86: # %bb.0: # %entry
5989 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
5990 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
5991 ; X86-NEXT: kmovw %eax, %k1
5992 ; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
5995 ; X64-LABEL: test_mm_maskz_fnmadd_round_sd:
5996 ; X64: # %bb.0: # %entry
5997 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
5998 ; X64-NEXT: kmovw %edi, %k1
5999 ; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6002 %0 = extractelement <2 x double> %__A, i64 0
6003 %.rhs = extractelement <2 x double> %__B, i64 0
6004 %1 = fsub double -0.000000e+00, %.rhs
6005 %2 = extractelement <2 x double> %__C, i64 0
6006 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6007 %4 = bitcast i8 %__U to <8 x i1>
6008 %5 = extractelement <8 x i1> %4, i64 0
6009 %6 = select i1 %5, double %3, double 0.000000e+00
6010 %7 = insertelement <2 x double> %__A, double %6, i64 0
6014 define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6015 ; X86-LABEL: test_mm_mask3_fnmadd_sd:
6016 ; X86: # %bb.0: # %entry
6017 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6018 ; X86-NEXT: kmovw %eax, %k1
6019 ; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
6020 ; X86-NEXT: vmovapd %xmm2, %xmm0
6023 ; X64-LABEL: test_mm_mask3_fnmadd_sd:
6024 ; X64: # %bb.0: # %entry
6025 ; X64-NEXT: kmovw %edi, %k1
6026 ; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
6027 ; X64-NEXT: vmovapd %xmm2, %xmm0
6030 %0 = extractelement <2 x double> %__W, i64 0
6031 %.rhs.i = extractelement <2 x double> %__X, i64 0
6032 %1 = fsub double -0.000000e+00, %.rhs.i
6033 %2 = extractelement <2 x double> %__Y, i64 0
6034 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6036 %tobool.i = icmp eq i8 %4, 0
6037 %vecext1.i = extractelement <2 x double> %__Y, i32 0
6038 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3
6039 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6040 ret <2 x double> %vecins.i
6043 define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6044 ; X86-LABEL: test_mm_mask3_fnmadd_round_sd:
6045 ; X86: # %bb.0: # %entry
6046 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6047 ; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1
6048 ; X86-NEXT: kmovw %eax, %k1
6049 ; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6050 ; X86-NEXT: vmovapd %xmm2, %xmm0
6053 ; X64-LABEL: test_mm_mask3_fnmadd_round_sd:
6054 ; X64: # %bb.0: # %entry
6055 ; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
6056 ; X64-NEXT: kmovw %edi, %k1
6057 ; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6058 ; X64-NEXT: vmovapd %xmm2, %xmm0
6061 %0 = extractelement <2 x double> %__W, i64 0
6062 %.rhs = extractelement <2 x double> %__X, i64 0
6063 %1 = fsub double -0.000000e+00, %.rhs
6064 %2 = extractelement <2 x double> %__Y, i64 0
6065 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6066 %4 = bitcast i8 %__U to <8 x i1>
6067 %5 = extractelement <8 x i1> %4, i64 0
6068 %6 = select i1 %5, double %3, double %2
6069 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6073 define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6074 ; X86-LABEL: test_mm_mask_fnmsub_sd:
6075 ; X86: # %bb.0: # %entry
6076 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6077 ; X86-NEXT: kmovw %eax, %k1
6078 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6081 ; X64-LABEL: test_mm_mask_fnmsub_sd:
6082 ; X64: # %bb.0: # %entry
6083 ; X64-NEXT: kmovw %edi, %k1
6084 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6087 %0 = extractelement <2 x double> %__W, i64 0
6088 %.rhs.i = extractelement <2 x double> %__A, i64 0
6089 %1 = fsub double -0.000000e+00, %.rhs.i
6090 %.rhs7.i = extractelement <2 x double> %__B, i64 0
6091 %2 = fsub double -0.000000e+00, %.rhs7.i
6092 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6094 %tobool.i = icmp eq i8 %4, 0
6095 %vecext2.i = extractelement <2 x double> %__W, i32 0
6096 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6097 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0
6098 ret <2 x double> %vecins.i
6101 define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
6102 ; X86-LABEL: test_mm_mask_fnmsub_round_sd:
6103 ; X86: # %bb.0: # %entry
6104 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6105 ; X86-NEXT: kmovw %eax, %k1
6106 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6109 ; X64-LABEL: test_mm_mask_fnmsub_round_sd:
6110 ; X64: # %bb.0: # %entry
6111 ; X64-NEXT: kmovw %edi, %k1
6112 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
6115 %0 = extractelement <2 x double> %__W, i64 0
6116 %.rhs = extractelement <2 x double> %__A, i64 0
6117 %1 = fsub double -0.000000e+00, %.rhs
6118 %.rhs2 = extractelement <2 x double> %__B, i64 0
6119 %2 = fsub double -0.000000e+00, %.rhs2
6120 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6121 %4 = bitcast i8 %__U to <8 x i1>
6122 %5 = extractelement <8 x i1> %4, i64 0
6123 %6 = select i1 %5, double %3, double %0
6124 %7 = insertelement <2 x double> %__W, double %6, i64 0
6128 define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6129 ; X86-LABEL: test_mm_maskz_fnmsub_sd:
6130 ; X86: # %bb.0: # %entry
6131 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6132 ; X86-NEXT: kmovw %eax, %k1
6133 ; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6136 ; X64-LABEL: test_mm_maskz_fnmsub_sd:
6137 ; X64: # %bb.0: # %entry
6138 ; X64-NEXT: kmovw %edi, %k1
6139 ; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
6142 %0 = extractelement <2 x double> %__A, i64 0
6143 %.rhs.i = extractelement <2 x double> %__B, i64 0
6144 %1 = fsub double -0.000000e+00, %.rhs.i
6145 %.rhs5.i = extractelement <2 x double> %__C, i64 0
6146 %2 = fsub double -0.000000e+00, %.rhs5.i
6147 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6149 %tobool.i = icmp eq i8 %4, 0
6150 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3
6151 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
6152 ret <2 x double> %vecins.i
6155 define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) {
6156 ; X86-LABEL: test_mm_maskz_fnmsub_round_sd:
6157 ; X86: # %bb.0: # %entry
6158 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6159 ; X86-NEXT: kmovw %eax, %k1
6160 ; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6163 ; X64-LABEL: test_mm_maskz_fnmsub_round_sd:
6164 ; X64: # %bb.0: # %entry
6165 ; X64-NEXT: kmovw %edi, %k1
6166 ; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
6169 %0 = extractelement <2 x double> %__A, i64 0
6170 %.rhs = extractelement <2 x double> %__B, i64 0
6171 %1 = fsub double -0.000000e+00, %.rhs
6172 %.rhs2 = extractelement <2 x double> %__C, i64 0
6173 %2 = fsub double -0.000000e+00, %.rhs2
6174 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6175 %4 = bitcast i8 %__U to <8 x i1>
6176 %5 = extractelement <8 x i1> %4, i64 0
6177 %6 = select i1 %5, double %3, double 0.000000e+00
6178 %7 = insertelement <2 x double> %__A, double %6, i64 0
6182 define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6183 ; X86-LABEL: test_mm_mask3_fnmsub_sd:
6184 ; X86: # %bb.0: # %entry
6185 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6186 ; X86-NEXT: kmovw %eax, %k1
6187 ; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
6188 ; X86-NEXT: vmovapd %xmm2, %xmm0
6191 ; X64-LABEL: test_mm_mask3_fnmsub_sd:
6192 ; X64: # %bb.0: # %entry
6193 ; X64-NEXT: kmovw %edi, %k1
6194 ; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2
6195 ; X64-NEXT: vmovapd %xmm2, %xmm0
6198 %0 = extractelement <2 x double> %__W, i64 0
6199 %.rhs.i = extractelement <2 x double> %__X, i64 0
6200 %1 = fsub double -0.000000e+00, %.rhs.i
6201 %.rhs7.i = extractelement <2 x double> %__Y, i64 0
6202 %2 = fsub double -0.000000e+00, %.rhs7.i
6203 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10
6205 %tobool.i = icmp eq i8 %4, 0
6206 %vecext2.i = extractelement <2 x double> %__Y, i32 0
6207 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3
6208 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0
6209 ret <2 x double> %vecins.i
6212 define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) {
6213 ; X86-LABEL: test_mm_mask3_fnmsub_round_sd:
6214 ; X86: # %bb.0: # %entry
6215 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6216 ; X86-NEXT: kmovw %eax, %k1
6217 ; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6218 ; X86-NEXT: vmovapd %xmm2, %xmm0
6221 ; X64-LABEL: test_mm_mask3_fnmsub_round_sd:
6222 ; X64: # %bb.0: # %entry
6223 ; X64-NEXT: kmovw %edi, %k1
6224 ; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
6225 ; X64-NEXT: vmovapd %xmm2, %xmm0
6228 %0 = extractelement <2 x double> %__W, i64 0
6229 %.rhs = extractelement <2 x double> %__X, i64 0
6230 %1 = fsub double -0.000000e+00, %.rhs
6231 %.rhs1 = extractelement <2 x double> %__Y, i64 0
6232 %2 = fsub double -0.000000e+00, %.rhs1
6233 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8)
6234 %4 = bitcast i8 %__U to <8 x i1>
6235 %5 = extractelement <8 x i1> %4, i64 0
6236 %6 = select i1 %5, double %3, double %.rhs1
6237 %7 = insertelement <2 x double> %__Y, double %6, i64 0
6241 define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
6242 ; X86-LABEL: test_mm512_mask_expandloadu_epi64:
6243 ; X86: # %bb.0: # %entry
6244 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6245 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6246 ; X86-NEXT: kmovw %ecx, %k1
6247 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
6250 ; X64-LABEL: test_mm512_mask_expandloadu_epi64:
6251 ; X64: # %bb.0: # %entry
6252 ; X64-NEXT: kmovw %edi, %k1
6253 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
6256 %0 = bitcast i8* %__P to i64*
6257 %1 = bitcast i8 %__U to <8 x i1>
6258 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
6262 define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
6263 ; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
6264 ; X86: # %bb.0: # %entry
6265 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6266 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6267 ; X86-NEXT: kmovw %ecx, %k1
6268 ; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
6271 ; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
6272 ; X64: # %bb.0: # %entry
6273 ; X64-NEXT: kmovw %edi, %k1
6274 ; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
6277 %0 = bitcast i8* %__P to i64*
6278 %1 = bitcast i8 %__U to <8 x i1>
6279 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
6283 define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
6284 ; X86-LABEL: test_mm512_mask_expandloadu_pd:
6285 ; X86: # %bb.0: # %entry
6286 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6287 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6288 ; X86-NEXT: kmovw %ecx, %k1
6289 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
6292 ; X64-LABEL: test_mm512_mask_expandloadu_pd:
6293 ; X64: # %bb.0: # %entry
6294 ; X64-NEXT: kmovw %edi, %k1
6295 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
6298 %0 = bitcast i8* %__P to double*
6299 %1 = bitcast i8 %__U to <8 x i1>
6300 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
6304 define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
6305 ; X86-LABEL: test_mm512_maskz_expandloadu_pd:
6306 ; X86: # %bb.0: # %entry
6307 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6308 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
6309 ; X86-NEXT: kmovw %ecx, %k1
6310 ; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
6313 ; X64-LABEL: test_mm512_maskz_expandloadu_pd:
6314 ; X64: # %bb.0: # %entry
6315 ; X64-NEXT: kmovw %edi, %k1
6316 ; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
6319 %0 = bitcast i8* %__P to double*
6320 %1 = bitcast i8 %__U to <8 x i1>
6321 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
6325 define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
6326 ; X86-LABEL: test_mm512_mask_expandloadu_epi32:
6327 ; X86: # %bb.0: # %entry
6328 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6329 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6330 ; X86-NEXT: kmovw %ecx, %k1
6331 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
6334 ; X64-LABEL: test_mm512_mask_expandloadu_epi32:
6335 ; X64: # %bb.0: # %entry
6336 ; X64-NEXT: kmovw %edi, %k1
6337 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
6340 %0 = bitcast <8 x i64> %__W to <16 x i32>
6341 %1 = bitcast i8* %__P to i32*
6342 %2 = bitcast i16 %__U to <16 x i1>
6343 %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
6344 %4 = bitcast <16 x i32> %3 to <8 x i64>
6348 define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
6349 ; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
6350 ; X86: # %bb.0: # %entry
6351 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6352 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6353 ; X86-NEXT: kmovw %ecx, %k1
6354 ; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
6357 ; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
6358 ; X64: # %bb.0: # %entry
6359 ; X64-NEXT: kmovw %edi, %k1
6360 ; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
6363 %0 = bitcast i8* %__P to i32*
6364 %1 = bitcast i16 %__U to <16 x i1>
6365 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
6366 %3 = bitcast <16 x i32> %2 to <8 x i64>
6370 define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
6371 ; X86-LABEL: test_mm512_mask_expandloadu_ps:
6372 ; X86: # %bb.0: # %entry
6373 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6374 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6375 ; X86-NEXT: kmovw %ecx, %k1
6376 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
6379 ; X64-LABEL: test_mm512_mask_expandloadu_ps:
6380 ; X64: # %bb.0: # %entry
6381 ; X64-NEXT: kmovw %edi, %k1
6382 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
6385 %0 = bitcast i8* %__P to float*
6386 %1 = bitcast i16 %__U to <16 x i1>
6387 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
6391 define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
6392 ; X86-LABEL: test_mm512_maskz_expandloadu_ps:
6393 ; X86: # %bb.0: # %entry
6394 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
6395 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
6396 ; X86-NEXT: kmovw %ecx, %k1
6397 ; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
6400 ; X64-LABEL: test_mm512_maskz_expandloadu_ps:
6401 ; X64: # %bb.0: # %entry
6402 ; X64-NEXT: kmovw %edi, %k1
6403 ; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
6406 %0 = bitcast i8* %__P to float*
6407 %1 = bitcast i16 %__U to <16 x i1>
6408 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
6412 define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
6413 ; X86-LABEL: test_mm512_mask_compressstoreu_pd:
6414 ; X86: # %bb.0: # %entry
6415 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6416 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6417 ; X86-NEXT: kmovw %eax, %k1
6418 ; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
6419 ; X86-NEXT: vzeroupper
6422 ; X64-LABEL: test_mm512_mask_compressstoreu_pd:
6423 ; X64: # %bb.0: # %entry
6424 ; X64-NEXT: kmovw %esi, %k1
6425 ; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
6426 ; X64-NEXT: vzeroupper
6429 %0 = bitcast i8* %__P to double*
6430 %1 = bitcast i8 %__U to <8 x i1>
6431 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
6435 define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
6436 ; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
6437 ; X86: # %bb.0: # %entry
6438 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6439 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6440 ; X86-NEXT: kmovw %eax, %k1
6441 ; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
6442 ; X86-NEXT: vzeroupper
6445 ; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
6446 ; X64: # %bb.0: # %entry
6447 ; X64-NEXT: kmovw %esi, %k1
6448 ; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
6449 ; X64-NEXT: vzeroupper
6452 %0 = bitcast i8* %__P to i64*
6453 %1 = bitcast i8 %__U to <8 x i1>
6454 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
6458 define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
6459 ; X86-LABEL: test_mm512_mask_compressstoreu_ps:
6460 ; X86: # %bb.0: # %entry
6461 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6462 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6463 ; X86-NEXT: kmovw %eax, %k1
6464 ; X86-NEXT: vcompressps %zmm0, (%ecx) {%k1}
6465 ; X86-NEXT: vzeroupper
6468 ; X64-LABEL: test_mm512_mask_compressstoreu_ps:
6469 ; X64: # %bb.0: # %entry
6470 ; X64-NEXT: kmovw %esi, %k1
6471 ; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
6472 ; X64-NEXT: vzeroupper
6475 %0 = bitcast i8* %__P to float*
6476 %1 = bitcast i16 %__U to <16 x i1>
6477 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
6481 define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
6482 ; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
6483 ; X86: # %bb.0: # %entry
6484 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
6485 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
6486 ; X86-NEXT: kmovw %eax, %k1
6487 ; X86-NEXT: vpcompressd %zmm0, (%ecx) {%k1}
6488 ; X86-NEXT: vzeroupper
6491 ; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
6492 ; X64: # %bb.0: # %entry
6493 ; X64-NEXT: kmovw %esi, %k1
6494 ; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
6495 ; X64-NEXT: vzeroupper
6498 %0 = bitcast <8 x i64> %__A to <16 x i32>
6499 %1 = bitcast i8* %__P to i32*
6500 %2 = bitcast i16 %__U to <16 x i1>
6501 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
6505 define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) {
6506 ; X86-LABEL: test_mm512_reduce_add_epi64:
6507 ; X86: # %bb.0: # %entry
6508 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6509 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6510 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6511 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6512 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6513 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6514 ; X86-NEXT: vmovd %xmm0, %eax
6515 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6516 ; X86-NEXT: vzeroupper
6519 ; X64-LABEL: test_mm512_reduce_add_epi64:
6520 ; X64: # %bb.0: # %entry
6521 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6522 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6523 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6524 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6525 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6526 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6527 ; X64-NEXT: vmovq %xmm0, %rax
6528 ; X64-NEXT: vzeroupper
6531 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6532 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6533 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6534 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6535 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6536 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6537 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6538 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6539 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
6543 define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) {
6544 ; X86-LABEL: test_mm512_reduce_mul_epi64:
6545 ; X86: # %bb.0: # %entry
6546 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6547 ; X86-NEXT: vpsrlq $32, %ymm0, %ymm2
6548 ; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6549 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm3
6550 ; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6551 ; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6552 ; X86-NEXT: vpsllq $32, %ymm2, %ymm2
6553 ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6554 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6555 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6556 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6557 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6558 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6559 ; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6560 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6561 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6562 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6563 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6564 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6565 ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6566 ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6567 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
6568 ; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6569 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6570 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6571 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6572 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6573 ; X86-NEXT: vmovd %xmm0, %eax
6574 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6575 ; X86-NEXT: vzeroupper
6578 ; X64-LABEL: test_mm512_reduce_mul_epi64:
6579 ; X64: # %bb.0: # %entry
6580 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6581 ; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
6582 ; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
6583 ; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
6584 ; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
6585 ; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6586 ; X64-NEXT: vpsllq $32, %ymm2, %ymm2
6587 ; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
6588 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6589 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6590 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6591 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6592 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6593 ; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6594 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6595 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6596 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6597 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6598 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6599 ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6600 ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6601 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
6602 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6603 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6604 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6605 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6606 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6607 ; X64-NEXT: vmovq %xmm0, %rax
6608 ; X64-NEXT: vzeroupper
6611 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6612 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6613 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
6614 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6615 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6616 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
6617 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6618 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
6619 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
6623 define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
6624 ; X86-LABEL: test_mm512_reduce_or_epi64:
6625 ; X86: # %bb.0: # %entry
6626 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6627 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6628 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6629 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6630 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6631 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6632 ; X86-NEXT: vmovd %xmm0, %eax
6633 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6634 ; X86-NEXT: vzeroupper
6637 ; X64-LABEL: test_mm512_reduce_or_epi64:
6638 ; X64: # %bb.0: # %entry
6639 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6640 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6641 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6642 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6643 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6644 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6645 ; X64-NEXT: vmovq %xmm0, %rax
6646 ; X64-NEXT: vzeroupper
6649 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6650 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6651 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
6652 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6653 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6654 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
6655 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6656 %or7.i = or <2 x i64> %shuffle6.i, %or4.i
6657 %vecext.i = extractelement <2 x i64> %or7.i, i32 0
6661 define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
6662 ; X86-LABEL: test_mm512_reduce_and_epi64:
6663 ; X86: # %bb.0: # %entry
6664 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6665 ; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
6666 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6667 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
6668 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6669 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
6670 ; X86-NEXT: vmovd %xmm0, %eax
6671 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6672 ; X86-NEXT: vzeroupper
6675 ; X64-LABEL: test_mm512_reduce_and_epi64:
6676 ; X64: # %bb.0: # %entry
6677 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6678 ; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
6679 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6680 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
6681 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6682 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
6683 ; X64-NEXT: vmovq %xmm0, %rax
6684 ; X64-NEXT: vzeroupper
6687 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6688 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6689 %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
6690 %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6691 %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6692 %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
6693 %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6694 %and7.i = and <2 x i64> %shuffle6.i, %and4.i
6695 %vecext.i = extractelement <2 x i64> %and7.i, i32 0
6699 define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6700 ; X86-LABEL: test_mm512_mask_reduce_add_epi64:
6701 ; X86: # %bb.0: # %entry
6702 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6703 ; X86-NEXT: kmovw %eax, %k1
6704 ; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6705 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6706 ; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6707 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6708 ; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6709 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6710 ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6711 ; X86-NEXT: vmovd %xmm0, %eax
6712 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6713 ; X86-NEXT: vzeroupper
6716 ; X64-LABEL: test_mm512_mask_reduce_add_epi64:
6717 ; X64: # %bb.0: # %entry
6718 ; X64-NEXT: kmovw %edi, %k1
6719 ; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6720 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6721 ; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
6722 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6723 ; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0
6724 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6725 ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
6726 ; X64-NEXT: vmovq %xmm0, %rax
6727 ; X64-NEXT: vzeroupper
6730 %0 = bitcast i8 %__M to <8 x i1>
6731 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
6732 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6733 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6734 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
6735 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6736 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6737 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
6738 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6739 %add7.i = add <2 x i64> %shuffle6.i, %add4.i
6740 %vecext.i = extractelement <2 x i64> %add7.i, i32 0
6744 define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6745 ; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
6746 ; X86: # %bb.0: # %entry
6747 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6748 ; X86-NEXT: kmovw %eax, %k1
6749 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
6750 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6751 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6752 ; X86-NEXT: vpsrlq $32, %ymm1, %ymm2
6753 ; X86-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
6754 ; X86-NEXT: vpsrlq $32, %ymm0, %ymm3
6755 ; X86-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
6756 ; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6757 ; X86-NEXT: vpsllq $32, %ymm2, %ymm2
6758 ; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
6759 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6760 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6761 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm2
6762 ; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6763 ; X86-NEXT: vpsrlq $32, %xmm1, %xmm3
6764 ; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6765 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6766 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6767 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6768 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6769 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6770 ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6771 ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6772 ; X86-NEXT: vpsrlq $32, %xmm0, %xmm3
6773 ; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6774 ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6775 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2
6776 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6777 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6778 ; X86-NEXT: vmovd %xmm0, %eax
6779 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6780 ; X86-NEXT: vzeroupper
6783 ; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
6784 ; X64: # %bb.0: # %entry
6785 ; X64-NEXT: kmovw %edi, %k1
6786 ; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
6787 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6788 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6789 ; X64-NEXT: vpsrlq $32, %ymm1, %ymm2
6790 ; X64-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
6791 ; X64-NEXT: vpsrlq $32, %ymm0, %ymm3
6792 ; X64-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
6793 ; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
6794 ; X64-NEXT: vpsllq $32, %ymm2, %ymm2
6795 ; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0
6796 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
6797 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6798 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm2
6799 ; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
6800 ; X64-NEXT: vpsrlq $32, %xmm1, %xmm3
6801 ; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
6802 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6803 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6804 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
6805 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6806 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6807 ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6808 ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
6809 ; X64-NEXT: vpsrlq $32, %xmm0, %xmm3
6810 ; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
6811 ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2
6812 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2
6813 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
6814 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0
6815 ; X64-NEXT: vmovq %xmm0, %rax
6816 ; X64-NEXT: vzeroupper
6819 %0 = bitcast i8 %__M to <8 x i1>
6820 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
6821 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6822 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6823 %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
6824 %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6825 %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6826 %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
6827 %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6828 %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
6829 %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
6833 define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6834 ; X86-LABEL: test_mm512_mask_reduce_and_epi64:
6835 ; X86: # %bb.0: # %entry
6836 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6837 ; X86-NEXT: kmovw %eax, %k1
6838 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
6839 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6840 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6841 ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
6842 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6843 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
6844 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6845 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
6846 ; X86-NEXT: vmovd %xmm0, %eax
6847 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6848 ; X86-NEXT: vzeroupper
6851 ; X64-LABEL: test_mm512_mask_reduce_and_epi64:
6852 ; X64: # %bb.0: # %entry
6853 ; X64-NEXT: kmovw %edi, %k1
6854 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
6855 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
6856 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
6857 ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
6858 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6859 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
6860 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6861 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
6862 ; X64-NEXT: vmovq %xmm0, %rax
6863 ; X64-NEXT: vzeroupper
6866 %0 = bitcast i8 %__M to <8 x i1>
6867 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
6868 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6869 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6870 %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
6871 %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6872 %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6873 %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
6874 %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6875 %and7.i = and <2 x i64> %shuffle6.i, %and4.i
6876 %vecext.i = extractelement <2 x i64> %and7.i, i32 0
6880 define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
6881 ; X86-LABEL: test_mm512_mask_reduce_or_epi64:
6882 ; X86: # %bb.0: # %entry
6883 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
6884 ; X86-NEXT: kmovw %eax, %k1
6885 ; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6886 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6887 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
6888 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
6889 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
6890 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6891 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
6892 ; X86-NEXT: vmovd %xmm0, %eax
6893 ; X86-NEXT: vpextrd $1, %xmm0, %edx
6894 ; X86-NEXT: vzeroupper
6897 ; X64-LABEL: test_mm512_mask_reduce_or_epi64:
6898 ; X64: # %bb.0: # %entry
6899 ; X64-NEXT: kmovw %edi, %k1
6900 ; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
6901 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6902 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
6903 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
6904 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
6905 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6906 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
6907 ; X64-NEXT: vmovq %xmm0, %rax
6908 ; X64-NEXT: vzeroupper
6911 %0 = bitcast i8 %__M to <8 x i1>
6912 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
6913 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6914 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6915 %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
6916 %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6917 %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6918 %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
6919 %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
6920 %or7.i = or <2 x i64> %shuffle6.i, %or4.i
6921 %vecext.i = extractelement <2 x i64> %or7.i, i32 0
6925 define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
6926 ; CHECK-LABEL: test_mm512_reduce_add_epi32:
6927 ; CHECK: # %bb.0: # %entry
6928 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6929 ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
6930 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
6931 ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
6932 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6933 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
6934 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6935 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
6936 ; CHECK-NEXT: vmovd %xmm0, %eax
6937 ; CHECK-NEXT: vzeroupper
6938 ; CHECK-NEXT: ret{{[l|q]}}
6940 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6941 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
6942 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6943 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
6944 %add.i = add <8 x i32> %0, %1
6945 %2 = bitcast <8 x i32> %add.i to <4 x i64>
6946 %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6947 %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
6948 %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6949 %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
6950 %add5.i = add <4 x i32> %3, %4
6951 %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
6952 %add6.i = add <4 x i32> %shuffle.i, %add5.i
6953 %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6954 %add8.i = add <4 x i32> %shuffle7.i, %add6.i
6955 %vecext.i = extractelement <4 x i32> %add8.i, i32 0
6959 define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
6960 ; CHECK-LABEL: test_mm512_reduce_mul_epi32:
6961 ; CHECK: # %bb.0: # %entry
6962 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6963 ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
6964 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
6965 ; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
6966 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
6967 ; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6968 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
6969 ; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
6970 ; CHECK-NEXT: vmovd %xmm0, %eax
6971 ; CHECK-NEXT: vzeroupper
6972 ; CHECK-NEXT: ret{{[l|q]}}
6974 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
6975 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
6976 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6977 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
6978 %mul.i = mul <8 x i32> %0, %1
6979 %2 = bitcast <8 x i32> %mul.i to <4 x i64>
6980 %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
6981 %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
6982 %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
6983 %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
6984 %mul5.i = mul <4 x i32> %3, %4
6985 %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
6986 %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
6987 %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
6988 %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
6989 %vecext.i = extractelement <4 x i32> %mul8.i, i32 0
6993 define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
6994 ; CHECK-LABEL: test_mm512_reduce_or_epi32:
6995 ; CHECK: # %bb.0: # %entry
6996 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
6997 ; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
6998 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
6999 ; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
7000 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7001 ; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
7002 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7003 ; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
7004 ; CHECK-NEXT: vmovd %xmm0, %eax
7005 ; CHECK-NEXT: vzeroupper
7006 ; CHECK-NEXT: ret{{[l|q]}}
7008 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7009 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7010 %or25.i = or <4 x i64> %extract.i, %extract2.i
7011 %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7012 %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7013 %or526.i = or <2 x i64> %extract3.i, %extract4.i
7014 %or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
7015 %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7016 %or6.i = or <4 x i32> %shuffle.i, %or5.i
7017 %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7018 %or8.i = or <4 x i32> %shuffle7.i, %or6.i
7019 %vecext.i = extractelement <4 x i32> %or8.i, i32 0
7023 define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
7024 ; CHECK-LABEL: test_mm512_reduce_and_epi32:
7025 ; CHECK: # %bb.0: # %entry
7026 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7027 ; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
7028 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
7029 ; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
7030 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7031 ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
7032 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7033 ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
7034 ; CHECK-NEXT: vmovd %xmm0, %eax
7035 ; CHECK-NEXT: vzeroupper
7036 ; CHECK-NEXT: ret{{[l|q]}}
7038 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7039 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7040 %and25.i = and <4 x i64> %extract.i, %extract2.i
7041 %extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7042 %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7043 %and526.i = and <2 x i64> %extract3.i, %extract4.i
7044 %and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
7045 %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7046 %and6.i = and <4 x i32> %shuffle.i, %and5.i
7047 %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7048 %and8.i = and <4 x i32> %shuffle7.i, %and6.i
7049 %vecext.i = extractelement <4 x i32> %and8.i, i32 0
7053 define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
7054 ; X86-LABEL: test_mm512_mask_reduce_add_epi32:
7055 ; X86: # %bb.0: # %entry
7056 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
7057 ; X86-NEXT: kmovw %eax, %k1
7058 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
7059 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7060 ; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
7061 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
7062 ; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
7063 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7064 ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
7065 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7066 ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
7067 ; X86-NEXT: vmovd %xmm0, %eax
7068 ; X86-NEXT: vzeroupper
7071 ; X64-LABEL: test_mm512_mask_reduce_add_epi32:
7072 ; X64: # %bb.0: # %entry
7073 ; X64-NEXT: kmovw %edi, %k1
7074 ; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
7075 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7076 ; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
7077 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
7078 ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
7079 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7080 ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
7081 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7082 ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
7083 ; X64-NEXT: vmovd %xmm0, %eax
7084 ; X64-NEXT: vzeroupper
7087 %0 = bitcast <8 x i64> %__W to <16 x i32>
7088 %1 = bitcast i16 %__M to <16 x i1>
7089 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
7090 %3 = bitcast <16 x i32> %2 to <8 x i64>
7091 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7092 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
7093 %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7094 %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
7095 %add.i = add <8 x i32> %4, %5
7096 %6 = bitcast <8 x i32> %add.i to <4 x i64>
7097 %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7098 %7 = bitcast <2 x i64> %extract4.i to <4 x i32>
7099 %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7100 %8 = bitcast <2 x i64> %extract5.i to <4 x i32>
7101 %add6.i = add <4 x i32> %7, %8
7102 %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7103 %add7.i = add <4 x i32> %shuffle.i, %add6.i
7104 %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7105 %add9.i = add <4 x i32> %shuffle8.i, %add7.i
7106 %vecext.i = extractelement <4 x i32> %add9.i, i32 0
7110 define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
7111 ; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
7112 ; X86: # %bb.0: # %entry
7113 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
7114 ; X86-NEXT: kmovw %eax, %k1
7115 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
7116 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7117 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7118 ; X86-NEXT: vpmulld %ymm0, %ymm1, %ymm0
7119 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
7120 ; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
7121 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7122 ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
7123 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7124 ; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
7125 ; X86-NEXT: vmovd %xmm0, %eax
7126 ; X86-NEXT: vzeroupper
7129 ; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
7130 ; X64: # %bb.0: # %entry
7131 ; X64-NEXT: kmovw %edi, %k1
7132 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
7133 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7134 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7135 ; X64-NEXT: vpmulld %ymm0, %ymm1, %ymm0
7136 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
7137 ; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
7138 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7139 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
7140 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7141 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
7142 ; X64-NEXT: vmovd %xmm0, %eax
7143 ; X64-NEXT: vzeroupper
7146 %0 = bitcast <8 x i64> %__W to <16 x i32>
7147 %1 = bitcast i16 %__M to <16 x i1>
7148 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
7149 %3 = bitcast <16 x i32> %2 to <8 x i64>
7150 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7151 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
7152 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7153 %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
7154 %mul.i = mul <8 x i32> %4, %5
7155 %6 = bitcast <8 x i32> %mul.i to <4 x i64>
7156 %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7157 %7 = bitcast <2 x i64> %extract5.i to <4 x i32>
7158 %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7159 %8 = bitcast <2 x i64> %extract6.i to <4 x i32>
7160 %mul7.i = mul <4 x i32> %7, %8
7161 %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7162 %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
7163 %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7164 %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
7165 %vecext.i = extractelement <4 x i32> %mul10.i, i32 0
7169 define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
7170 ; X86-LABEL: test_mm512_mask_reduce_and_epi32:
7171 ; X86: # %bb.0: # %entry
7172 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
7173 ; X86-NEXT: kmovw %eax, %k1
7174 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
7175 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7176 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7177 ; X86-NEXT: vpand %ymm0, %ymm1, %ymm0
7178 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
7179 ; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
7180 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7181 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
7182 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7183 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
7184 ; X86-NEXT: vmovd %xmm0, %eax
7185 ; X86-NEXT: vzeroupper
7188 ; X64-LABEL: test_mm512_mask_reduce_and_epi32:
7189 ; X64: # %bb.0: # %entry
7190 ; X64-NEXT: kmovw %edi, %k1
7191 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
7192 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
7193 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7194 ; X64-NEXT: vpand %ymm0, %ymm1, %ymm0
7195 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
7196 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
7197 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7198 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
7199 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7200 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
7201 ; X64-NEXT: vmovd %xmm0, %eax
7202 ; X64-NEXT: vzeroupper
7205 %0 = bitcast <8 x i64> %__W to <16 x i32>
7206 %1 = bitcast i16 %__M to <16 x i1>
7207 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
7208 %3 = bitcast <16 x i32> %2 to <8 x i64>
7209 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7210 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7211 %and28.i = and <4 x i64> %extract.i, %extract4.i
7212 %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7213 %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7214 %and729.i = and <2 x i64> %extract5.i, %extract6.i
7215 %and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
7216 %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7217 %and8.i = and <4 x i32> %shuffle.i, %and7.i
7218 %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7219 %and10.i = and <4 x i32> %shuffle9.i, %and8.i
7220 %vecext.i = extractelement <4 x i32> %and10.i, i32 0
7224 define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
7225 ; X86-LABEL: test_mm512_mask_reduce_or_epi32:
7226 ; X86: # %bb.0: # %entry
7227 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
7228 ; X86-NEXT: kmovw %eax, %k1
7229 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
7230 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7231 ; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
7232 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
7233 ; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
7234 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7235 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
7236 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7237 ; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
7238 ; X86-NEXT: vmovd %xmm0, %eax
7239 ; X86-NEXT: vzeroupper
7242 ; X64-LABEL: test_mm512_mask_reduce_or_epi32:
7243 ; X64: # %bb.0: # %entry
7244 ; X64-NEXT: kmovw %edi, %k1
7245 ; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
7246 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
7247 ; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
7248 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
7249 ; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
7250 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
7251 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
7252 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
7253 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
7254 ; X64-NEXT: vmovd %xmm0, %eax
7255 ; X64-NEXT: vzeroupper
7258 %0 = bitcast <8 x i64> %__W to <16 x i32>
7259 %1 = bitcast i16 %__M to <16 x i1>
7260 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
7261 %3 = bitcast <16 x i32> %2 to <8 x i64>
7262 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7263 %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7264 %or27.i = or <4 x i64> %extract.i, %extract3.i
7265 %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
7266 %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
7267 %or628.i = or <2 x i64> %extract4.i, %extract5.i
7268 %or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
7269 %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7270 %or7.i = or <4 x i32> %shuffle.i, %or6.i
7271 %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
7272 %or9.i = or <4 x i32> %shuffle8.i, %or7.i
7273 %vecext.i = extractelement <4 x i32> %or9.i, i32 0
7277 define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
7278 ; X86-LABEL: test_mm512_reduce_add_pd:
7279 ; X86: # %bb.0: # %entry
7280 ; X86-NEXT: pushl %ebp
7281 ; X86-NEXT: .cfi_def_cfa_offset 8
7282 ; X86-NEXT: .cfi_offset %ebp, -8
7283 ; X86-NEXT: movl %esp, %ebp
7284 ; X86-NEXT: .cfi_def_cfa_register %ebp
7285 ; X86-NEXT: andl $-8, %esp
7286 ; X86-NEXT: subl $8, %esp
7287 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7288 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7289 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7290 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7291 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7292 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7293 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7294 ; X86-NEXT: fldl (%esp)
7295 ; X86-NEXT: movl %ebp, %esp
7296 ; X86-NEXT: popl %ebp
7297 ; X86-NEXT: .cfi_def_cfa %esp, 4
7298 ; X86-NEXT: vzeroupper
7301 ; X64-LABEL: test_mm512_reduce_add_pd:
7302 ; X64: # %bb.0: # %entry
7303 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7304 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7305 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7306 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7307 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7308 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7309 ; X64-NEXT: vzeroupper
7312 %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7313 %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7314 %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
7315 %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7316 %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7317 %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
7318 %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7319 %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
7320 %vecext.i = extractelement <2 x double> %add7.i, i32 0
7321 ret double %vecext.i
7324 define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
7325 ; X86-LABEL: test_mm512_reduce_mul_pd:
7326 ; X86: # %bb.0: # %entry
7327 ; X86-NEXT: pushl %ebp
7328 ; X86-NEXT: .cfi_def_cfa_offset 8
7329 ; X86-NEXT: .cfi_offset %ebp, -8
7330 ; X86-NEXT: movl %esp, %ebp
7331 ; X86-NEXT: .cfi_def_cfa_register %ebp
7332 ; X86-NEXT: andl $-8, %esp
7333 ; X86-NEXT: subl $8, %esp
7334 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7335 ; X86-NEXT: vmulpd %ymm1, %ymm0, %ymm0
7336 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7337 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7338 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7339 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7340 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7341 ; X86-NEXT: fldl (%esp)
7342 ; X86-NEXT: movl %ebp, %esp
7343 ; X86-NEXT: popl %ebp
7344 ; X86-NEXT: .cfi_def_cfa %esp, 4
7345 ; X86-NEXT: vzeroupper
7348 ; X64-LABEL: test_mm512_reduce_mul_pd:
7349 ; X64: # %bb.0: # %entry
7350 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7351 ; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
7352 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7353 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7354 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7355 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7356 ; X64-NEXT: vzeroupper
7359 %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7360 %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7361 %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
7362 %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7363 %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7364 %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
7365 %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7366 %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
7367 %vecext.i = extractelement <2 x double> %mul7.i, i32 0
7368 ret double %vecext.i
7371 define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
7372 ; X86-LABEL: test_mm512_reduce_add_ps:
7373 ; X86: # %bb.0: # %entry
7374 ; X86-NEXT: pushl %eax
7375 ; X86-NEXT: .cfi_def_cfa_offset 8
7376 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7377 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
7378 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7379 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7380 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7381 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7382 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7383 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7384 ; X86-NEXT: vmovss %xmm0, (%esp)
7385 ; X86-NEXT: flds (%esp)
7386 ; X86-NEXT: popl %eax
7387 ; X86-NEXT: .cfi_def_cfa_offset 4
7388 ; X86-NEXT: vzeroupper
7391 ; X64-LABEL: test_mm512_reduce_add_ps:
7392 ; X64: # %bb.0: # %entry
7393 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7394 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
7395 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7396 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7397 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7398 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7399 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7400 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7401 ; X64-NEXT: vzeroupper
7404 %0 = bitcast <16 x float> %__W to <8 x double>
7405 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7406 %1 = bitcast <4 x double> %extract.i to <8 x float>
7407 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7408 %2 = bitcast <4 x double> %extract2.i to <8 x float>
7409 %add.i = fadd <8 x float> %1, %2
7410 %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7411 %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7412 %add5.i = fadd <4 x float> %extract3.i, %extract4.i
7413 %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7414 %add6.i = fadd <4 x float> %add5.i, %shuffle.i
7415 %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
7416 %add8.i = fadd <4 x float> %add6.i, %shuffle7.i
7417 %vecext.i = extractelement <4 x float> %add8.i, i32 0
7421 define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
7422 ; X86-LABEL: test_mm512_reduce_mul_ps:
7423 ; X86: # %bb.0: # %entry
7424 ; X86-NEXT: pushl %eax
7425 ; X86-NEXT: .cfi_def_cfa_offset 8
7426 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7427 ; X86-NEXT: vmulps %ymm1, %ymm0, %ymm0
7428 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7429 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7430 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7431 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7432 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7433 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7434 ; X86-NEXT: vmovss %xmm0, (%esp)
7435 ; X86-NEXT: flds (%esp)
7436 ; X86-NEXT: popl %eax
7437 ; X86-NEXT: .cfi_def_cfa_offset 4
7438 ; X86-NEXT: vzeroupper
7441 ; X64-LABEL: test_mm512_reduce_mul_ps:
7442 ; X64: # %bb.0: # %entry
7443 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7444 ; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
7445 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7446 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7447 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7448 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7449 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7450 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7451 ; X64-NEXT: vzeroupper
7454 %0 = bitcast <16 x float> %__W to <8 x double>
7455 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7456 %1 = bitcast <4 x double> %extract.i to <8 x float>
7457 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7458 %2 = bitcast <4 x double> %extract2.i to <8 x float>
7459 %mul.i = fmul <8 x float> %1, %2
7460 %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7461 %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7462 %mul5.i = fmul <4 x float> %extract3.i, %extract4.i
7463 %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7464 %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
7465 %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
7466 %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
7467 %vecext.i = extractelement <4 x float> %mul8.i, i32 0
7471 define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) {
7472 ; X86-LABEL: test_mm512_mask_reduce_add_pd:
7473 ; X86: # %bb.0: # %entry
7474 ; X86-NEXT: pushl %ebp
7475 ; X86-NEXT: .cfi_def_cfa_offset 8
7476 ; X86-NEXT: .cfi_offset %ebp, -8
7477 ; X86-NEXT: movl %esp, %ebp
7478 ; X86-NEXT: .cfi_def_cfa_register %ebp
7479 ; X86-NEXT: andl $-8, %esp
7480 ; X86-NEXT: subl $8, %esp
7481 ; X86-NEXT: movb 8(%ebp), %al
7482 ; X86-NEXT: kmovw %eax, %k1
7483 ; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
7484 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7485 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7486 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7487 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7488 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7489 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7490 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7491 ; X86-NEXT: fldl (%esp)
7492 ; X86-NEXT: movl %ebp, %esp
7493 ; X86-NEXT: popl %ebp
7494 ; X86-NEXT: .cfi_def_cfa %esp, 4
7495 ; X86-NEXT: vzeroupper
7498 ; X64-LABEL: test_mm512_mask_reduce_add_pd:
7499 ; X64: # %bb.0: # %entry
7500 ; X64-NEXT: kmovw %edi, %k1
7501 ; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
7502 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7503 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
7504 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7505 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7506 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7507 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
7508 ; X64-NEXT: vzeroupper
7511 %0 = bitcast i8 %__M to <8 x i1>
7512 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer
7513 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7514 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7515 %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
7516 %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7517 %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7518 %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
7519 %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7520 %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
7521 %vecext.i = extractelement <2 x double> %add7.i, i32 0
7522 ret double %vecext.i
7525 define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) {
7526 ; X86-LABEL: test_mm512_mask_reduce_mul_pd:
7527 ; X86: # %bb.0: # %entry
7528 ; X86-NEXT: pushl %ebp
7529 ; X86-NEXT: .cfi_def_cfa_offset 8
7530 ; X86-NEXT: .cfi_offset %ebp, -8
7531 ; X86-NEXT: movl %esp, %ebp
7532 ; X86-NEXT: .cfi_def_cfa_register %ebp
7533 ; X86-NEXT: andl $-8, %esp
7534 ; X86-NEXT: subl $8, %esp
7535 ; X86-NEXT: movb 8(%ebp), %al
7536 ; X86-NEXT: kmovw %eax, %k1
7537 ; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
7538 ; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
7539 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7540 ; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0
7541 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7542 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7543 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7544 ; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7545 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7546 ; X86-NEXT: fldl (%esp)
7547 ; X86-NEXT: movl %ebp, %esp
7548 ; X86-NEXT: popl %ebp
7549 ; X86-NEXT: .cfi_def_cfa %esp, 4
7550 ; X86-NEXT: vzeroupper
7553 ; X64-LABEL: test_mm512_mask_reduce_mul_pd:
7554 ; X64: # %bb.0: # %entry
7555 ; X64-NEXT: kmovw %edi, %k1
7556 ; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
7557 ; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
7558 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7559 ; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0
7560 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7561 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7562 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7563 ; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0
7564 ; X64-NEXT: vzeroupper
7567 %0 = bitcast i8 %__M to <8 x i1>
7568 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
7569 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7570 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7571 %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
7572 %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7573 %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7574 %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
7575 %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7576 %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
7577 %vecext.i = extractelement <2 x double> %mul7.i, i32 0
7578 ret double %vecext.i
7581 define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) {
7582 ; X86-LABEL: test_mm512_mask_reduce_add_ps:
7583 ; X86: # %bb.0: # %entry
7584 ; X86-NEXT: pushl %eax
7585 ; X86-NEXT: .cfi_def_cfa_offset 8
7586 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
7587 ; X86-NEXT: kmovw %eax, %k1
7588 ; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
7589 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7590 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
7591 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7592 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7593 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7594 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7595 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7596 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
7597 ; X86-NEXT: vmovss %xmm0, (%esp)
7598 ; X86-NEXT: flds (%esp)
7599 ; X86-NEXT: popl %eax
7600 ; X86-NEXT: .cfi_def_cfa_offset 4
7601 ; X86-NEXT: vzeroupper
7604 ; X64-LABEL: test_mm512_mask_reduce_add_ps:
7605 ; X64: # %bb.0: # %entry
7606 ; X64-NEXT: kmovw %edi, %k1
7607 ; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
7608 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7609 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
7610 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7611 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7612 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7613 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7614 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7615 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
7616 ; X64-NEXT: vzeroupper
7619 %0 = bitcast i16 %__M to <16 x i1>
7620 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
7621 %2 = bitcast <16 x float> %1 to <8 x double>
7622 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7623 %3 = bitcast <4 x double> %extract.i to <8 x float>
7624 %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7625 %4 = bitcast <4 x double> %extract3.i to <8 x float>
7626 %add.i = fadd <8 x float> %3, %4
7627 %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7628 %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7629 %add6.i = fadd <4 x float> %extract4.i, %extract5.i
7630 %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7631 %add7.i = fadd <4 x float> %add6.i, %shuffle.i
7632 %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
7633 %add9.i = fadd <4 x float> %add7.i, %shuffle8.i
7634 %vecext.i = extractelement <4 x float> %add9.i, i32 0
7638 define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) {
7639 ; X86-LABEL: test_mm512_mask_reduce_mul_ps:
7640 ; X86: # %bb.0: # %entry
7641 ; X86-NEXT: pushl %eax
7642 ; X86-NEXT: .cfi_def_cfa_offset 8
7643 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
7644 ; X86-NEXT: kmovw %eax, %k1
7645 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
7646 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
7647 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7648 ; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0
7649 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7650 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7651 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7652 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7653 ; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7654 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
7655 ; X86-NEXT: vmovss %xmm0, (%esp)
7656 ; X86-NEXT: flds (%esp)
7657 ; X86-NEXT: popl %eax
7658 ; X86-NEXT: .cfi_def_cfa_offset 4
7659 ; X86-NEXT: vzeroupper
7662 ; X64-LABEL: test_mm512_mask_reduce_mul_ps:
7663 ; X64: # %bb.0: # %entry
7664 ; X64-NEXT: kmovw %edi, %k1
7665 ; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
7666 ; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
7667 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
7668 ; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0
7669 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7670 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7671 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7672 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7673 ; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
7674 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
7675 ; X64-NEXT: vzeroupper
7678 %0 = bitcast i16 %__M to <16 x i1>
7679 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
7680 %2 = bitcast <16 x float> %1 to <8 x double>
7681 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7682 %3 = bitcast <4 x double> %extract.i to <8 x float>
7683 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7684 %4 = bitcast <4 x double> %extract4.i to <8 x float>
7685 %mul.i = fmul <8 x float> %3, %4
7686 %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7687 %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7688 %mul7.i = fmul <4 x float> %extract5.i, %extract6.i
7689 %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
7690 %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
7691 %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
7692 %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
7693 %vecext.i = extractelement <4 x float> %mul10.i, i32 0
7697 define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) {
7698 ; X86-LABEL: test_mm512_reduce_max_epi64:
7699 ; X86: # %bb.0: # %entry
7700 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7701 ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7702 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7703 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7704 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7705 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7706 ; X86-NEXT: vmovd %xmm0, %eax
7707 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7708 ; X86-NEXT: vzeroupper
7711 ; X64-LABEL: test_mm512_reduce_max_epi64:
7712 ; X64: # %bb.0: # %entry
7713 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7714 ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7715 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7716 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7717 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7718 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7719 ; X64-NEXT: vmovq %xmm0, %rax
7720 ; X64-NEXT: vzeroupper
7723 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7724 %0 = icmp slt <8 x i64> %shuffle.i, %__W
7725 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7726 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7727 %2 = icmp sgt <8 x i64> %1, %shuffle1.i
7728 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7729 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7730 %4 = icmp sgt <8 x i64> %3, %shuffle3.i
7731 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7732 %vecext.i = extractelement <8 x i64> %5, i32 0
7736 define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) {
7737 ; X86-LABEL: test_mm512_reduce_max_epu64:
7738 ; X86: # %bb.0: # %entry
7739 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7740 ; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
7741 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7742 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7743 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7744 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7745 ; X86-NEXT: vmovd %xmm0, %eax
7746 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7747 ; X86-NEXT: vzeroupper
7750 ; X64-LABEL: test_mm512_reduce_max_epu64:
7751 ; X64: # %bb.0: # %entry
7752 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7753 ; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0
7754 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7755 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7756 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7757 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
7758 ; X64-NEXT: vmovq %xmm0, %rax
7759 ; X64-NEXT: vzeroupper
7762 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7763 %0 = icmp ult <8 x i64> %shuffle.i, %__W
7764 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7765 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7766 %2 = icmp ugt <8 x i64> %1, %shuffle1.i
7767 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7768 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7769 %4 = icmp ugt <8 x i64> %3, %shuffle3.i
7770 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7771 %vecext.i = extractelement <8 x i64> %5, i32 0
7775 define double @test_mm512_reduce_max_pd(<8 x double> %__W) {
7776 ; X86-LABEL: test_mm512_reduce_max_pd:
7777 ; X86: # %bb.0: # %entry
7778 ; X86-NEXT: pushl %ebp
7779 ; X86-NEXT: .cfi_def_cfa_offset 8
7780 ; X86-NEXT: .cfi_offset %ebp, -8
7781 ; X86-NEXT: movl %esp, %ebp
7782 ; X86-NEXT: .cfi_def_cfa_register %ebp
7783 ; X86-NEXT: andl $-8, %esp
7784 ; X86-NEXT: subl $8, %esp
7785 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7786 ; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
7787 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7788 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7789 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7790 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7791 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7792 ; X86-NEXT: fldl (%esp)
7793 ; X86-NEXT: movl %ebp, %esp
7794 ; X86-NEXT: popl %ebp
7795 ; X86-NEXT: .cfi_def_cfa %esp, 4
7796 ; X86-NEXT: vzeroupper
7799 ; X64-LABEL: test_mm512_reduce_max_pd:
7800 ; X64: # %bb.0: # %entry
7801 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7802 ; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
7803 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7804 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7805 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7806 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
7807 ; X64-NEXT: vzeroupper
7810 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7811 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7812 %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
7813 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7814 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7815 %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
7816 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7817 %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i)
7818 %vecext.i = extractelement <2 x double> %2, i32 0
7819 ret double %vecext.i
7822 define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) {
7823 ; X86-LABEL: test_mm512_reduce_min_epi64:
7824 ; X86: # %bb.0: # %entry
7825 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7826 ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
7827 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7828 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7829 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7830 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7831 ; X86-NEXT: vmovd %xmm0, %eax
7832 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7833 ; X86-NEXT: vzeroupper
7836 ; X64-LABEL: test_mm512_reduce_min_epi64:
7837 ; X64: # %bb.0: # %entry
7838 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7839 ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
7840 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7841 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7842 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7843 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
7844 ; X64-NEXT: vmovq %xmm0, %rax
7845 ; X64-NEXT: vzeroupper
7848 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7849 %0 = icmp sgt <8 x i64> %shuffle.i, %__W
7850 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7851 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7852 %2 = icmp slt <8 x i64> %1, %shuffle1.i
7853 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7854 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7855 %4 = icmp slt <8 x i64> %3, %shuffle3.i
7856 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7857 %vecext.i = extractelement <8 x i64> %5, i32 0
7861 define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) {
7862 ; X86-LABEL: test_mm512_reduce_min_epu64:
7863 ; X86: # %bb.0: # %entry
7864 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7865 ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
7866 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7867 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7868 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7869 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7870 ; X86-NEXT: vmovd %xmm0, %eax
7871 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7872 ; X86-NEXT: vzeroupper
7875 ; X64-LABEL: test_mm512_reduce_min_epu64:
7876 ; X64: # %bb.0: # %entry
7877 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
7878 ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
7879 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7880 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7881 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7882 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
7883 ; X64-NEXT: vmovq %xmm0, %rax
7884 ; X64-NEXT: vzeroupper
7887 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7888 %0 = icmp ugt <8 x i64> %shuffle.i, %__W
7889 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i
7890 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7891 %2 = icmp ult <8 x i64> %1, %shuffle1.i
7892 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i
7893 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7894 %4 = icmp ult <8 x i64> %3, %shuffle3.i
7895 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7896 %vecext.i = extractelement <8 x i64> %5, i32 0
7900 define double @test_mm512_reduce_min_pd(<8 x double> %__W) {
7901 ; X86-LABEL: test_mm512_reduce_min_pd:
7902 ; X86: # %bb.0: # %entry
7903 ; X86-NEXT: pushl %ebp
7904 ; X86-NEXT: .cfi_def_cfa_offset 8
7905 ; X86-NEXT: .cfi_offset %ebp, -8
7906 ; X86-NEXT: movl %esp, %ebp
7907 ; X86-NEXT: .cfi_def_cfa_register %ebp
7908 ; X86-NEXT: andl $-8, %esp
7909 ; X86-NEXT: subl $8, %esp
7910 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7911 ; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0
7912 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
7913 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
7914 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7915 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
7916 ; X86-NEXT: vmovlpd %xmm0, (%esp)
7917 ; X86-NEXT: fldl (%esp)
7918 ; X86-NEXT: movl %ebp, %esp
7919 ; X86-NEXT: popl %ebp
7920 ; X86-NEXT: .cfi_def_cfa %esp, 4
7921 ; X86-NEXT: vzeroupper
7924 ; X64-LABEL: test_mm512_reduce_min_pd:
7925 ; X64: # %bb.0: # %entry
7926 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
7927 ; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
7928 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
7929 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
7930 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
7931 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
7932 ; X64-NEXT: vzeroupper
7935 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
7936 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
7937 %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i)
7938 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1>
7939 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3>
7940 %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i)
7941 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
7942 %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i)
7943 %vecext.i = extractelement <2 x double> %2, i32 0
7944 ret double %vecext.i
7947 define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) {
7948 ; X86-LABEL: test_mm512_mask_reduce_max_epi64:
7949 ; X86: # %bb.0: # %entry
7950 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7951 ; X86-NEXT: kmovw %eax, %k1
7952 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648]
7953 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
7954 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7955 ; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7956 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7957 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7958 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7959 ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7960 ; X86-NEXT: vmovd %xmm0, %eax
7961 ; X86-NEXT: vpextrd $1, %xmm0, %edx
7962 ; X86-NEXT: vzeroupper
7965 ; X64-LABEL: test_mm512_mask_reduce_max_epi64:
7966 ; X64: # %bb.0: # %entry
7967 ; X64-NEXT: kmovw %edi, %k1
7968 ; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
7969 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
7970 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
7971 ; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
7972 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
7973 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7974 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
7975 ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
7976 ; X64-NEXT: vmovq %xmm0, %rax
7977 ; X64-NEXT: vzeroupper
7980 %0 = bitcast i8 %__M to <8 x i1>
7981 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808>
7982 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
7983 %2 = icmp sgt <8 x i64> %1, %shuffle.i
7984 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
7985 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
7986 %4 = icmp sgt <8 x i64> %3, %shuffle3.i
7987 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
7988 %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
7989 %6 = icmp sgt <8 x i64> %5, %shuffle5.i
7990 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
7991 %vecext.i = extractelement <8 x i64> %7, i32 0
7995 define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) {
7996 ; X86-LABEL: test_mm512_mask_reduce_max_epu64:
7997 ; X86: # %bb.0: # %entry
7998 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
7999 ; X86-NEXT: kmovw %eax, %k1
8000 ; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
8001 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
8002 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
8003 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
8004 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
8005 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
8006 ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
8007 ; X86-NEXT: vmovd %xmm0, %eax
8008 ; X86-NEXT: vpextrd $1, %xmm0, %edx
8009 ; X86-NEXT: vzeroupper
8012 ; X64-LABEL: test_mm512_mask_reduce_max_epu64:
8013 ; X64: # %bb.0: # %entry
8014 ; X64-NEXT: kmovw %edi, %k1
8015 ; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
8016 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3]
8017 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
8018 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
8019 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
8020 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
8021 ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
8022 ; X64-NEXT: vmovq %xmm0, %rax
8023 ; X64-NEXT: vzeroupper
8026 %0 = bitcast i8 %__M to <8 x i1>
8027 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
8028 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
8029 %2 = icmp ugt <8 x i64> %1, %shuffle.i
8030 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
8031 %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
8032 %4 = icmp ugt <8 x i64> %3, %shuffle2.i
8033 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
8034 %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
8035 %6 = icmp ugt <8 x i64> %5, %shuffle4.i
8036 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
8037 %vecext.i = extractelement <8 x i64> %7, i32 0
8041 define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
8042 ; X86-LABEL: test_mm512_mask_reduce_max_pd:
8043 ; X86: # %bb.0: # %entry
8044 ; X86-NEXT: pushl %ebp
8045 ; X86-NEXT: .cfi_def_cfa_offset 8
8046 ; X86-NEXT: .cfi_offset %ebp, -8
8047 ; X86-NEXT: movl %esp, %ebp
8048 ; X86-NEXT: .cfi_def_cfa_register %ebp
8049 ; X86-NEXT: andl $-8, %esp
8050 ; X86-NEXT: subl $8, %esp
8051 ; X86-NEXT: movb 8(%ebp), %al
8052 ; X86-NEXT: kmovw %eax, %k1
8053 ; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8054 ; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
8055 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8056 ; X86-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
8057 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8058 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
8059 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8060 ; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
8061 ; X86-NEXT: vmovlpd %xmm0, (%esp)
8062 ; X86-NEXT: fldl (%esp)
8063 ; X86-NEXT: movl %ebp, %esp
8064 ; X86-NEXT: popl %ebp
8065 ; X86-NEXT: .cfi_def_cfa %esp, 4
8066 ; X86-NEXT: vzeroupper
8069 ; X64-LABEL: test_mm512_mask_reduce_max_pd:
8070 ; X64: # %bb.0: # %entry
8071 ; X64-NEXT: kmovw %edi, %k1
8072 ; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8073 ; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
8074 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8075 ; X64-NEXT: vmaxpd %ymm0, %ymm1, %ymm0
8076 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8077 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
8078 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8079 ; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
8080 ; X64-NEXT: vzeroupper
8083 %0 = bitcast i8 %__M to <8 x i1>
8084 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
8085 %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8086 %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8087 %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
8088 %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
8089 %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
8090 %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
8091 %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
8092 %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
8093 %vecext.i = extractelement <2 x double> %4, i32 0
8094 ret double %vecext.i
8097 define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
8098 ; X86-LABEL: test_mm512_mask_reduce_min_epi64:
8099 ; X86: # %bb.0: # %entry
8100 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
8101 ; X86-NEXT: kmovw %eax, %k1
8102 ; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
8103 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
8104 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8105 ; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0
8106 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
8107 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
8108 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
8109 ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0
8110 ; X86-NEXT: vmovd %xmm0, %eax
8111 ; X86-NEXT: vpextrd $1, %xmm0, %edx
8112 ; X86-NEXT: vzeroupper
8115 ; X64-LABEL: test_mm512_mask_reduce_min_epi64:
8116 ; X64: # %bb.0: # %entry
8117 ; X64-NEXT: kmovw %edi, %k1
8118 ; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
8119 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
8120 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8121 ; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0
8122 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
8123 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
8124 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
8125 ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0
8126 ; X64-NEXT: vmovq %xmm0, %rax
8127 ; X64-NEXT: vzeroupper
8130 %0 = bitcast i8 %__M to <8 x i1>
8131 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
8132 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
8133 %2 = icmp slt <8 x i64> %1, %shuffle.i
8134 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
8135 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
8136 %4 = icmp slt <8 x i64> %3, %shuffle3.i
8137 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
8138 %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
8139 %6 = icmp slt <8 x i64> %5, %shuffle5.i
8140 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
8141 %vecext.i = extractelement <8 x i64> %7, i32 0
8145 define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
8146 ; X86-LABEL: test_mm512_mask_reduce_min_epu64:
8147 ; X86: # %bb.0: # %entry
8148 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
8149 ; X86-NEXT: kmovw %eax, %k1
8150 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
8151 ; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
8152 ; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8153 ; X86-NEXT: vpminuq %zmm0, %zmm1, %zmm0
8154 ; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
8155 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
8156 ; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
8157 ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0
8158 ; X86-NEXT: vmovd %xmm0, %eax
8159 ; X86-NEXT: vpextrd $1, %xmm0, %edx
8160 ; X86-NEXT: vzeroupper
8163 ; X64-LABEL: test_mm512_mask_reduce_min_epu64:
8164 ; X64: # %bb.0: # %entry
8165 ; X64-NEXT: kmovw %edi, %k1
8166 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
8167 ; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
8168 ; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
8169 ; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0
8170 ; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
8171 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
8172 ; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
8173 ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0
8174 ; X64-NEXT: vmovq %xmm0, %rax
8175 ; X64-NEXT: vzeroupper
8178 %0 = bitcast i8 %__M to <8 x i1>
8179 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
8180 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
8181 %2 = icmp ult <8 x i64> %1, %shuffle.i
8182 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
8183 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
8184 %4 = icmp ult <8 x i64> %3, %shuffle3.i
8185 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
8186 %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
8187 %6 = icmp ult <8 x i64> %5, %shuffle5.i
8188 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
8189 %vecext.i = extractelement <8 x i64> %7, i32 0
8193 define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
8194 ; X86-LABEL: test_mm512_mask_reduce_min_pd:
8195 ; X86: # %bb.0: # %entry
8196 ; X86-NEXT: pushl %ebp
8197 ; X86-NEXT: .cfi_def_cfa_offset 8
8198 ; X86-NEXT: .cfi_offset %ebp, -8
8199 ; X86-NEXT: movl %esp, %ebp
8200 ; X86-NEXT: .cfi_def_cfa_register %ebp
8201 ; X86-NEXT: andl $-8, %esp
8202 ; X86-NEXT: subl $8, %esp
8203 ; X86-NEXT: movb 8(%ebp), %al
8204 ; X86-NEXT: kmovw %eax, %k1
8205 ; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8206 ; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1}
8207 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8208 ; X86-NEXT: vminpd %ymm0, %ymm1, %ymm0
8209 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8210 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
8211 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8212 ; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
8213 ; X86-NEXT: vmovlpd %xmm0, (%esp)
8214 ; X86-NEXT: fldl (%esp)
8215 ; X86-NEXT: movl %ebp, %esp
8216 ; X86-NEXT: popl %ebp
8217 ; X86-NEXT: .cfi_def_cfa %esp, 4
8218 ; X86-NEXT: vzeroupper
8221 ; X64-LABEL: test_mm512_mask_reduce_min_pd:
8222 ; X64: # %bb.0: # %entry
8223 ; X64-NEXT: kmovw %edi, %k1
8224 ; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8225 ; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1}
8226 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8227 ; X64-NEXT: vminpd %ymm0, %ymm1, %ymm0
8228 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8229 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
8230 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8231 ; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0
8232 ; X64-NEXT: vzeroupper
8235 %0 = bitcast i8 %__M to <8 x i1>
8236 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
8237 %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8238 %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8239 %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
8240 %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
8241 %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
8242 %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
8243 %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
8244 %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
8245 %vecext.i = extractelement <2 x double> %4, i32 0
8246 ret double %vecext.i
8249 define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
8250 ; CHECK-LABEL: test_mm512_reduce_max_epi32:
8251 ; CHECK: # %bb.0: # %entry
8252 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8253 ; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
8254 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8255 ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8256 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8257 ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8258 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8259 ; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8260 ; CHECK-NEXT: vmovd %xmm0, %eax
8261 ; CHECK-NEXT: vzeroupper
8262 ; CHECK-NEXT: ret{{[l|q]}}
8264 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8265 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8266 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8267 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8268 %2 = icmp sgt <8 x i32> %0, %1
8269 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8270 %4 = bitcast <8 x i32> %3 to <4 x i64>
8271 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8272 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8273 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8274 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8275 %7 = icmp sgt <4 x i32> %5, %6
8276 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8277 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8278 %9 = icmp sgt <4 x i32> %8, %shuffle.i
8279 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8280 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8281 %11 = icmp sgt <4 x i32> %10, %shuffle8.i
8282 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8283 %vecext.i = extractelement <4 x i32> %12, i32 0
8287 define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
8288 ; CHECK-LABEL: test_mm512_reduce_max_epu32:
8289 ; CHECK: # %bb.0: # %entry
8290 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8291 ; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
8292 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8293 ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8294 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8295 ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8296 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8297 ; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8298 ; CHECK-NEXT: vmovd %xmm0, %eax
8299 ; CHECK-NEXT: vzeroupper
8300 ; CHECK-NEXT: ret{{[l|q]}}
8302 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8303 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8304 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8305 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8306 %2 = icmp ugt <8 x i32> %0, %1
8307 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8308 %4 = bitcast <8 x i32> %3 to <4 x i64>
8309 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8310 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8311 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8312 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8313 %7 = icmp ugt <4 x i32> %5, %6
8314 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8315 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8316 %9 = icmp ugt <4 x i32> %8, %shuffle.i
8317 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8318 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8319 %11 = icmp ugt <4 x i32> %10, %shuffle8.i
8320 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8321 %vecext.i = extractelement <4 x i32> %12, i32 0
8325 define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
8326 ; X86-LABEL: test_mm512_reduce_max_ps:
8327 ; X86: # %bb.0: # %entry
8328 ; X86-NEXT: pushl %eax
8329 ; X86-NEXT: .cfi_def_cfa_offset 8
8330 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8331 ; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0
8332 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8333 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8334 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8335 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8336 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8337 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8338 ; X86-NEXT: vmovss %xmm0, (%esp)
8339 ; X86-NEXT: flds (%esp)
8340 ; X86-NEXT: popl %eax
8341 ; X86-NEXT: .cfi_def_cfa_offset 4
8342 ; X86-NEXT: vzeroupper
8345 ; X64-LABEL: test_mm512_reduce_max_ps:
8346 ; X64: # %bb.0: # %entry
8347 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8348 ; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
8349 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8350 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8351 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8352 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8353 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8354 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8355 ; X64-NEXT: vzeroupper
8358 %0 = bitcast <16 x float> %__W to <8 x double>
8359 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8360 %1 = bitcast <4 x double> %extract.i to <8 x float>
8361 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8362 %2 = bitcast <4 x double> %extract2.i to <8 x float>
8363 %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
8364 %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8365 %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8366 %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
8367 %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8368 %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
8369 %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8370 %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
8371 %vecext.i = extractelement <4 x float> %6, i32 0
8375 define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
8376 ; CHECK-LABEL: test_mm512_reduce_min_epi32:
8377 ; CHECK: # %bb.0: # %entry
8378 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8379 ; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
8380 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8381 ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8382 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8383 ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8384 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8385 ; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8386 ; CHECK-NEXT: vmovd %xmm0, %eax
8387 ; CHECK-NEXT: vzeroupper
8388 ; CHECK-NEXT: ret{{[l|q]}}
8390 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8391 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8392 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8393 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8394 %2 = icmp slt <8 x i32> %0, %1
8395 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8396 %4 = bitcast <8 x i32> %3 to <4 x i64>
8397 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8398 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8399 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8400 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8401 %7 = icmp slt <4 x i32> %5, %6
8402 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8403 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8404 %9 = icmp slt <4 x i32> %8, %shuffle.i
8405 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8406 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8407 %11 = icmp slt <4 x i32> %10, %shuffle8.i
8408 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8409 %vecext.i = extractelement <4 x i32> %12, i32 0
8413 define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
8414 ; CHECK-LABEL: test_mm512_reduce_min_epu32:
8415 ; CHECK: # %bb.0: # %entry
8416 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8417 ; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
8418 ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
8419 ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
8420 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8421 ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
8422 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8423 ; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0
8424 ; CHECK-NEXT: vmovd %xmm0, %eax
8425 ; CHECK-NEXT: vzeroupper
8426 ; CHECK-NEXT: ret{{[l|q]}}
8428 %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8429 %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8430 %0 = bitcast <4 x i64> %extract.i to <8 x i32>
8431 %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
8432 %2 = icmp ult <8 x i32> %0, %1
8433 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
8434 %4 = bitcast <8 x i32> %3 to <4 x i64>
8435 %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8436 %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8437 %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
8438 %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
8439 %7 = icmp ult <4 x i32> %5, %6
8440 %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
8441 %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8442 %9 = icmp ult <4 x i32> %8, %shuffle.i
8443 %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
8444 %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8445 %11 = icmp ult <4 x i32> %10, %shuffle8.i
8446 %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
8447 %vecext.i = extractelement <4 x i32> %12, i32 0
8451 define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
8452 ; X86-LABEL: test_mm512_reduce_min_ps:
8453 ; X86: # %bb.0: # %entry
8454 ; X86-NEXT: pushl %eax
8455 ; X86-NEXT: .cfi_def_cfa_offset 8
8456 ; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8457 ; X86-NEXT: vminps %ymm1, %ymm0, %ymm0
8458 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8459 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8460 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8461 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8462 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8463 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8464 ; X86-NEXT: vmovss %xmm0, (%esp)
8465 ; X86-NEXT: flds (%esp)
8466 ; X86-NEXT: popl %eax
8467 ; X86-NEXT: .cfi_def_cfa_offset 4
8468 ; X86-NEXT: vzeroupper
8471 ; X64-LABEL: test_mm512_reduce_min_ps:
8472 ; X64: # %bb.0: # %entry
8473 ; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1
8474 ; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
8475 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8476 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8477 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8478 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8479 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8480 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8481 ; X64-NEXT: vzeroupper
8484 %0 = bitcast <16 x float> %__W to <8 x double>
8485 %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8486 %1 = bitcast <4 x double> %extract.i to <8 x float>
8487 %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8488 %2 = bitcast <4 x double> %extract2.i to <8 x float>
8489 %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
8490 %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8491 %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8492 %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
8493 %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8494 %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
8495 %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8496 %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
8497 %vecext.i = extractelement <4 x float> %6, i32 0
8501 define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
8502 ; X86-LABEL: test_mm512_mask_reduce_max_epi32:
8503 ; X86: # %bb.0: # %entry
8504 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8505 ; X86-NEXT: kmovw %eax, %k1
8506 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
8507 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8508 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8509 ; X86-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
8510 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8511 ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8512 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8513 ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8514 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8515 ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8516 ; X86-NEXT: vmovd %xmm0, %eax
8517 ; X86-NEXT: vzeroupper
8520 ; X64-LABEL: test_mm512_mask_reduce_max_epi32:
8521 ; X64: # %bb.0: # %entry
8522 ; X64-NEXT: kmovw %edi, %k1
8523 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
8524 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8525 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8526 ; X64-NEXT: vpmaxsd %ymm0, %ymm1, %ymm0
8527 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8528 ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8529 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8530 ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8531 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8532 ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
8533 ; X64-NEXT: vmovd %xmm0, %eax
8534 ; X64-NEXT: vzeroupper
8537 %0 = bitcast <8 x i64> %__W to <16 x i32>
8538 %1 = bitcast i16 %__M to <16 x i1>
8539 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
8540 %3 = bitcast <16 x i32> %2 to <8 x i64>
8541 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8542 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8543 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8544 %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
8545 %6 = icmp sgt <8 x i32> %4, %5
8546 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8547 %8 = bitcast <8 x i32> %7 to <4 x i64>
8548 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8549 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8550 %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
8551 %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
8552 %11 = icmp sgt <4 x i32> %9, %10
8553 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8554 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8555 %13 = icmp sgt <4 x i32> %12, %shuffle.i
8556 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8557 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8558 %15 = icmp sgt <4 x i32> %14, %shuffle10.i
8559 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
8560 %vecext.i = extractelement <4 x i32> %16, i32 0
8564 define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
8565 ; X86-LABEL: test_mm512_mask_reduce_max_epu32:
8566 ; X86: # %bb.0: # %entry
8567 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8568 ; X86-NEXT: kmovw %eax, %k1
8569 ; X86-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
8570 ; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8571 ; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
8572 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8573 ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8574 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8575 ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8576 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8577 ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8578 ; X86-NEXT: vmovd %xmm0, %eax
8579 ; X86-NEXT: vzeroupper
8582 ; X64-LABEL: test_mm512_mask_reduce_max_epu32:
8583 ; X64: # %bb.0: # %entry
8584 ; X64-NEXT: kmovw %edi, %k1
8585 ; X64-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
8586 ; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
8587 ; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
8588 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8589 ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8590 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8591 ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8592 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8593 ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
8594 ; X64-NEXT: vmovd %xmm0, %eax
8595 ; X64-NEXT: vzeroupper
8598 %0 = bitcast <8 x i64> %__W to <16 x i32>
8599 %1 = bitcast i16 %__M to <16 x i1>
8600 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
8601 %3 = bitcast <16 x i32> %2 to <8 x i64>
8602 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8603 %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8604 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8605 %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
8606 %6 = icmp ugt <8 x i32> %4, %5
8607 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8608 %8 = bitcast <8 x i32> %7 to <4 x i64>
8609 %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8610 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8611 %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
8612 %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
8613 %11 = icmp ugt <4 x i32> %9, %10
8614 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8615 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8616 %13 = icmp ugt <4 x i32> %12, %shuffle.i
8617 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8618 %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8619 %15 = icmp ugt <4 x i32> %14, %shuffle9.i
8620 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
8621 %vecext.i = extractelement <4 x i32> %16, i32 0
8625 define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
8626 ; X86-LABEL: test_mm512_mask_reduce_max_ps:
8627 ; X86: # %bb.0: # %entry
8628 ; X86-NEXT: pushl %eax
8629 ; X86-NEXT: .cfi_def_cfa_offset 8
8630 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8631 ; X86-NEXT: kmovw %eax, %k1
8632 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8633 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8634 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8635 ; X86-NEXT: vmaxps %ymm0, %ymm1, %ymm0
8636 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8637 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8638 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8639 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8640 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8641 ; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8642 ; X86-NEXT: vmovss %xmm0, (%esp)
8643 ; X86-NEXT: flds (%esp)
8644 ; X86-NEXT: popl %eax
8645 ; X86-NEXT: .cfi_def_cfa_offset 4
8646 ; X86-NEXT: vzeroupper
8649 ; X64-LABEL: test_mm512_mask_reduce_max_ps:
8650 ; X64: # %bb.0: # %entry
8651 ; X64-NEXT: kmovw %edi, %k1
8652 ; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
8653 ; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8654 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8655 ; X64-NEXT: vmaxps %ymm0, %ymm1, %ymm0
8656 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8657 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8658 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8659 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8660 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8661 ; X64-NEXT: vmaxps %xmm1, %xmm0, %xmm0
8662 ; X64-NEXT: vzeroupper
8665 %0 = bitcast i16 %__M to <16 x i1>
8666 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
8667 %2 = bitcast <16 x float> %1 to <8 x double>
8668 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8669 %3 = bitcast <4 x double> %extract.i to <8 x float>
8670 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8671 %4 = bitcast <4 x double> %extract4.i to <8 x float>
8672 %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
8673 %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8674 %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8675 %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
8676 %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8677 %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
8678 %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8679 %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
8680 %vecext.i = extractelement <4 x float> %8, i32 0
8684 define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
8685 ; X86-LABEL: test_mm512_mask_reduce_min_epi32:
8686 ; X86: # %bb.0: # %entry
8687 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8688 ; X86-NEXT: kmovw %eax, %k1
8689 ; X86-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
8690 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8691 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8692 ; X86-NEXT: vpminsd %ymm0, %ymm1, %ymm0
8693 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8694 ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8695 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8696 ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8697 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8698 ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8699 ; X86-NEXT: vmovd %xmm0, %eax
8700 ; X86-NEXT: vzeroupper
8703 ; X64-LABEL: test_mm512_mask_reduce_min_epi32:
8704 ; X64: # %bb.0: # %entry
8705 ; X64-NEXT: kmovw %edi, %k1
8706 ; X64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
8707 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8708 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8709 ; X64-NEXT: vpminsd %ymm0, %ymm1, %ymm0
8710 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8711 ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8712 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8713 ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8714 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8715 ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0
8716 ; X64-NEXT: vmovd %xmm0, %eax
8717 ; X64-NEXT: vzeroupper
8720 %0 = bitcast <8 x i64> %__W to <16 x i32>
8721 %1 = bitcast i16 %__M to <16 x i1>
8722 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
8723 %3 = bitcast <16 x i32> %2 to <8 x i64>
8724 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8725 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8726 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8727 %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
8728 %6 = icmp slt <8 x i32> %4, %5
8729 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8730 %8 = bitcast <8 x i32> %7 to <4 x i64>
8731 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8732 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8733 %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
8734 %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
8735 %11 = icmp slt <4 x i32> %9, %10
8736 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8737 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8738 %13 = icmp slt <4 x i32> %12, %shuffle.i
8739 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8740 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8741 %15 = icmp slt <4 x i32> %14, %shuffle10.i
8742 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
8743 %vecext.i = extractelement <4 x i32> %16, i32 0
8747 define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
8748 ; X86-LABEL: test_mm512_mask_reduce_min_epu32:
8749 ; X86: # %bb.0: # %entry
8750 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8751 ; X86-NEXT: kmovw %eax, %k1
8752 ; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
8753 ; X86-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8754 ; X86-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8755 ; X86-NEXT: vpminud %ymm0, %ymm1, %ymm0
8756 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
8757 ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
8758 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8759 ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
8760 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8761 ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
8762 ; X86-NEXT: vmovd %xmm0, %eax
8763 ; X86-NEXT: vzeroupper
8766 ; X64-LABEL: test_mm512_mask_reduce_min_epu32:
8767 ; X64: # %bb.0: # %entry
8768 ; X64-NEXT: kmovw %edi, %k1
8769 ; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
8770 ; X64-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
8771 ; X64-NEXT: vextracti64x4 $1, %zmm1, %ymm0
8772 ; X64-NEXT: vpminud %ymm0, %ymm1, %ymm0
8773 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
8774 ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
8775 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
8776 ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
8777 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
8778 ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0
8779 ; X64-NEXT: vmovd %xmm0, %eax
8780 ; X64-NEXT: vzeroupper
8783 %0 = bitcast <8 x i64> %__W to <16 x i32>
8784 %1 = bitcast i16 %__M to <16 x i1>
8785 %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
8786 %3 = bitcast <16 x i32> %2 to <8 x i64>
8787 %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8788 %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8789 %4 = bitcast <4 x i64> %extract.i to <8 x i32>
8790 %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
8791 %6 = icmp ult <8 x i32> %4, %5
8792 %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
8793 %8 = bitcast <8 x i32> %7 to <4 x i64>
8794 %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
8795 %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
8796 %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
8797 %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
8798 %11 = icmp ult <4 x i32> %9, %10
8799 %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
8800 %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8801 %13 = icmp ult <4 x i32> %12, %shuffle.i
8802 %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
8803 %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8804 %15 = icmp ult <4 x i32> %14, %shuffle10.i
8805 %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
8806 %vecext.i = extractelement <4 x i32> %16, i32 0
8810 define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M, <16 x float> %__W) {
8811 ; X86-LABEL: test_mm512_mask_reduce_min_ps:
8812 ; X86: # %bb.0: # %entry
8813 ; X86-NEXT: pushl %eax
8814 ; X86-NEXT: .cfi_def_cfa_offset 8
8815 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8816 ; X86-NEXT: kmovw %eax, %k1
8817 ; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8818 ; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8819 ; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8820 ; X86-NEXT: vminps %ymm0, %ymm1, %ymm0
8821 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
8822 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8823 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8824 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8825 ; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8826 ; X86-NEXT: vminps %xmm1, %xmm0, %xmm0
8827 ; X86-NEXT: vmovss %xmm0, (%esp)
8828 ; X86-NEXT: flds (%esp)
8829 ; X86-NEXT: popl %eax
8830 ; X86-NEXT: .cfi_def_cfa_offset 4
8831 ; X86-NEXT: vzeroupper
8834 ; X64-LABEL: test_mm512_mask_reduce_min_ps:
8835 ; X64: # %bb.0: # %entry
8836 ; X64-NEXT: kmovw %edi, %k1
8837 ; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
8838 ; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
8839 ; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0
8840 ; X64-NEXT: vminps %ymm0, %ymm1, %ymm0
8841 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
8842 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8843 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
8844 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8845 ; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
8846 ; X64-NEXT: vminps %xmm1, %xmm0, %xmm0
8847 ; X64-NEXT: vzeroupper
8850 %0 = bitcast i16 %__M to <16 x i1>
8851 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000>
8852 %2 = bitcast <16 x float> %1 to <8 x double>
8853 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8854 %3 = bitcast <4 x double> %extract.i to <8 x float>
8855 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8856 %4 = bitcast <4 x double> %extract4.i to <8 x float>
8857 %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4)
8858 %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
8859 %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
8860 %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
8861 %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
8862 %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i)
8863 %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
8864 %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i)
8865 %vecext.i = extractelement <4 x float> %8, i32 0
8869 define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
8870 ; X86-LABEL: test_mm512_mask_max_pd:
8871 ; X86: # %bb.0: # %entry
8872 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
8873 ; X86-NEXT: kmovw %eax, %k1
8874 ; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
8877 ; X64-LABEL: test_mm512_mask_max_pd:
8878 ; X64: # %bb.0: # %entry
8879 ; X64-NEXT: kmovw %edi, %k1
8880 ; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
8883 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
8884 %1 = bitcast i8 %__U to <8 x i1>
8885 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
8889 define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
8890 ; X86-LABEL: test_mm512_maskz_max_pd:
8891 ; X86: # %bb.0: # %entry
8892 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
8893 ; X86-NEXT: kmovw %eax, %k1
8894 ; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
8897 ; X64-LABEL: test_mm512_maskz_max_pd:
8898 ; X64: # %bb.0: # %entry
8899 ; X64-NEXT: kmovw %edi, %k1
8900 ; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
8903 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
8904 %1 = bitcast i8 %__U to <8 x i1>
8905 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
8909 define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
8910 ; X86-LABEL: test_mm512_mask_max_ps:
8911 ; X86: # %bb.0: # %entry
8912 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8913 ; X86-NEXT: kmovw %eax, %k1
8914 ; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
8917 ; X64-LABEL: test_mm512_mask_max_ps:
8918 ; X64: # %bb.0: # %entry
8919 ; X64-NEXT: kmovw %edi, %k1
8920 ; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
8923 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
8924 %1 = bitcast i16 %__U to <16 x i1>
8925 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
8929 define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
8930 ; X86-LABEL: test_mm512_mask_max_round_pd:
8931 ; X86: # %bb.0: # %entry
8932 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
8933 ; X86-NEXT: kmovw %eax, %k1
8934 ; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
8937 ; X64-LABEL: test_mm512_mask_max_round_pd:
8938 ; X64: # %bb.0: # %entry
8939 ; X64-NEXT: kmovw %edi, %k1
8940 ; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
8943 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
8944 %1 = bitcast i8 %__U to <8 x i1>
8945 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
8949 declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32)
8951 define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
8952 ; X86-LABEL: test_mm512_maskz_max_round_pd:
8953 ; X86: # %bb.0: # %entry
8954 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
8955 ; X86-NEXT: kmovw %eax, %k1
8956 ; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
8959 ; X64-LABEL: test_mm512_maskz_max_round_pd:
8960 ; X64: # %bb.0: # %entry
8961 ; X64-NEXT: kmovw %edi, %k1
8962 ; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z}
8965 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
8966 %1 = bitcast i8 %__U to <8 x i1>
8967 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
8971 define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) {
8972 ; CHECK-LABEL: test_mm512_max_round_pd:
8973 ; CHECK: # %bb.0: # %entry
8974 ; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
8975 ; CHECK-NEXT: ret{{[l|q]}}
8977 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
8981 define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
8982 ; X86-LABEL: test_mm512_maskz_max_ps:
8983 ; X86: # %bb.0: # %entry
8984 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
8985 ; X86-NEXT: kmovw %eax, %k1
8986 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
8989 ; X64-LABEL: test_mm512_maskz_max_ps:
8990 ; X64: # %bb.0: # %entry
8991 ; X64-NEXT: kmovw %edi, %k1
8992 ; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
8995 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
8996 %1 = bitcast i16 %__U to <16 x i1>
8997 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
9001 define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
9002 ; X86-LABEL: test_mm512_mask_max_round_ps:
9003 ; X86: # %bb.0: # %entry
9004 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9005 ; X86-NEXT: kmovw %eax, %k1
9006 ; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
9009 ; X64-LABEL: test_mm512_mask_max_round_ps:
9010 ; X64: # %bb.0: # %entry
9011 ; X64-NEXT: kmovw %edi, %k1
9012 ; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
9015 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9016 %1 = bitcast i16 %__U to <16 x i1>
9017 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
9021 declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
9023 define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
9024 ; X86-LABEL: test_mm512_maskz_max_round_ps:
9025 ; X86: # %bb.0: # %entry
9026 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9027 ; X86-NEXT: kmovw %eax, %k1
9028 ; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
9031 ; X64-LABEL: test_mm512_maskz_max_round_ps:
9032 ; X64: # %bb.0: # %entry
9033 ; X64-NEXT: kmovw %edi, %k1
9034 ; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
9037 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9038 %1 = bitcast i16 %__U to <16 x i1>
9039 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
9043 define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) {
9044 ; CHECK-LABEL: test_mm512_max_round_ps:
9045 ; CHECK: # %bb.0: # %entry
9046 ; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
9047 ; CHECK-NEXT: ret{{[l|q]}}
9049 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9053 define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9054 ; X86-LABEL: test_mm512_mask_min_pd:
9055 ; X86: # %bb.0: # %entry
9056 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9057 ; X86-NEXT: kmovw %eax, %k1
9058 ; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9061 ; X64-LABEL: test_mm512_mask_min_pd:
9062 ; X64: # %bb.0: # %entry
9063 ; X64-NEXT: kmovw %edi, %k1
9064 ; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9067 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9068 %1 = bitcast i8 %__U to <8 x i1>
9069 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
9073 define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9074 ; X86-LABEL: test_mm512_maskz_min_pd:
9075 ; X86: # %bb.0: # %entry
9076 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9077 ; X86-NEXT: kmovw %eax, %k1
9078 ; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9081 ; X64-LABEL: test_mm512_maskz_min_pd:
9082 ; X64: # %bb.0: # %entry
9083 ; X64-NEXT: kmovw %edi, %k1
9084 ; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9087 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9088 %1 = bitcast i8 %__U to <8 x i1>
9089 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
9093 define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9094 ; X86-LABEL: test_mm512_mask_min_round_pd:
9095 ; X86: # %bb.0: # %entry
9096 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9097 ; X86-NEXT: kmovw %eax, %k1
9098 ; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9101 ; X64-LABEL: test_mm512_mask_min_round_pd:
9102 ; X64: # %bb.0: # %entry
9103 ; X64-NEXT: kmovw %edi, %k1
9104 ; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
9107 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9108 %1 = bitcast i8 %__U to <8 x i1>
9109 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
9113 declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32)
9115 define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
9116 ; X86-LABEL: test_mm512_maskz_min_round_pd:
9117 ; X86: # %bb.0: # %entry
9118 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9119 ; X86-NEXT: kmovw %eax, %k1
9120 ; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9123 ; X64-LABEL: test_mm512_maskz_min_round_pd:
9124 ; X64: # %bb.0: # %entry
9125 ; X64-NEXT: kmovw %edi, %k1
9126 ; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z}
9129 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9130 %1 = bitcast i8 %__U to <8 x i1>
9131 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
9135 define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) {
9136 ; CHECK-LABEL: test_mm512_min_round_pd:
9137 ; CHECK: # %bb.0: # %entry
9138 ; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
9139 ; CHECK-NEXT: ret{{[l|q]}}
9141 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4)
9145 define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
9146 ; X86-LABEL: test_mm512_mask_min_ps:
9147 ; X86: # %bb.0: # %entry
9148 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9149 ; X86-NEXT: kmovw %eax, %k1
9150 ; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
9153 ; X64-LABEL: test_mm512_mask_min_ps:
9154 ; X64: # %bb.0: # %entry
9155 ; X64-NEXT: kmovw %edi, %k1
9156 ; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
9159 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9160 %1 = bitcast i16 %__U to <16 x i1>
9161 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
9165 define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
9166 ; X86-LABEL: test_mm512_maskz_min_ps:
9167 ; X86: # %bb.0: # %entry
9168 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9169 ; X86-NEXT: kmovw %eax, %k1
9170 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
9173 ; X64-LABEL: test_mm512_maskz_min_ps:
9174 ; X64: # %bb.0: # %entry
9175 ; X64-NEXT: kmovw %edi, %k1
9176 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
9179 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9180 %1 = bitcast i16 %__U to <16 x i1>
9181 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
9185 define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
9186 ; X86-LABEL: test_mm512_mask_min_round_ps:
9187 ; X86: # %bb.0: # %entry
9188 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9189 ; X86-NEXT: kmovw %eax, %k1
9190 ; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
9193 ; X64-LABEL: test_mm512_mask_min_round_ps:
9194 ; X64: # %bb.0: # %entry
9195 ; X64-NEXT: kmovw %edi, %k1
9196 ; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
9199 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9200 %1 = bitcast i16 %__U to <16 x i1>
9201 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
9205 declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32)
9207 define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
9208 ; X86-LABEL: test_mm512_maskz_min_round_ps:
9209 ; X86: # %bb.0: # %entry
9210 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9211 ; X86-NEXT: kmovw %eax, %k1
9212 ; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
9215 ; X64-LABEL: test_mm512_maskz_min_round_ps:
9216 ; X64: # %bb.0: # %entry
9217 ; X64-NEXT: kmovw %edi, %k1
9218 ; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
9221 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9222 %1 = bitcast i16 %__U to <16 x i1>
9223 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
9227 define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) {
9228 ; CHECK-LABEL: test_mm512_min_round_ps:
9229 ; CHECK: # %bb.0: # %entry
9230 ; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
9231 ; CHECK-NEXT: ret{{[l|q]}}
9233 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4)
9237 define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
9238 ; CHECK-LABEL: test_mm512_sqrt_pd:
9239 ; CHECK: # %bb.0: # %entry
9240 ; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
9241 ; CHECK-NEXT: ret{{[l|q]}}
9243 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
9247 define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
9248 ; X86-LABEL: test_mm512_mask_sqrt_pd:
9249 ; X86: # %bb.0: # %entry
9250 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9251 ; X86-NEXT: kmovw %eax, %k1
9252 ; X86-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
9255 ; X64-LABEL: test_mm512_mask_sqrt_pd:
9256 ; X64: # %bb.0: # %entry
9257 ; X64-NEXT: kmovw %edi, %k1
9258 ; X64-NEXT: vsqrtpd %zmm1, %zmm0 {%k1}
9261 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
9262 %1 = bitcast i8 %__U to <8 x i1>
9263 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
9267 define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
9268 ; X86-LABEL: test_mm512_maskz_sqrt_pd:
9269 ; X86: # %bb.0: # %entry
9270 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9271 ; X86-NEXT: kmovw %eax, %k1
9272 ; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
9275 ; X64-LABEL: test_mm512_maskz_sqrt_pd:
9276 ; X64: # %bb.0: # %entry
9277 ; X64-NEXT: kmovw %edi, %k1
9278 ; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z}
9281 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
9282 %1 = bitcast i8 %__U to <8 x i1>
9283 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
9287 define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
9288 ; X86-LABEL: test_mm512_mask_sqrt_round_pd:
9289 ; X86: # %bb.0: # %entry
9290 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9291 ; X86-NEXT: kmovw %eax, %k1
9292 ; X86-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
9295 ; X64-LABEL: test_mm512_mask_sqrt_round_pd:
9296 ; X64: # %bb.0: # %entry
9297 ; X64-NEXT: kmovw %edi, %k1
9298 ; X64-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
9301 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
9302 %1 = bitcast i8 %__U to <8 x i1>
9303 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
9307 declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)
9309 define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
9310 ; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
9311 ; X86: # %bb.0: # %entry
9312 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9313 ; X86-NEXT: kmovw %eax, %k1
9314 ; X86-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
9317 ; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
9318 ; X64: # %bb.0: # %entry
9319 ; X64-NEXT: kmovw %edi, %k1
9320 ; X64-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
9323 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
9324 %1 = bitcast i8 %__U to <8 x i1>
9325 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
9329 define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
9330 ; CHECK-LABEL: test_mm512_sqrt_round_pd:
9331 ; CHECK: # %bb.0: # %entry
9332 ; CHECK-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0
9333 ; CHECK-NEXT: ret{{[l|q]}}
9335 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
9339 define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
9340 ; CHECK-LABEL: test_mm512_sqrt_ps:
9341 ; CHECK: # %bb.0: # %entry
9342 ; CHECK-NEXT: vsqrtps %zmm0, %zmm0
9343 ; CHECK-NEXT: ret{{[l|q]}}
9345 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
9349 define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
9350 ; X86-LABEL: test_mm512_mask_sqrt_ps:
9351 ; X86: # %bb.0: # %entry
9352 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9353 ; X86-NEXT: kmovw %eax, %k1
9354 ; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
9357 ; X64-LABEL: test_mm512_mask_sqrt_ps:
9358 ; X64: # %bb.0: # %entry
9359 ; X64-NEXT: kmovw %edi, %k1
9360 ; X64-NEXT: vsqrtps %zmm1, %zmm0 {%k1}
9363 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
9364 %1 = bitcast i16 %__U to <16 x i1>
9365 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
9369 define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
9370 ; X86-LABEL: test_mm512_maskz_sqrt_ps:
9371 ; X86: # %bb.0: # %entry
9372 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9373 ; X86-NEXT: kmovw %eax, %k1
9374 ; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
9377 ; X64-LABEL: test_mm512_maskz_sqrt_ps:
9378 ; X64: # %bb.0: # %entry
9379 ; X64-NEXT: kmovw %edi, %k1
9380 ; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z}
9383 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
9384 %1 = bitcast i16 %__U to <16 x i1>
9385 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
9389 define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
9390 ; X86-LABEL: test_mm512_mask_sqrt_round_ps:
9391 ; X86: # %bb.0: # %entry
9392 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9393 ; X86-NEXT: kmovw %eax, %k1
9394 ; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
9397 ; X64-LABEL: test_mm512_mask_sqrt_round_ps:
9398 ; X64: # %bb.0: # %entry
9399 ; X64-NEXT: kmovw %edi, %k1
9400 ; X64-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
9403 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
9404 %1 = bitcast i16 %__U to <16 x i1>
9405 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
9409 declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)
9411 define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
9412 ; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
9413 ; X86: # %bb.0: # %entry
9414 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9415 ; X86-NEXT: kmovw %eax, %k1
9416 ; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
9419 ; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
9420 ; X64: # %bb.0: # %entry
9421 ; X64-NEXT: kmovw %edi, %k1
9422 ; X64-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
9425 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
9426 %1 = bitcast i16 %__U to <16 x i1>
9427 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
9431 define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
9432 ; CHECK-LABEL: test_mm512_sqrt_round_ps:
9433 ; CHECK: # %bb.0: # %entry
9434 ; CHECK-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0
9435 ; CHECK-NEXT: ret{{[l|q]}}
9437 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
9441 define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 {
9442 ; CHECK-LABEL: test_mm512_rol_epi32:
9443 ; CHECK: # %bb.0: # %entry
9444 ; CHECK-NEXT: vprold $5, %zmm0, %zmm0
9445 ; CHECK-NEXT: ret{{[l|q]}}
9447 %0 = bitcast <8 x i64> %__A to <16 x i32>
9448 %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
9449 %2 = bitcast <16 x i32> %1 to <8 x i64>
9453 define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
9454 ; X86-LABEL: test_mm512_mask_rol_epi32:
9455 ; X86: # %bb.0: # %entry
9456 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9457 ; X86-NEXT: kmovw %eax, %k1
9458 ; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
9461 ; X64-LABEL: test_mm512_mask_rol_epi32:
9462 ; X64: # %bb.0: # %entry
9463 ; X64-NEXT: kmovw %edi, %k1
9464 ; X64-NEXT: vprold $5, %zmm1, %zmm0 {%k1}
9467 %0 = bitcast <8 x i64> %__A to <16 x i32>
9468 %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
9469 %2 = bitcast <8 x i64> %__W to <16 x i32>
9470 %3 = bitcast i16 %__U to <16 x i1>
9471 %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
9472 %5 = bitcast <16 x i32> %4 to <8 x i64>
9476 define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) {
9477 ; X86-LABEL: test_mm512_maskz_rol_epi32:
9478 ; X86: # %bb.0: # %entry
9479 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9480 ; X86-NEXT: kmovw %eax, %k1
9481 ; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
9484 ; X64-LABEL: test_mm512_maskz_rol_epi32:
9485 ; X64: # %bb.0: # %entry
9486 ; X64-NEXT: kmovw %edi, %k1
9487 ; X64-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z}
9490 %0 = bitcast <8 x i64> %__A to <16 x i32>
9491 %1 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
9492 %2 = bitcast i16 %__U to <16 x i1>
9493 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
9494 %4 = bitcast <16 x i32> %3 to <8 x i64>
9498 define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) {
9499 ; CHECK-LABEL: test_mm512_rol_epi64:
9500 ; CHECK: # %bb.0: # %entry
9501 ; CHECK-NEXT: vprolq $5, %zmm0, %zmm0
9502 ; CHECK-NEXT: ret{{[l|q]}}
9504 %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
9508 define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
9509 ; X86-LABEL: test_mm512_mask_rol_epi64:
9510 ; X86: # %bb.0: # %entry
9511 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9512 ; X86-NEXT: kmovw %eax, %k1
9513 ; X86-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
9516 ; X64-LABEL: test_mm512_mask_rol_epi64:
9517 ; X64: # %bb.0: # %entry
9518 ; X64-NEXT: kmovw %edi, %k1
9519 ; X64-NEXT: vprolq $5, %zmm1, %zmm0 {%k1}
9522 %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
9523 %1 = bitcast i8 %__U to <8 x i1>
9524 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
9528 define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
9529 ; X86-LABEL: test_mm512_maskz_rol_epi64:
9530 ; X86: # %bb.0: # %entry
9531 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9532 ; X86-NEXT: kmovw %eax, %k1
9533 ; X86-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
9536 ; X64-LABEL: test_mm512_maskz_rol_epi64:
9537 ; X64: # %bb.0: # %entry
9538 ; X64-NEXT: kmovw %edi, %k1
9539 ; X64-NEXT: vprolq $5, %zmm0, %zmm0 {%k1} {z}
9542 %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
9543 %1 = bitcast i8 %__U to <8 x i1>
9544 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
9548 define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
9549 ; CHECK-LABEL: test_mm512_rolv_epi32:
9550 ; CHECK: # %bb.0: # %entry
9551 ; CHECK-NEXT: vprolvd %zmm1, %zmm0, %zmm0
9552 ; CHECK-NEXT: ret{{[l|q]}}
9554 %0 = bitcast <8 x i64> %__A to <16 x i32>
9555 %1 = bitcast <8 x i64> %__B to <16 x i32>
9556 %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
9557 %3 = bitcast <16 x i32> %2 to <8 x i64>
9561 define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9562 ; X86-LABEL: test_mm512_mask_rolv_epi32:
9563 ; X86: # %bb.0: # %entry
9564 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9565 ; X86-NEXT: kmovw %eax, %k1
9566 ; X86-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
9569 ; X64-LABEL: test_mm512_mask_rolv_epi32:
9570 ; X64: # %bb.0: # %entry
9571 ; X64-NEXT: kmovw %edi, %k1
9572 ; X64-NEXT: vprolvd %zmm2, %zmm1, %zmm0 {%k1}
9575 %0 = bitcast <8 x i64> %__A to <16 x i32>
9576 %1 = bitcast <8 x i64> %__B to <16 x i32>
9577 %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
9578 %3 = bitcast <8 x i64> %__W to <16 x i32>
9579 %4 = bitcast i16 %__U to <16 x i1>
9580 %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
9581 %6 = bitcast <16 x i32> %5 to <8 x i64>
9585 define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9586 ; X86-LABEL: test_mm512_maskz_rolv_epi32:
9587 ; X86: # %bb.0: # %entry
9588 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9589 ; X86-NEXT: kmovw %eax, %k1
9590 ; X86-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
9593 ; X64-LABEL: test_mm512_maskz_rolv_epi32:
9594 ; X64: # %bb.0: # %entry
9595 ; X64-NEXT: kmovw %edi, %k1
9596 ; X64-NEXT: vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
9599 %0 = bitcast <8 x i64> %__A to <16 x i32>
9600 %1 = bitcast <8 x i64> %__B to <16 x i32>
9601 %2 = tail call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
9602 %3 = bitcast i16 %__U to <16 x i1>
9603 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
9604 %5 = bitcast <16 x i32> %4 to <8 x i64>
9608 define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
9609 ; CHECK-LABEL: test_mm512_rolv_epi64:
9610 ; CHECK: # %bb.0: # %entry
9611 ; CHECK-NEXT: vprolvq %zmm1, %zmm0, %zmm0
9612 ; CHECK-NEXT: ret{{[l|q]}}
9614 %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
9618 define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9619 ; X86-LABEL: test_mm512_mask_rolv_epi64:
9620 ; X86: # %bb.0: # %entry
9621 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9622 ; X86-NEXT: kmovw %eax, %k1
9623 ; X86-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
9626 ; X64-LABEL: test_mm512_mask_rolv_epi64:
9627 ; X64: # %bb.0: # %entry
9628 ; X64-NEXT: kmovw %edi, %k1
9629 ; X64-NEXT: vprolvq %zmm2, %zmm1, %zmm0 {%k1}
9632 %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
9633 %1 = bitcast i8 %__U to <8 x i1>
9634 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
9638 define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9639 ; X86-LABEL: test_mm512_maskz_rolv_epi64:
9640 ; X86: # %bb.0: # %entry
9641 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9642 ; X86-NEXT: kmovw %eax, %k1
9643 ; X86-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
9646 ; X64-LABEL: test_mm512_maskz_rolv_epi64:
9647 ; X64: # %bb.0: # %entry
9648 ; X64-NEXT: kmovw %edi, %k1
9649 ; X64-NEXT: vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
9652 %0 = tail call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
9653 %1 = bitcast i8 %__U to <8 x i1>
9654 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
9658 define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
9659 ; CHECK-LABEL: test_mm512_ror_epi32:
9660 ; CHECK: # %bb.0: # %entry
9661 ; CHECK-NEXT: vprord $5, %zmm0, %zmm0
9662 ; CHECK-NEXT: ret{{[l|q]}}
9664 %0 = bitcast <8 x i64> %__A to <16 x i32>
9665 %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
9666 %2 = bitcast <16 x i32> %1 to <8 x i64>
9671 define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
9672 ; X86-LABEL: test_mm512_mask_ror_epi32:
9673 ; X86: # %bb.0: # %entry
9674 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9675 ; X86-NEXT: kmovw %eax, %k1
9676 ; X86-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
9679 ; X64-LABEL: test_mm512_mask_ror_epi32:
9680 ; X64: # %bb.0: # %entry
9681 ; X64-NEXT: kmovw %edi, %k1
9682 ; X64-NEXT: vprord $5, %zmm1, %zmm0 {%k1}
9685 %0 = bitcast <8 x i64> %__A to <16 x i32>
9686 %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
9687 %2 = bitcast <8 x i64> %__W to <16 x i32>
9688 %3 = bitcast i16 %__U to <16 x i1>
9689 %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
9690 %5 = bitcast <16 x i32> %4 to <8 x i64>
9694 define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
9695 ; X86-LABEL: test_mm512_maskz_ror_epi32:
9696 ; X86: # %bb.0: # %entry
9697 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9698 ; X86-NEXT: kmovw %eax, %k1
9699 ; X86-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
9702 ; X64-LABEL: test_mm512_maskz_ror_epi32:
9703 ; X64: # %bb.0: # %entry
9704 ; X64-NEXT: kmovw %edi, %k1
9705 ; X64-NEXT: vprord $5, %zmm0, %zmm0 {%k1} {z}
9708 %0 = bitcast <8 x i64> %__A to <16 x i32>
9709 %1 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>)
9710 %2 = bitcast i16 %__U to <16 x i1>
9711 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
9712 %4 = bitcast <16 x i32> %3 to <8 x i64>
9716 define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
9717 ; CHECK-LABEL: test_mm512_ror_epi64:
9718 ; CHECK: # %bb.0: # %entry
9719 ; CHECK-NEXT: vprorq $5, %zmm0, %zmm0
9720 ; CHECK-NEXT: ret{{[l|q]}}
9722 %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
9726 define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
9727 ; X86-LABEL: test_mm512_mask_ror_epi64:
9728 ; X86: # %bb.0: # %entry
9729 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9730 ; X86-NEXT: kmovw %eax, %k1
9731 ; X86-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
9734 ; X64-LABEL: test_mm512_mask_ror_epi64:
9735 ; X64: # %bb.0: # %entry
9736 ; X64-NEXT: kmovw %edi, %k1
9737 ; X64-NEXT: vprorq $5, %zmm1, %zmm0 {%k1}
9740 %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
9741 %1 = bitcast i8 %__U to <8 x i1>
9742 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
9746 define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
9747 ; X86-LABEL: test_mm512_maskz_ror_epi64:
9748 ; X86: # %bb.0: # %entry
9749 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9750 ; X86-NEXT: kmovw %eax, %k1
9751 ; X86-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
9754 ; X64-LABEL: test_mm512_maskz_ror_epi64:
9755 ; X64: # %bb.0: # %entry
9756 ; X64-NEXT: kmovw %edi, %k1
9757 ; X64-NEXT: vprorq $5, %zmm0, %zmm0 {%k1} {z}
9760 %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>)
9761 %1 = bitcast i8 %__U to <8 x i1>
9762 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
9766 define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
9767 ; CHECK-LABEL: test_mm512_rorv_epi32:
9768 ; CHECK: # %bb.0: # %entry
9769 ; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm0
9770 ; CHECK-NEXT: ret{{[l|q]}}
9772 %0 = bitcast <8 x i64> %__A to <16 x i32>
9773 %1 = bitcast <8 x i64> %__B to <16 x i32>
9774 %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
9775 %3 = bitcast <16 x i32> %2 to <8 x i64>
9779 define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9780 ; X86-LABEL: test_mm512_mask_rorv_epi32:
9781 ; X86: # %bb.0: # %entry
9782 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9783 ; X86-NEXT: kmovw %eax, %k1
9784 ; X86-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
9787 ; X64-LABEL: test_mm512_mask_rorv_epi32:
9788 ; X64: # %bb.0: # %entry
9789 ; X64-NEXT: kmovw %edi, %k1
9790 ; X64-NEXT: vprorvd %zmm2, %zmm1, %zmm0 {%k1}
9793 %0 = bitcast <8 x i64> %__A to <16 x i32>
9794 %1 = bitcast <8 x i64> %__B to <16 x i32>
9795 %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
9796 %3 = bitcast <8 x i64> %__W to <16 x i32>
9797 %4 = bitcast i16 %__U to <16 x i1>
9798 %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
9799 %6 = bitcast <16 x i32> %5 to <8 x i64>
9803 define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9804 ; X86-LABEL: test_mm512_maskz_rorv_epi32:
9805 ; X86: # %bb.0: # %entry
9806 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
9807 ; X86-NEXT: kmovw %eax, %k1
9808 ; X86-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
9811 ; X64-LABEL: test_mm512_maskz_rorv_epi32:
9812 ; X64: # %bb.0: # %entry
9813 ; X64-NEXT: kmovw %edi, %k1
9814 ; X64-NEXT: vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
9817 %0 = bitcast <8 x i64> %__A to <16 x i32>
9818 %1 = bitcast <8 x i64> %__B to <16 x i32>
9819 %2 = tail call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %0, <16 x i32> %0, <16 x i32> %1)
9820 %3 = bitcast i16 %__U to <16 x i1>
9821 %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
9822 %5 = bitcast <16 x i32> %4 to <8 x i64>
9826 define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
9827 ; CHECK-LABEL: test_mm512_rorv_epi64:
9828 ; CHECK: # %bb.0: # %entry
9829 ; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm0
9830 ; CHECK-NEXT: ret{{[l|q]}}
9832 %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
9836 define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9837 ; X86-LABEL: test_mm512_mask_rorv_epi64:
9838 ; X86: # %bb.0: # %entry
9839 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9840 ; X86-NEXT: kmovw %eax, %k1
9841 ; X86-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
9844 ; X64-LABEL: test_mm512_mask_rorv_epi64:
9845 ; X64: # %bb.0: # %entry
9846 ; X64-NEXT: kmovw %edi, %k1
9847 ; X64-NEXT: vprorvq %zmm2, %zmm1, %zmm0 {%k1}
9850 %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
9851 %1 = bitcast i8 %__U to <8 x i1>
9852 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
9856 define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
9857 ; X86-LABEL: test_mm512_maskz_rorv_epi64:
9858 ; X86: # %bb.0: # %entry
9859 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
9860 ; X86-NEXT: kmovw %eax, %k1
9861 ; X86-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
9864 ; X64-LABEL: test_mm512_maskz_rorv_epi64:
9865 ; X64: # %bb.0: # %entry
9866 ; X64-NEXT: kmovw %edi, %k1
9867 ; X64-NEXT: vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
9870 %0 = tail call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %__A, <8 x i64> %__A, <8 x i64> %__B)
9871 %1 = bitcast i8 %__U to <8 x i1>
9872 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
9876 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
9877 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
9878 declare float @llvm.fma.f32(float, float, float) #9
9879 declare double @llvm.fma.f64(double, double, double) #9
9880 declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
9881 declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
9882 declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
9883 declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
9884 declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
9885 declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
9886 declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
9887 declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
9888 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
9889 declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
9890 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
9891 declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
9892 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
9893 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
9894 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
9895 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
9896 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
9897 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
9899 declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
9900 declare <8 x i64> @llvm.fshl.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)
9901 declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>)
9902 declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)