1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=ALL,SCALAR
3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE2-ONLY
4 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE3
5 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSSE3-ONLY
6 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE41
7 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE42
8 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX2-ONLY
10 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512F
11 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512BW
13 define void @vec32_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
14 ; SCALAR-LABEL: vec32_v2i8:
16 ; SCALAR-NEXT: movzbl (%rdi), %eax
17 ; SCALAR-NEXT: movzbl 1(%rdi), %ecx
18 ; SCALAR-NEXT: notb %al
19 ; SCALAR-NEXT: notb %cl
20 ; SCALAR-NEXT: movb %cl, 1(%rsi)
21 ; SCALAR-NEXT: movb %al, (%rsi)
22 ; SCALAR-NEXT: movb %cl, 1(%rdx)
23 ; SCALAR-NEXT: movb %al, (%rdx)
24 ; SCALAR-NEXT: movb %cl, 3(%rdx)
25 ; SCALAR-NEXT: movb %al, 2(%rdx)
28 ; SSE-LABEL: vec32_v2i8:
30 ; SSE-NEXT: movl (%rdi), %eax
32 ; SSE-NEXT: movw %ax, (%rsi)
33 ; SSE-NEXT: movw %ax, (%rdx)
34 ; SSE-NEXT: movw %ax, 2(%rdx)
36 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
37 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
38 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
39 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
40 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
41 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
42 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
46 define void @vec64_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
47 ; SCALAR-LABEL: vec64_v2i8:
49 ; SCALAR-NEXT: movzbl (%rdi), %eax
50 ; SCALAR-NEXT: movzbl 1(%rdi), %ecx
51 ; SCALAR-NEXT: notb %al
52 ; SCALAR-NEXT: notb %cl
53 ; SCALAR-NEXT: movb %cl, 1(%rsi)
54 ; SCALAR-NEXT: movb %al, (%rsi)
55 ; SCALAR-NEXT: movb %cl, 1(%rdx)
56 ; SCALAR-NEXT: movb %al, (%rdx)
57 ; SCALAR-NEXT: movb %cl, 3(%rdx)
58 ; SCALAR-NEXT: movb %al, 2(%rdx)
59 ; SCALAR-NEXT: movb %cl, 5(%rdx)
60 ; SCALAR-NEXT: movb %al, 4(%rdx)
61 ; SCALAR-NEXT: movb %cl, 7(%rdx)
62 ; SCALAR-NEXT: movb %al, 6(%rdx)
65 ; SSE-LABEL: vec64_v2i8:
67 ; SSE-NEXT: movl (%rdi), %eax
69 ; SSE-NEXT: movw %ax, (%rsi)
70 ; SSE-NEXT: movw %ax, (%rdx)
71 ; SSE-NEXT: movw %ax, 2(%rdx)
72 ; SSE-NEXT: movw %ax, 4(%rdx)
73 ; SSE-NEXT: movw %ax, 6(%rdx)
75 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
76 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
77 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
78 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
79 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
80 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
81 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
82 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
83 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
84 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
85 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
89 define void @vec64_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
90 ; SCALAR-LABEL: vec64_v2i16:
92 ; SCALAR-NEXT: movzwl 2(%rdi), %eax
93 ; SCALAR-NEXT: movl (%rdi), %ecx
94 ; SCALAR-NEXT: notl %ecx
95 ; SCALAR-NEXT: notl %eax
96 ; SCALAR-NEXT: movw %ax, 2(%rsi)
97 ; SCALAR-NEXT: movw %cx, (%rsi)
98 ; SCALAR-NEXT: movw %ax, 2(%rdx)
99 ; SCALAR-NEXT: movw %cx, (%rdx)
100 ; SCALAR-NEXT: movw %ax, 6(%rdx)
101 ; SCALAR-NEXT: movw %cx, 4(%rdx)
104 ; SSE-LABEL: vec64_v2i16:
106 ; SSE-NEXT: movl (%rdi), %eax
107 ; SSE-NEXT: notl %eax
108 ; SSE-NEXT: movl %eax, (%rsi)
109 ; SSE-NEXT: movl %eax, (%rdx)
110 ; SSE-NEXT: movl %eax, 4(%rdx)
112 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
113 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
114 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
115 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
116 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
117 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
118 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
122 define void @vec64_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
123 ; SCALAR-LABEL: vec64_v4i8:
125 ; SCALAR-NEXT: movzbl 3(%rdi), %eax
126 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
127 ; SCALAR-NEXT: movzbl (%rdi), %r8d
128 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
129 ; SCALAR-NEXT: notb %r8b
130 ; SCALAR-NEXT: notb %dil
131 ; SCALAR-NEXT: notb %cl
132 ; SCALAR-NEXT: notb %al
133 ; SCALAR-NEXT: movb %al, 3(%rsi)
134 ; SCALAR-NEXT: movb %cl, 2(%rsi)
135 ; SCALAR-NEXT: movb %dil, 1(%rsi)
136 ; SCALAR-NEXT: movb %r8b, (%rsi)
137 ; SCALAR-NEXT: movb %al, 3(%rdx)
138 ; SCALAR-NEXT: movb %cl, 2(%rdx)
139 ; SCALAR-NEXT: movb %dil, 1(%rdx)
140 ; SCALAR-NEXT: movb %r8b, (%rdx)
141 ; SCALAR-NEXT: movb %al, 7(%rdx)
142 ; SCALAR-NEXT: movb %cl, 6(%rdx)
143 ; SCALAR-NEXT: movb %dil, 5(%rdx)
144 ; SCALAR-NEXT: movb %r8b, 4(%rdx)
147 ; SSE-LABEL: vec64_v4i8:
149 ; SSE-NEXT: movl (%rdi), %eax
150 ; SSE-NEXT: notl %eax
151 ; SSE-NEXT: movl %eax, (%rsi)
152 ; SSE-NEXT: movl %eax, (%rdx)
153 ; SSE-NEXT: movl %eax, 4(%rdx)
155 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
156 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
157 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
158 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
159 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
160 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
161 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
165 define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
166 ; SCALAR-LABEL: vec128_v2i8:
168 ; SCALAR-NEXT: movzbl (%rdi), %eax
169 ; SCALAR-NEXT: movzbl 1(%rdi), %ecx
170 ; SCALAR-NEXT: notb %al
171 ; SCALAR-NEXT: notb %cl
172 ; SCALAR-NEXT: movb %cl, 1(%rsi)
173 ; SCALAR-NEXT: movb %al, (%rsi)
174 ; SCALAR-NEXT: movb %cl, 1(%rdx)
175 ; SCALAR-NEXT: movb %al, (%rdx)
176 ; SCALAR-NEXT: movb %cl, 3(%rdx)
177 ; SCALAR-NEXT: movb %al, 2(%rdx)
178 ; SCALAR-NEXT: movb %cl, 5(%rdx)
179 ; SCALAR-NEXT: movb %al, 4(%rdx)
180 ; SCALAR-NEXT: movb %cl, 7(%rdx)
181 ; SCALAR-NEXT: movb %al, 6(%rdx)
182 ; SCALAR-NEXT: movb %cl, 9(%rdx)
183 ; SCALAR-NEXT: movb %al, 8(%rdx)
184 ; SCALAR-NEXT: movb %cl, 11(%rdx)
185 ; SCALAR-NEXT: movb %al, 10(%rdx)
186 ; SCALAR-NEXT: movb %cl, 13(%rdx)
187 ; SCALAR-NEXT: movb %al, 12(%rdx)
188 ; SCALAR-NEXT: movb %cl, 15(%rdx)
189 ; SCALAR-NEXT: movb %al, 14(%rdx)
192 ; SSE2-ONLY-LABEL: vec128_v2i8:
193 ; SSE2-ONLY: # %bb.0:
194 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
195 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
196 ; SSE2-ONLY-NEXT: movd %xmm0, %eax
197 ; SSE2-ONLY-NEXT: movw %ax, (%rsi)
198 ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
199 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
200 ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
201 ; SSE2-ONLY-NEXT: retq
203 ; SSE3-LABEL: vec128_v2i8:
205 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
206 ; SSE3-NEXT: pxor (%rdi), %xmm0
207 ; SSE3-NEXT: movd %xmm0, %eax
208 ; SSE3-NEXT: movw %ax, (%rsi)
209 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
210 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
211 ; SSE3-NEXT: movdqa %xmm0, (%rdx)
214 ; SSSE3-ONLY-LABEL: vec128_v2i8:
215 ; SSSE3-ONLY: # %bb.0:
216 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
217 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
218 ; SSSE3-ONLY-NEXT: movd %xmm0, %eax
219 ; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
220 ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
221 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
222 ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
223 ; SSSE3-ONLY-NEXT: retq
225 ; SSE41-LABEL: vec128_v2i8:
227 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
228 ; SSE41-NEXT: pxor (%rdi), %xmm0
229 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
230 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
231 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
232 ; SSE41-NEXT: movdqa %xmm0, (%rdx)
235 ; SSE42-LABEL: vec128_v2i8:
237 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
238 ; SSE42-NEXT: pxor (%rdi), %xmm0
239 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
240 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
241 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
242 ; SSE42-NEXT: movdqa %xmm0, (%rdx)
245 ; AVX1-LABEL: vec128_v2i8:
247 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
248 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
249 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
250 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
251 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
252 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
255 ; AVX2-LABEL: vec128_v2i8:
257 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
258 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
259 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi)
260 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
261 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
263 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
264 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
265 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
266 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
267 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
268 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
269 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
270 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
271 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
272 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
273 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
274 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
275 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
276 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
277 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
278 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
279 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
280 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
281 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
285 define void @vec128_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
286 ; SCALAR-LABEL: vec128_v2i16:
288 ; SCALAR-NEXT: movzwl 2(%rdi), %eax
289 ; SCALAR-NEXT: movl (%rdi), %ecx
290 ; SCALAR-NEXT: notl %ecx
291 ; SCALAR-NEXT: notl %eax
292 ; SCALAR-NEXT: movw %ax, 2(%rsi)
293 ; SCALAR-NEXT: movw %cx, (%rsi)
294 ; SCALAR-NEXT: movw %ax, 2(%rdx)
295 ; SCALAR-NEXT: movw %cx, (%rdx)
296 ; SCALAR-NEXT: movw %ax, 6(%rdx)
297 ; SCALAR-NEXT: movw %cx, 4(%rdx)
298 ; SCALAR-NEXT: movw %ax, 10(%rdx)
299 ; SCALAR-NEXT: movw %cx, 8(%rdx)
300 ; SCALAR-NEXT: movw %ax, 14(%rdx)
301 ; SCALAR-NEXT: movw %cx, 12(%rdx)
304 ; SSE2-LABEL: vec128_v2i16:
306 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
307 ; SSE2-NEXT: pxor (%rdi), %xmm0
308 ; SSE2-NEXT: movd %xmm0, (%rsi)
309 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
310 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
313 ; AVX1-LABEL: vec128_v2i16:
315 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
316 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
317 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
318 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
319 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
322 ; AVX2-LABEL: vec128_v2i16:
324 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
325 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
326 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
327 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
328 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
330 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
331 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
332 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
333 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
334 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
335 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
336 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
337 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
338 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
339 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
340 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
344 define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
345 ; SCALAR-LABEL: vec128_v2i32:
347 ; SCALAR-NEXT: movl (%rdi), %eax
348 ; SCALAR-NEXT: movl 4(%rdi), %ecx
349 ; SCALAR-NEXT: notl %eax
350 ; SCALAR-NEXT: notl %ecx
351 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
352 ; SCALAR-NEXT: movl %eax, (%rsi)
353 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
354 ; SCALAR-NEXT: movl %eax, (%rdx)
355 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
356 ; SCALAR-NEXT: movl %eax, 8(%rdx)
359 ; SSE2-LABEL: vec128_v2i32:
361 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
362 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
363 ; SSE2-NEXT: pxor %xmm0, %xmm1
364 ; SSE2-NEXT: movq %xmm1, (%rsi)
365 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
366 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
369 ; AVX1-LABEL: vec128_v2i32:
371 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
372 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
373 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
374 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
375 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
376 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
379 ; AVX2-ONLY-LABEL: vec128_v2i32:
380 ; AVX2-ONLY: # %bb.0:
381 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
382 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
383 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
384 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
385 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
386 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
387 ; AVX2-ONLY-NEXT: retq
389 ; AVX512-LABEL: vec128_v2i32:
391 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
392 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
393 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
394 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
395 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
397 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
398 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
399 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
400 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
401 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
402 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
403 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
407 define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
408 ; SCALAR-LABEL: vec128_v2f32:
410 ; SCALAR-NEXT: movl (%rdi), %eax
411 ; SCALAR-NEXT: movl 4(%rdi), %ecx
412 ; SCALAR-NEXT: notl %eax
413 ; SCALAR-NEXT: notl %ecx
414 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
415 ; SCALAR-NEXT: movl %eax, (%rsi)
416 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
417 ; SCALAR-NEXT: movl %eax, (%rdx)
418 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
419 ; SCALAR-NEXT: movl %eax, 8(%rdx)
422 ; SSE2-LABEL: vec128_v2f32:
424 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
425 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
426 ; SSE2-NEXT: pxor %xmm0, %xmm1
427 ; SSE2-NEXT: movq %xmm1, (%rsi)
428 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
429 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
432 ; AVX1-LABEL: vec128_v2f32:
434 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
435 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
436 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
437 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
438 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
439 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
442 ; AVX2-ONLY-LABEL: vec128_v2f32:
443 ; AVX2-ONLY: # %bb.0:
444 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
445 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
446 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
447 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
448 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
449 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
450 ; AVX2-ONLY-NEXT: retq
452 ; AVX512-LABEL: vec128_v2f32:
454 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
455 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
456 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
457 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
458 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
460 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
461 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
462 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
463 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
464 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
465 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
466 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
467 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
471 define void @vec128_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
472 ; SCALAR-LABEL: vec128_v4i8:
474 ; SCALAR-NEXT: movzbl 3(%rdi), %eax
475 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
476 ; SCALAR-NEXT: movzbl (%rdi), %r8d
477 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
478 ; SCALAR-NEXT: notb %r8b
479 ; SCALAR-NEXT: notb %dil
480 ; SCALAR-NEXT: notb %cl
481 ; SCALAR-NEXT: notb %al
482 ; SCALAR-NEXT: movb %al, 3(%rsi)
483 ; SCALAR-NEXT: movb %cl, 2(%rsi)
484 ; SCALAR-NEXT: movb %dil, 1(%rsi)
485 ; SCALAR-NEXT: movb %r8b, (%rsi)
486 ; SCALAR-NEXT: movb %al, 3(%rdx)
487 ; SCALAR-NEXT: movb %cl, 2(%rdx)
488 ; SCALAR-NEXT: movb %dil, 1(%rdx)
489 ; SCALAR-NEXT: movb %r8b, (%rdx)
490 ; SCALAR-NEXT: movb %al, 7(%rdx)
491 ; SCALAR-NEXT: movb %cl, 6(%rdx)
492 ; SCALAR-NEXT: movb %dil, 5(%rdx)
493 ; SCALAR-NEXT: movb %r8b, 4(%rdx)
494 ; SCALAR-NEXT: movb %al, 11(%rdx)
495 ; SCALAR-NEXT: movb %cl, 10(%rdx)
496 ; SCALAR-NEXT: movb %dil, 9(%rdx)
497 ; SCALAR-NEXT: movb %r8b, 8(%rdx)
498 ; SCALAR-NEXT: movb %al, 15(%rdx)
499 ; SCALAR-NEXT: movb %cl, 14(%rdx)
500 ; SCALAR-NEXT: movb %dil, 13(%rdx)
501 ; SCALAR-NEXT: movb %r8b, 12(%rdx)
504 ; SSE2-LABEL: vec128_v4i8:
506 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
507 ; SSE2-NEXT: pxor (%rdi), %xmm0
508 ; SSE2-NEXT: movd %xmm0, (%rsi)
509 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
510 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
513 ; AVX1-LABEL: vec128_v4i8:
515 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
516 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
517 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
518 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
519 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
522 ; AVX2-LABEL: vec128_v4i8:
524 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
525 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
526 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
527 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
528 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
530 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
531 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
532 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
533 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
534 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
535 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
536 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
537 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
538 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
539 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
540 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
544 define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
545 ; SCALAR-LABEL: vec128_v4i16:
547 ; SCALAR-NEXT: movzwl 6(%rdi), %eax
548 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
549 ; SCALAR-NEXT: movl (%rdi), %r8d
550 ; SCALAR-NEXT: movl 4(%rdi), %edi
551 ; SCALAR-NEXT: notl %r8d
552 ; SCALAR-NEXT: notl %ecx
553 ; SCALAR-NEXT: notl %edi
554 ; SCALAR-NEXT: notl %eax
555 ; SCALAR-NEXT: movw %ax, 6(%rsi)
556 ; SCALAR-NEXT: movw %di, 4(%rsi)
557 ; SCALAR-NEXT: movw %cx, 2(%rsi)
558 ; SCALAR-NEXT: movw %r8w, (%rsi)
559 ; SCALAR-NEXT: movw %ax, 6(%rdx)
560 ; SCALAR-NEXT: movw %di, 4(%rdx)
561 ; SCALAR-NEXT: movw %cx, 2(%rdx)
562 ; SCALAR-NEXT: movw %r8w, (%rdx)
563 ; SCALAR-NEXT: movw %ax, 14(%rdx)
564 ; SCALAR-NEXT: movw %di, 12(%rdx)
565 ; SCALAR-NEXT: movw %cx, 10(%rdx)
566 ; SCALAR-NEXT: movw %r8w, 8(%rdx)
569 ; SSE2-LABEL: vec128_v4i16:
571 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
572 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
573 ; SSE2-NEXT: pxor %xmm0, %xmm1
574 ; SSE2-NEXT: movq %xmm1, (%rsi)
575 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
576 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
579 ; AVX1-LABEL: vec128_v4i16:
581 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
582 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
583 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
584 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
585 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
586 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
589 ; AVX2-ONLY-LABEL: vec128_v4i16:
590 ; AVX2-ONLY: # %bb.0:
591 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
592 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
593 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
594 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
595 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
596 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
597 ; AVX2-ONLY-NEXT: retq
599 ; AVX512-LABEL: vec128_v4i16:
601 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
602 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
603 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
604 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
605 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
607 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
608 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
609 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
610 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
611 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
612 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
613 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
617 define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
618 ; SCALAR-LABEL: vec128_v8i8:
620 ; SCALAR-NEXT: pushq %rbx
621 ; SCALAR-NEXT: movzbl 7(%rdi), %ebx
622 ; SCALAR-NEXT: movzbl 6(%rdi), %r11d
623 ; SCALAR-NEXT: movzbl 5(%rdi), %r10d
624 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
625 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
626 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
627 ; SCALAR-NEXT: movzbl (%rdi), %eax
628 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
629 ; SCALAR-NEXT: notb %al
630 ; SCALAR-NEXT: notb %dil
631 ; SCALAR-NEXT: notb %cl
632 ; SCALAR-NEXT: notb %r8b
633 ; SCALAR-NEXT: notb %r9b
634 ; SCALAR-NEXT: notb %r10b
635 ; SCALAR-NEXT: notb %r11b
636 ; SCALAR-NEXT: notb %bl
637 ; SCALAR-NEXT: movb %bl, 7(%rsi)
638 ; SCALAR-NEXT: movb %r11b, 6(%rsi)
639 ; SCALAR-NEXT: movb %r10b, 5(%rsi)
640 ; SCALAR-NEXT: movb %r9b, 4(%rsi)
641 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
642 ; SCALAR-NEXT: movb %cl, 2(%rsi)
643 ; SCALAR-NEXT: movb %dil, 1(%rsi)
644 ; SCALAR-NEXT: movb %al, (%rsi)
645 ; SCALAR-NEXT: movb %bl, 7(%rdx)
646 ; SCALAR-NEXT: movb %r11b, 6(%rdx)
647 ; SCALAR-NEXT: movb %r10b, 5(%rdx)
648 ; SCALAR-NEXT: movb %r9b, 4(%rdx)
649 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
650 ; SCALAR-NEXT: movb %cl, 2(%rdx)
651 ; SCALAR-NEXT: movb %dil, 1(%rdx)
652 ; SCALAR-NEXT: movb %al, (%rdx)
653 ; SCALAR-NEXT: movb %bl, 15(%rdx)
654 ; SCALAR-NEXT: movb %r11b, 14(%rdx)
655 ; SCALAR-NEXT: movb %r10b, 13(%rdx)
656 ; SCALAR-NEXT: movb %r9b, 12(%rdx)
657 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
658 ; SCALAR-NEXT: movb %cl, 10(%rdx)
659 ; SCALAR-NEXT: movb %dil, 9(%rdx)
660 ; SCALAR-NEXT: movb %al, 8(%rdx)
661 ; SCALAR-NEXT: popq %rbx
664 ; SSE2-LABEL: vec128_v8i8:
666 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
667 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
668 ; SSE2-NEXT: pxor %xmm0, %xmm1
669 ; SSE2-NEXT: movq %xmm1, (%rsi)
670 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
671 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
674 ; AVX1-LABEL: vec128_v8i8:
676 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
677 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
678 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
679 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
680 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
681 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
684 ; AVX2-ONLY-LABEL: vec128_v8i8:
685 ; AVX2-ONLY: # %bb.0:
686 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
687 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
688 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
689 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
690 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
691 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
692 ; AVX2-ONLY-NEXT: retq
694 ; AVX512-LABEL: vec128_v8i8:
696 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
697 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
698 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
699 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
700 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
702 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
703 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
704 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
705 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
706 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
707 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
708 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
712 define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
713 ; SCALAR-LABEL: vec256_v2i8:
715 ; SCALAR-NEXT: movzbl (%rdi), %eax
716 ; SCALAR-NEXT: movzbl 1(%rdi), %ecx
717 ; SCALAR-NEXT: notb %al
718 ; SCALAR-NEXT: notb %cl
719 ; SCALAR-NEXT: movb %cl, 1(%rsi)
720 ; SCALAR-NEXT: movb %al, (%rsi)
721 ; SCALAR-NEXT: movb %cl, 1(%rdx)
722 ; SCALAR-NEXT: movb %al, (%rdx)
723 ; SCALAR-NEXT: movb %cl, 3(%rdx)
724 ; SCALAR-NEXT: movb %al, 2(%rdx)
725 ; SCALAR-NEXT: movb %cl, 5(%rdx)
726 ; SCALAR-NEXT: movb %al, 4(%rdx)
727 ; SCALAR-NEXT: movb %cl, 7(%rdx)
728 ; SCALAR-NEXT: movb %al, 6(%rdx)
729 ; SCALAR-NEXT: movb %cl, 9(%rdx)
730 ; SCALAR-NEXT: movb %al, 8(%rdx)
731 ; SCALAR-NEXT: movb %cl, 11(%rdx)
732 ; SCALAR-NEXT: movb %al, 10(%rdx)
733 ; SCALAR-NEXT: movb %cl, 13(%rdx)
734 ; SCALAR-NEXT: movb %al, 12(%rdx)
735 ; SCALAR-NEXT: movb %cl, 15(%rdx)
736 ; SCALAR-NEXT: movb %al, 14(%rdx)
737 ; SCALAR-NEXT: movb %cl, 17(%rdx)
738 ; SCALAR-NEXT: movb %al, 16(%rdx)
739 ; SCALAR-NEXT: movb %cl, 19(%rdx)
740 ; SCALAR-NEXT: movb %al, 18(%rdx)
741 ; SCALAR-NEXT: movb %cl, 21(%rdx)
742 ; SCALAR-NEXT: movb %al, 20(%rdx)
743 ; SCALAR-NEXT: movb %cl, 23(%rdx)
744 ; SCALAR-NEXT: movb %al, 22(%rdx)
745 ; SCALAR-NEXT: movb %cl, 25(%rdx)
746 ; SCALAR-NEXT: movb %al, 24(%rdx)
747 ; SCALAR-NEXT: movb %cl, 27(%rdx)
748 ; SCALAR-NEXT: movb %al, 26(%rdx)
749 ; SCALAR-NEXT: movb %cl, 29(%rdx)
750 ; SCALAR-NEXT: movb %al, 28(%rdx)
751 ; SCALAR-NEXT: movb %cl, 31(%rdx)
752 ; SCALAR-NEXT: movb %al, 30(%rdx)
755 ; SSE2-ONLY-LABEL: vec256_v2i8:
756 ; SSE2-ONLY: # %bb.0:
757 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
758 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
759 ; SSE2-ONLY-NEXT: movd %xmm0, %eax
760 ; SSE2-ONLY-NEXT: movw %ax, (%rsi)
761 ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
762 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
763 ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
764 ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
765 ; SSE2-ONLY-NEXT: retq
767 ; SSE3-LABEL: vec256_v2i8:
769 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
770 ; SSE3-NEXT: pxor (%rdi), %xmm0
771 ; SSE3-NEXT: movd %xmm0, %eax
772 ; SSE3-NEXT: movw %ax, (%rsi)
773 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
774 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
775 ; SSE3-NEXT: movdqa %xmm0, (%rdx)
776 ; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
779 ; SSSE3-ONLY-LABEL: vec256_v2i8:
780 ; SSSE3-ONLY: # %bb.0:
781 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
782 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
783 ; SSSE3-ONLY-NEXT: movd %xmm0, %eax
784 ; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
785 ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
786 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
787 ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
788 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
789 ; SSSE3-ONLY-NEXT: retq
791 ; SSE41-LABEL: vec256_v2i8:
793 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
794 ; SSE41-NEXT: pxor (%rdi), %xmm0
795 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
796 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
797 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
798 ; SSE41-NEXT: movdqa %xmm0, (%rdx)
799 ; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
802 ; SSE42-LABEL: vec256_v2i8:
804 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
805 ; SSE42-NEXT: pxor (%rdi), %xmm0
806 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
807 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
808 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
809 ; SSE42-NEXT: movdqa %xmm0, (%rdx)
810 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
813 ; AVX1-LABEL: vec256_v2i8:
815 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
816 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
817 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
818 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
819 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
820 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
821 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
822 ; AVX1-NEXT: vzeroupper
825 ; AVX2-LABEL: vec256_v2i8:
827 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
828 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
829 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi)
830 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
831 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
832 ; AVX2-NEXT: vzeroupper
834 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
835 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
836 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
837 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
838 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
839 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
840 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
841 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
842 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
843 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
844 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
845 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
846 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
847 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
848 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
849 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
850 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
851 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
852 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
853 %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
854 store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
855 %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
856 store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
857 %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
858 store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
859 %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
860 store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
861 %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
862 store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
863 %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
864 store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
865 %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
866 store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
867 %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
868 store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
872 define void @vec256_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
873 ; SCALAR-LABEL: vec256_v2i16:
875 ; SCALAR-NEXT: movzwl 2(%rdi), %eax
876 ; SCALAR-NEXT: movl (%rdi), %ecx
877 ; SCALAR-NEXT: notl %ecx
878 ; SCALAR-NEXT: notl %eax
879 ; SCALAR-NEXT: movw %ax, 2(%rsi)
880 ; SCALAR-NEXT: movw %cx, (%rsi)
881 ; SCALAR-NEXT: movw %ax, 2(%rdx)
882 ; SCALAR-NEXT: movw %cx, (%rdx)
883 ; SCALAR-NEXT: movw %ax, 6(%rdx)
884 ; SCALAR-NEXT: movw %cx, 4(%rdx)
885 ; SCALAR-NEXT: movw %ax, 10(%rdx)
886 ; SCALAR-NEXT: movw %cx, 8(%rdx)
887 ; SCALAR-NEXT: movw %ax, 14(%rdx)
888 ; SCALAR-NEXT: movw %cx, 12(%rdx)
889 ; SCALAR-NEXT: movw %ax, 18(%rdx)
890 ; SCALAR-NEXT: movw %cx, 16(%rdx)
891 ; SCALAR-NEXT: movw %ax, 22(%rdx)
892 ; SCALAR-NEXT: movw %cx, 20(%rdx)
893 ; SCALAR-NEXT: movw %ax, 26(%rdx)
894 ; SCALAR-NEXT: movw %cx, 24(%rdx)
895 ; SCALAR-NEXT: movw %ax, 30(%rdx)
896 ; SCALAR-NEXT: movw %cx, 28(%rdx)
899 ; SSE2-LABEL: vec256_v2i16:
901 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
902 ; SSE2-NEXT: pxor (%rdi), %xmm0
903 ; SSE2-NEXT: movd %xmm0, (%rsi)
904 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
905 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
906 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
909 ; AVX1-LABEL: vec256_v2i16:
911 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
912 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
913 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
914 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
915 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
916 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
919 ; AVX2-LABEL: vec256_v2i16:
921 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
922 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
923 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
924 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
925 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
926 ; AVX2-NEXT: vzeroupper
928 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
929 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
930 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
931 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
932 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
933 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
934 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
935 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
936 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
937 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
938 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
939 %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
940 store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
941 %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
942 store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
943 %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
944 store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
945 %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
946 store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
950 define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
951 ; SCALAR-LABEL: vec256_v2i32:
953 ; SCALAR-NEXT: movl (%rdi), %eax
954 ; SCALAR-NEXT: movl 4(%rdi), %ecx
955 ; SCALAR-NEXT: notl %eax
956 ; SCALAR-NEXT: notl %ecx
957 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
958 ; SCALAR-NEXT: movl %eax, (%rsi)
959 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
960 ; SCALAR-NEXT: movl %eax, (%rdx)
961 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
962 ; SCALAR-NEXT: movl %eax, 8(%rdx)
963 ; SCALAR-NEXT: movl %ecx, 20(%rdx)
964 ; SCALAR-NEXT: movl %eax, 16(%rdx)
965 ; SCALAR-NEXT: movl %ecx, 28(%rdx)
966 ; SCALAR-NEXT: movl %eax, 24(%rdx)
969 ; SSE2-LABEL: vec256_v2i32:
971 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
972 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
973 ; SSE2-NEXT: pxor %xmm0, %xmm1
974 ; SSE2-NEXT: movq %xmm1, (%rsi)
975 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
976 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
977 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
980 ; AVX1-LABEL: vec256_v2i32:
982 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
983 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
984 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
985 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
986 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
987 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
988 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
989 ; AVX1-NEXT: vzeroupper
992 ; AVX2-ONLY-LABEL: vec256_v2i32:
993 ; AVX2-ONLY: # %bb.0:
994 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
995 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
996 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
997 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
998 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
999 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
1000 ; AVX2-ONLY-NEXT: vzeroupper
1001 ; AVX2-ONLY-NEXT: retq
1003 ; AVX512-LABEL: vec256_v2i32:
1005 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1006 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
1007 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
1008 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
1009 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
1010 ; AVX512-NEXT: vzeroupper
1012 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
1013 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
1014 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
1015 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
1016 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
1017 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
1018 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
1019 %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
1020 store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
1021 %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
1022 store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
1026 define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1027 ; SCALAR-LABEL: vec256_v2f32:
1029 ; SCALAR-NEXT: movl (%rdi), %eax
1030 ; SCALAR-NEXT: movl 4(%rdi), %ecx
1031 ; SCALAR-NEXT: notl %eax
1032 ; SCALAR-NEXT: notl %ecx
1033 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
1034 ; SCALAR-NEXT: movl %eax, (%rsi)
1035 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
1036 ; SCALAR-NEXT: movl %eax, (%rdx)
1037 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
1038 ; SCALAR-NEXT: movl %eax, 8(%rdx)
1039 ; SCALAR-NEXT: movl %ecx, 20(%rdx)
1040 ; SCALAR-NEXT: movl %eax, 16(%rdx)
1041 ; SCALAR-NEXT: movl %ecx, 28(%rdx)
1042 ; SCALAR-NEXT: movl %eax, 24(%rdx)
1045 ; SSE2-LABEL: vec256_v2f32:
1047 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1048 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
1049 ; SSE2-NEXT: pxor %xmm0, %xmm1
1050 ; SSE2-NEXT: movq %xmm1, (%rsi)
1051 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
1052 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1053 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1056 ; AVX1-LABEL: vec256_v2f32:
1058 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1059 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1060 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
1061 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
1062 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1063 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1064 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
1065 ; AVX1-NEXT: vzeroupper
1068 ; AVX2-ONLY-LABEL: vec256_v2f32:
1069 ; AVX2-ONLY: # %bb.0:
1070 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1071 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1072 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
1073 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
1074 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
1075 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
1076 ; AVX2-ONLY-NEXT: vzeroupper
1077 ; AVX2-ONLY-NEXT: retq
1079 ; AVX512-LABEL: vec256_v2f32:
1081 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1082 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
1083 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
1084 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
1085 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
1086 ; AVX512-NEXT: vzeroupper
1088 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
1089 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
1090 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
1091 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
1092 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
1093 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
1094 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
1095 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
1096 %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
1097 store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
1098 %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
1099 store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
1103 define void @vec256_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1104 ; SCALAR-LABEL: vec256_v2i64:
1106 ; SCALAR-NEXT: movq (%rdi), %rax
1107 ; SCALAR-NEXT: movq 8(%rdi), %rcx
1108 ; SCALAR-NEXT: notq %rax
1109 ; SCALAR-NEXT: notq %rcx
1110 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
1111 ; SCALAR-NEXT: movq %rax, (%rsi)
1112 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
1113 ; SCALAR-NEXT: movq %rax, (%rdx)
1114 ; SCALAR-NEXT: movq %rcx, 24(%rdx)
1115 ; SCALAR-NEXT: movq %rax, 16(%rdx)
1118 ; SSE2-LABEL: vec256_v2i64:
1120 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1121 ; SSE2-NEXT: pxor (%rdi), %xmm0
1122 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
1123 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1124 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1127 ; AVX-LABEL: vec256_v2i64:
1129 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1130 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
1131 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
1132 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
1133 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
1135 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
1136 %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
1137 store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
1138 %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
1139 store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
1140 %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
1141 store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
1145 define void @vec256_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1146 ; SCALAR-LABEL: vec256_v2f64:
1148 ; SCALAR-NEXT: movq (%rdi), %rax
1149 ; SCALAR-NEXT: movq 8(%rdi), %rcx
1150 ; SCALAR-NEXT: notq %rax
1151 ; SCALAR-NEXT: notq %rcx
1152 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
1153 ; SCALAR-NEXT: movq %rax, (%rsi)
1154 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
1155 ; SCALAR-NEXT: movq %rax, (%rdx)
1156 ; SCALAR-NEXT: movq %rcx, 24(%rdx)
1157 ; SCALAR-NEXT: movq %rax, 16(%rdx)
1160 ; SSE2-LABEL: vec256_v2f64:
1162 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1163 ; SSE2-NEXT: pxor (%rdi), %xmm0
1164 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
1165 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1166 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1169 ; AVX-LABEL: vec256_v2f64:
1171 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1172 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
1173 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
1174 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
1175 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
1177 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
1178 %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
1179 %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
1180 store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
1181 %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
1182 store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
1183 %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
1184 store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
1188 define void @vec256_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1189 ; SCALAR-LABEL: vec256_v4i8:
1191 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
1192 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
1193 ; SCALAR-NEXT: movzbl (%rdi), %eax
1194 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
1195 ; SCALAR-NEXT: notb %al
1196 ; SCALAR-NEXT: notb %dil
1197 ; SCALAR-NEXT: notb %cl
1198 ; SCALAR-NEXT: notb %r8b
1199 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
1200 ; SCALAR-NEXT: movb %cl, 2(%rsi)
1201 ; SCALAR-NEXT: movb %dil, 1(%rsi)
1202 ; SCALAR-NEXT: movb %al, (%rsi)
1203 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
1204 ; SCALAR-NEXT: movb %cl, 2(%rdx)
1205 ; SCALAR-NEXT: movb %dil, 1(%rdx)
1206 ; SCALAR-NEXT: movb %al, (%rdx)
1207 ; SCALAR-NEXT: movb %r8b, 7(%rdx)
1208 ; SCALAR-NEXT: movb %cl, 6(%rdx)
1209 ; SCALAR-NEXT: movb %dil, 5(%rdx)
1210 ; SCALAR-NEXT: movb %al, 4(%rdx)
1211 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
1212 ; SCALAR-NEXT: movb %cl, 10(%rdx)
1213 ; SCALAR-NEXT: movb %dil, 9(%rdx)
1214 ; SCALAR-NEXT: movb %al, 8(%rdx)
1215 ; SCALAR-NEXT: movb %r8b, 15(%rdx)
1216 ; SCALAR-NEXT: movb %cl, 14(%rdx)
1217 ; SCALAR-NEXT: movb %dil, 13(%rdx)
1218 ; SCALAR-NEXT: movb %al, 12(%rdx)
1219 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
1220 ; SCALAR-NEXT: movb %cl, 18(%rdx)
1221 ; SCALAR-NEXT: movb %dil, 17(%rdx)
1222 ; SCALAR-NEXT: movb %al, 16(%rdx)
1223 ; SCALAR-NEXT: movb %r8b, 23(%rdx)
1224 ; SCALAR-NEXT: movb %cl, 22(%rdx)
1225 ; SCALAR-NEXT: movb %dil, 21(%rdx)
1226 ; SCALAR-NEXT: movb %al, 20(%rdx)
1227 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
1228 ; SCALAR-NEXT: movb %cl, 26(%rdx)
1229 ; SCALAR-NEXT: movb %dil, 25(%rdx)
1230 ; SCALAR-NEXT: movb %al, 24(%rdx)
1231 ; SCALAR-NEXT: movb %r8b, 31(%rdx)
1232 ; SCALAR-NEXT: movb %cl, 30(%rdx)
1233 ; SCALAR-NEXT: movb %dil, 29(%rdx)
1234 ; SCALAR-NEXT: movb %al, 28(%rdx)
1237 ; SSE2-LABEL: vec256_v4i8:
1239 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1240 ; SSE2-NEXT: pxor (%rdi), %xmm0
1241 ; SSE2-NEXT: movd %xmm0, (%rsi)
1242 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1243 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1244 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1247 ; AVX1-LABEL: vec256_v4i8:
1249 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1250 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
1251 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
1252 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1253 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
1254 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
1257 ; AVX2-LABEL: vec256_v4i8:
1259 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1260 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
1261 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
1262 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
1263 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
1264 ; AVX2-NEXT: vzeroupper
1266 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
1267 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
1268 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
1269 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
1270 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
1271 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
1272 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
1273 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
1274 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
1275 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
1276 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
1277 %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
1278 store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
1279 %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
1280 store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
1281 %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
1282 store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
1283 %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
1284 store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
1288 define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1289 ; SCALAR-LABEL: vec256_v4i16:
1291 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
1292 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
1293 ; SCALAR-NEXT: movl (%rdi), %eax
1294 ; SCALAR-NEXT: movl 4(%rdi), %edi
1295 ; SCALAR-NEXT: notl %eax
1296 ; SCALAR-NEXT: notl %ecx
1297 ; SCALAR-NEXT: notl %edi
1298 ; SCALAR-NEXT: notl %r8d
1299 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
1300 ; SCALAR-NEXT: movw %di, 4(%rsi)
1301 ; SCALAR-NEXT: movw %cx, 2(%rsi)
1302 ; SCALAR-NEXT: movw %ax, (%rsi)
1303 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
1304 ; SCALAR-NEXT: movw %di, 4(%rdx)
1305 ; SCALAR-NEXT: movw %cx, 2(%rdx)
1306 ; SCALAR-NEXT: movw %ax, (%rdx)
1307 ; SCALAR-NEXT: movw %r8w, 14(%rdx)
1308 ; SCALAR-NEXT: movw %di, 12(%rdx)
1309 ; SCALAR-NEXT: movw %cx, 10(%rdx)
1310 ; SCALAR-NEXT: movw %ax, 8(%rdx)
1311 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
1312 ; SCALAR-NEXT: movw %di, 20(%rdx)
1313 ; SCALAR-NEXT: movw %cx, 18(%rdx)
1314 ; SCALAR-NEXT: movw %ax, 16(%rdx)
1315 ; SCALAR-NEXT: movw %r8w, 30(%rdx)
1316 ; SCALAR-NEXT: movw %di, 28(%rdx)
1317 ; SCALAR-NEXT: movw %cx, 26(%rdx)
1318 ; SCALAR-NEXT: movw %ax, 24(%rdx)
1321 ; SSE2-LABEL: vec256_v4i16:
1323 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1324 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
1325 ; SSE2-NEXT: pxor %xmm0, %xmm1
1326 ; SSE2-NEXT: movq %xmm1, (%rsi)
1327 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
1328 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1329 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1332 ; AVX1-LABEL: vec256_v4i16:
1334 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1335 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1336 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
1337 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
1338 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1339 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1340 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
1341 ; AVX1-NEXT: vzeroupper
1344 ; AVX2-ONLY-LABEL: vec256_v4i16:
1345 ; AVX2-ONLY: # %bb.0:
1346 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1347 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1348 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
1349 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
1350 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
1351 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
1352 ; AVX2-ONLY-NEXT: vzeroupper
1353 ; AVX2-ONLY-NEXT: retq
1355 ; AVX512-LABEL: vec256_v4i16:
1357 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1358 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
1359 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
1360 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
1361 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
1362 ; AVX512-NEXT: vzeroupper
1364 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
1365 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
1366 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
1367 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
1368 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
1369 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
1370 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
1371 %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
1372 store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
1373 %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
1374 store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
1378 define void @vec256_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1379 ; SCALAR-LABEL: vec256_v4i32:
1381 ; SCALAR-NEXT: movaps (%rdi), %xmm0
1382 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1383 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
1384 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
1385 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
1388 ; SSE2-LABEL: vec256_v4i32:
1390 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1391 ; SSE2-NEXT: pxor (%rdi), %xmm0
1392 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
1393 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1394 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1397 ; AVX-LABEL: vec256_v4i32:
1399 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1400 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
1401 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
1402 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
1403 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
1405 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
1406 %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
1407 store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
1408 %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
1409 store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
1410 %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
1411 store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
1415 define void @vec256_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1416 ; SCALAR-LABEL: vec256_v4f32:
1418 ; SCALAR-NEXT: movaps (%rdi), %xmm0
1419 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1420 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
1421 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
1422 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
1425 ; SSE2-LABEL: vec256_v4f32:
1427 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1428 ; SSE2-NEXT: pxor (%rdi), %xmm0
1429 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
1430 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1431 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1434 ; AVX-LABEL: vec256_v4f32:
1436 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1437 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
1438 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
1439 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
1440 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
1442 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
1443 %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
1444 %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
1445 store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
1446 %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
1447 store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
1448 %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
1449 store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
1453 define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1454 ; SCALAR-LABEL: vec256_v8i8:
1456 ; SCALAR-NEXT: pushq %rbx
1457 ; SCALAR-NEXT: movzbl 7(%rdi), %ebx
1458 ; SCALAR-NEXT: movzbl 6(%rdi), %r11d
1459 ; SCALAR-NEXT: movzbl 5(%rdi), %r10d
1460 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
1461 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
1462 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
1463 ; SCALAR-NEXT: movzbl (%rdi), %eax
1464 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
1465 ; SCALAR-NEXT: notb %al
1466 ; SCALAR-NEXT: notb %dil
1467 ; SCALAR-NEXT: notb %cl
1468 ; SCALAR-NEXT: notb %r8b
1469 ; SCALAR-NEXT: notb %r9b
1470 ; SCALAR-NEXT: notb %r10b
1471 ; SCALAR-NEXT: notb %r11b
1472 ; SCALAR-NEXT: notb %bl
1473 ; SCALAR-NEXT: movb %bl, 7(%rsi)
1474 ; SCALAR-NEXT: movb %r11b, 6(%rsi)
1475 ; SCALAR-NEXT: movb %r10b, 5(%rsi)
1476 ; SCALAR-NEXT: movb %r9b, 4(%rsi)
1477 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
1478 ; SCALAR-NEXT: movb %cl, 2(%rsi)
1479 ; SCALAR-NEXT: movb %dil, 1(%rsi)
1480 ; SCALAR-NEXT: movb %al, (%rsi)
1481 ; SCALAR-NEXT: movb %bl, 7(%rdx)
1482 ; SCALAR-NEXT: movb %r11b, 6(%rdx)
1483 ; SCALAR-NEXT: movb %r10b, 5(%rdx)
1484 ; SCALAR-NEXT: movb %r9b, 4(%rdx)
1485 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
1486 ; SCALAR-NEXT: movb %cl, 2(%rdx)
1487 ; SCALAR-NEXT: movb %dil, 1(%rdx)
1488 ; SCALAR-NEXT: movb %al, (%rdx)
1489 ; SCALAR-NEXT: movb %bl, 15(%rdx)
1490 ; SCALAR-NEXT: movb %r11b, 14(%rdx)
1491 ; SCALAR-NEXT: movb %r10b, 13(%rdx)
1492 ; SCALAR-NEXT: movb %r9b, 12(%rdx)
1493 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
1494 ; SCALAR-NEXT: movb %cl, 10(%rdx)
1495 ; SCALAR-NEXT: movb %dil, 9(%rdx)
1496 ; SCALAR-NEXT: movb %al, 8(%rdx)
1497 ; SCALAR-NEXT: movb %bl, 23(%rdx)
1498 ; SCALAR-NEXT: movb %r11b, 22(%rdx)
1499 ; SCALAR-NEXT: movb %r10b, 21(%rdx)
1500 ; SCALAR-NEXT: movb %r9b, 20(%rdx)
1501 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
1502 ; SCALAR-NEXT: movb %cl, 18(%rdx)
1503 ; SCALAR-NEXT: movb %dil, 17(%rdx)
1504 ; SCALAR-NEXT: movb %al, 16(%rdx)
1505 ; SCALAR-NEXT: movb %bl, 31(%rdx)
1506 ; SCALAR-NEXT: movb %r11b, 30(%rdx)
1507 ; SCALAR-NEXT: movb %r10b, 29(%rdx)
1508 ; SCALAR-NEXT: movb %r9b, 28(%rdx)
1509 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
1510 ; SCALAR-NEXT: movb %cl, 26(%rdx)
1511 ; SCALAR-NEXT: movb %dil, 25(%rdx)
1512 ; SCALAR-NEXT: movb %al, 24(%rdx)
1513 ; SCALAR-NEXT: popq %rbx
1516 ; SSE2-LABEL: vec256_v8i8:
1518 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1519 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
1520 ; SSE2-NEXT: pxor %xmm0, %xmm1
1521 ; SSE2-NEXT: movq %xmm1, (%rsi)
1522 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
1523 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1524 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1527 ; AVX1-LABEL: vec256_v8i8:
1529 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1530 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1531 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
1532 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
1533 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1534 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1535 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
1536 ; AVX1-NEXT: vzeroupper
1539 ; AVX2-ONLY-LABEL: vec256_v8i8:
1540 ; AVX2-ONLY: # %bb.0:
1541 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1542 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
1543 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
1544 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
1545 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
1546 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
1547 ; AVX2-ONLY-NEXT: vzeroupper
1548 ; AVX2-ONLY-NEXT: retq
1550 ; AVX512-LABEL: vec256_v8i8:
1552 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
1553 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
1554 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
1555 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
1556 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
1557 ; AVX512-NEXT: vzeroupper
1559 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
1560 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1561 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
1562 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
1563 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
1564 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
1565 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
1566 %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
1567 store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
1568 %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
1569 store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
1570 ret void
1571 }
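; <8 x i16> subvector: invert and store two back-to-back copies into the 256-bit destination.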
1573 define void @vec256_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1574 ; SCALAR-LABEL: vec256_v8i16:
1576 ; SCALAR-NEXT: pushq %rbx
1577 ; SCALAR-NEXT: movzwl 14(%rdi), %ebx
1578 ; SCALAR-NEXT: movl 12(%rdi), %r11d
1579 ; SCALAR-NEXT: movzwl 10(%rdi), %r10d
1580 ; SCALAR-NEXT: movl 8(%rdi), %r9d
1581 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
1582 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
1583 ; SCALAR-NEXT: movl (%rdi), %eax
1584 ; SCALAR-NEXT: movl 4(%rdi), %edi
1585 ; SCALAR-NEXT: notl %eax
1586 ; SCALAR-NEXT: notl %ecx
1587 ; SCALAR-NEXT: notl %edi
1588 ; SCALAR-NEXT: notl %r8d
1589 ; SCALAR-NEXT: notl %r9d
1590 ; SCALAR-NEXT: notl %r10d
1591 ; SCALAR-NEXT: notl %r11d
1592 ; SCALAR-NEXT: notl %ebx
1593 ; SCALAR-NEXT: movw %bx, 14(%rsi)
1594 ; SCALAR-NEXT: movw %r11w, 12(%rsi)
1595 ; SCALAR-NEXT: movw %r10w, 10(%rsi)
1596 ; SCALAR-NEXT: movw %r9w, 8(%rsi)
1597 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
1598 ; SCALAR-NEXT: movw %di, 4(%rsi)
1599 ; SCALAR-NEXT: movw %cx, 2(%rsi)
1600 ; SCALAR-NEXT: movw %ax, (%rsi)
1601 ; SCALAR-NEXT: movw %bx, 14(%rdx)
1602 ; SCALAR-NEXT: movw %r11w, 12(%rdx)
1603 ; SCALAR-NEXT: movw %r10w, 10(%rdx)
1604 ; SCALAR-NEXT: movw %r9w, 8(%rdx)
1605 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
1606 ; SCALAR-NEXT: movw %di, 4(%rdx)
1607 ; SCALAR-NEXT: movw %cx, 2(%rdx)
1608 ; SCALAR-NEXT: movw %ax, (%rdx)
1609 ; SCALAR-NEXT: movw %bx, 30(%rdx)
1610 ; SCALAR-NEXT: movw %r11w, 28(%rdx)
1611 ; SCALAR-NEXT: movw %r10w, 26(%rdx)
1612 ; SCALAR-NEXT: movw %r9w, 24(%rdx)
1613 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
1614 ; SCALAR-NEXT: movw %di, 20(%rdx)
1615 ; SCALAR-NEXT: movw %cx, 18(%rdx)
1616 ; SCALAR-NEXT: movw %ax, 16(%rdx)
1617 ; SCALAR-NEXT: popq %rbx
1620 ; SSE2-LABEL: vec256_v8i16:
1622 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1623 ; SSE2-NEXT: pxor (%rdi), %xmm0
1624 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
1625 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1626 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1629 ; AVX-LABEL: vec256_v8i16:
1631 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1632 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
1633 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
1634 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
1635 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
1637 %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
1638 %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
1639 store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
1640 %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
1641 store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
1642 %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
1643 store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
1644 ret void
1645 }
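; <16 x i8> subvector: invert and store two back-to-back copies into the 256-bit destination.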
1647 define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1648 ; SCALAR-LABEL: vec256_v16i8:
1650 ; SCALAR-NEXT: pushq %rbp
1651 ; SCALAR-NEXT: pushq %r15
1652 ; SCALAR-NEXT: pushq %r14
1653 ; SCALAR-NEXT: pushq %r13
1654 ; SCALAR-NEXT: pushq %r12
1655 ; SCALAR-NEXT: pushq %rbx
1656 ; SCALAR-NEXT: movzbl 15(%rdi), %eax
1657 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1658 ; SCALAR-NEXT: movzbl 14(%rdi), %eax
1659 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1660 ; SCALAR-NEXT: movzbl 13(%rdi), %eax
1661 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1662 ; SCALAR-NEXT: movzbl 12(%rdi), %r15d
1663 ; SCALAR-NEXT: movzbl 11(%rdi), %eax
1664 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1665 ; SCALAR-NEXT: movzbl 10(%rdi), %ebp
1666 ; SCALAR-NEXT: movzbl 9(%rdi), %r14d
1667 ; SCALAR-NEXT: movzbl 8(%rdi), %eax
1668 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1669 ; SCALAR-NEXT: movzbl 7(%rdi), %r12d
1670 ; SCALAR-NEXT: movzbl 6(%rdi), %r10d
1671 ; SCALAR-NEXT: movzbl 5(%rdi), %r9d
1672 ; SCALAR-NEXT: movzbl 4(%rdi), %ebx
1673 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
1674 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
1675 ; SCALAR-NEXT: movzbl (%rdi), %eax
1676 ; SCALAR-NEXT: movzbl 1(%rdi), %r13d
1677 ; SCALAR-NEXT: notb %al
1678 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1679 ; SCALAR-NEXT: notb %r13b
1680 ; SCALAR-NEXT: notb %cl
1681 ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1682 ; SCALAR-NEXT: notb %r8b
1683 ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1684 ; SCALAR-NEXT: notb %bl
1685 ; SCALAR-NEXT: notb %r9b
1686 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1687 ; SCALAR-NEXT: notb %r10b
1688 ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1689 ; SCALAR-NEXT: notb %r12b
1690 ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1691 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
1692 ; SCALAR-NEXT: notb %r11b
1693 ; SCALAR-NEXT: movl %r14d, %r10d
1694 ; SCALAR-NEXT: notb %r10b
1695 ; SCALAR-NEXT: notb %bpl
1696 ; SCALAR-NEXT: movl %ebp, %r14d
1697 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
1698 ; SCALAR-NEXT: notb %r8b
1699 ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1700 ; SCALAR-NEXT: movl %r15d, %edi
1701 ; SCALAR-NEXT: notb %dil
1702 ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1703 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
1704 ; SCALAR-NEXT: notb %r9b
1705 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
1706 ; SCALAR-NEXT: notb %bpl
1707 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
1708 ; SCALAR-NEXT: notb %r15b
1709 ; SCALAR-NEXT: movb %r15b, 15(%rsi)
1710 ; SCALAR-NEXT: movb %bpl, 14(%rsi)
1711 ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1712 ; SCALAR-NEXT: movl %r9d, %eax
1713 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1714 ; SCALAR-NEXT: movb %r9b, 13(%rsi)
1715 ; SCALAR-NEXT: movb %dil, 12(%rsi)
1716 ; SCALAR-NEXT: movb %r8b, 11(%rsi)
1717 ; SCALAR-NEXT: movb %r14b, 10(%rsi)
1718 ; SCALAR-NEXT: movb %r10b, 9(%rsi)
1719 ; SCALAR-NEXT: movl %r10d, %r8d
1720 ; SCALAR-NEXT: movb %r11b, 8(%rsi)
1721 ; SCALAR-NEXT: movl %r11d, %r9d
1722 ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
1723 ; SCALAR-NEXT: movb %r12b, 7(%rsi)
1724 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1725 ; SCALAR-NEXT: movb %cl, 6(%rsi)
1726 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
1727 ; SCALAR-NEXT: movb %dil, 5(%rsi)
1728 ; SCALAR-NEXT: movb %bl, 4(%rsi)
1729 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1730 ; SCALAR-NEXT: movb %cl, 3(%rsi)
1731 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
1732 ; SCALAR-NEXT: movb %cl, 2(%rsi)
1733 ; SCALAR-NEXT: movb %r13b, 1(%rsi)
1734 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1735 ; SCALAR-NEXT: movb %r10b, (%rsi)
1736 ; SCALAR-NEXT: movb %r15b, 15(%rdx)
1737 ; SCALAR-NEXT: movl %r15d, %r11d
1738 ; SCALAR-NEXT: movb %bpl, 14(%rdx)
1739 ; SCALAR-NEXT: movb %al, 13(%rdx)
1740 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
1741 ; SCALAR-NEXT: movb %r12b, 12(%rdx)
1742 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
1743 ; SCALAR-NEXT: movb %r15b, 11(%rdx)
1744 ; SCALAR-NEXT: movb %r14b, 10(%rdx)
1745 ; SCALAR-NEXT: movb %r8b, 9(%rdx)
1746 ; SCALAR-NEXT: movb %r9b, 8(%rdx)
1747 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
1748 ; SCALAR-NEXT: movb %r9b, 7(%rdx)
1749 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1750 ; SCALAR-NEXT: movb %al, 6(%rdx)
1751 ; SCALAR-NEXT: movb %dil, 5(%rdx)
1752 ; SCALAR-NEXT: movb %bl, 4(%rdx)
1753 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
1754 ; SCALAR-NEXT: movb %sil, 3(%rdx)
1755 ; SCALAR-NEXT: movb %cl, 2(%rdx)
1756 ; SCALAR-NEXT: movb %r13b, 1(%rdx)
1757 ; SCALAR-NEXT: movl %r10d, %edi
1758 ; SCALAR-NEXT: movb %r10b, (%rdx)
1759 ; SCALAR-NEXT: movb %r11b, 31(%rdx)
1760 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1761 ; SCALAR-NEXT: movb %r10b, 30(%rdx)
1762 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1763 ; SCALAR-NEXT: movb %r10b, 29(%rdx)
1764 ; SCALAR-NEXT: movb %r12b, 28(%rdx)
1765 ; SCALAR-NEXT: movb %r15b, 27(%rdx)
1766 ; SCALAR-NEXT: movb %r14b, 26(%rdx)
1767 ; SCALAR-NEXT: movb %r8b, 25(%rdx)
1768 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
1769 ; SCALAR-NEXT: movb %r10b, 24(%rdx)
1770 ; SCALAR-NEXT: movb %r9b, 23(%rdx)
1771 ; SCALAR-NEXT: movb %al, 22(%rdx)
1772 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
1773 ; SCALAR-NEXT: movb %al, 21(%rdx)
1774 ; SCALAR-NEXT: movb %bl, 20(%rdx)
1775 ; SCALAR-NEXT: movb %sil, 19(%rdx)
1776 ; SCALAR-NEXT: movb %cl, 18(%rdx)
1777 ; SCALAR-NEXT: movb %r13b, 17(%rdx)
1778 ; SCALAR-NEXT: movb %dil, 16(%rdx)
1779 ; SCALAR-NEXT: popq %rbx
1780 ; SCALAR-NEXT: popq %r12
1781 ; SCALAR-NEXT: popq %r13
1782 ; SCALAR-NEXT: popq %r14
1783 ; SCALAR-NEXT: popq %r15
1784 ; SCALAR-NEXT: popq %rbp
1787 ; SSE2-LABEL: vec256_v16i8:
1789 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
1790 ; SSE2-NEXT: pxor (%rdi), %xmm0
1791 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
1792 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
1793 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
1796 ; AVX-LABEL: vec256_v16i8:
1798 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1799 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
1800 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
1801 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
1802 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
1804 %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
1805 %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
1806 store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
1807 %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
1808 store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
1809 %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
1810 store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
1811 ret void
1812 }
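; <2 x i8> subvector: invert and store twenty-four back-to-back copies to fill the 48-byte (384-bit) destination.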
1814 define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
1815 ; SCALAR-LABEL: vec384_v2i8:
1817 ; SCALAR-NEXT: movzbl (%rdi), %eax
1818 ; SCALAR-NEXT: movzbl 1(%rdi), %ecx
1819 ; SCALAR-NEXT: notb %al
1820 ; SCALAR-NEXT: notb %cl
1821 ; SCALAR-NEXT: movb %cl, 1(%rsi)
1822 ; SCALAR-NEXT: movb %al, (%rsi)
1823 ; SCALAR-NEXT: movb %cl, 1(%rdx)
1824 ; SCALAR-NEXT: movb %al, (%rdx)
1825 ; SCALAR-NEXT: movb %cl, 3(%rdx)
1826 ; SCALAR-NEXT: movb %al, 2(%rdx)
1827 ; SCALAR-NEXT: movb %cl, 5(%rdx)
1828 ; SCALAR-NEXT: movb %al, 4(%rdx)
1829 ; SCALAR-NEXT: movb %cl, 7(%rdx)
1830 ; SCALAR-NEXT: movb %al, 6(%rdx)
1831 ; SCALAR-NEXT: movb %cl, 9(%rdx)
1832 ; SCALAR-NEXT: movb %al, 8(%rdx)
1833 ; SCALAR-NEXT: movb %cl, 11(%rdx)
1834 ; SCALAR-NEXT: movb %al, 10(%rdx)
1835 ; SCALAR-NEXT: movb %cl, 13(%rdx)
1836 ; SCALAR-NEXT: movb %al, 12(%rdx)
1837 ; SCALAR-NEXT: movb %cl, 15(%rdx)
1838 ; SCALAR-NEXT: movb %al, 14(%rdx)
1839 ; SCALAR-NEXT: movb %cl, 17(%rdx)
1840 ; SCALAR-NEXT: movb %al, 16(%rdx)
1841 ; SCALAR-NEXT: movb %cl, 19(%rdx)
1842 ; SCALAR-NEXT: movb %al, 18(%rdx)
1843 ; SCALAR-NEXT: movb %cl, 21(%rdx)
1844 ; SCALAR-NEXT: movb %al, 20(%rdx)
1845 ; SCALAR-NEXT: movb %cl, 23(%rdx)
1846 ; SCALAR-NEXT: movb %al, 22(%rdx)
1847 ; SCALAR-NEXT: movb %cl, 25(%rdx)
1848 ; SCALAR-NEXT: movb %al, 24(%rdx)
1849 ; SCALAR-NEXT: movb %cl, 27(%rdx)
1850 ; SCALAR-NEXT: movb %al, 26(%rdx)
1851 ; SCALAR-NEXT: movb %cl, 29(%rdx)
1852 ; SCALAR-NEXT: movb %al, 28(%rdx)
1853 ; SCALAR-NEXT: movb %cl, 31(%rdx)
1854 ; SCALAR-NEXT: movb %al, 30(%rdx)
1855 ; SCALAR-NEXT: movb %cl, 33(%rdx)
1856 ; SCALAR-NEXT: movb %al, 32(%rdx)
1857 ; SCALAR-NEXT: movb %cl, 35(%rdx)
1858 ; SCALAR-NEXT: movb %al, 34(%rdx)
1859 ; SCALAR-NEXT: movb %cl, 37(%rdx)
1860 ; SCALAR-NEXT: movb %al, 36(%rdx)
1861 ; SCALAR-NEXT: movb %cl, 39(%rdx)
1862 ; SCALAR-NEXT: movb %al, 38(%rdx)
1863 ; SCALAR-NEXT: movb %cl, 41(%rdx)
1864 ; SCALAR-NEXT: movb %al, 40(%rdx)
1865 ; SCALAR-NEXT: movb %cl, 43(%rdx)
1866 ; SCALAR-NEXT: movb %al, 42(%rdx)
1867 ; SCALAR-NEXT: movb %cl, 45(%rdx)
1868 ; SCALAR-NEXT: movb %al, 44(%rdx)
1869 ; SCALAR-NEXT: movb %cl, 47(%rdx)
1870 ; SCALAR-NEXT: movb %al, 46(%rdx)
1873 ; SSE2-ONLY-LABEL: vec384_v2i8:
1874 ; SSE2-ONLY: # %bb.0:
1875 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
1876 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
1877 ; SSE2-ONLY-NEXT: movd %xmm0, %eax
1878 ; SSE2-ONLY-NEXT: movw %ax, (%rsi)
1879 ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1880 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1881 ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
1882 ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
1883 ; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
1884 ; SSE2-ONLY-NEXT: retq
1886 ; SSE3-LABEL: vec384_v2i8:
1888 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
1889 ; SSE3-NEXT: pxor (%rdi), %xmm0
1890 ; SSE3-NEXT: movd %xmm0, %eax
1891 ; SSE3-NEXT: movw %ax, (%rsi)
1892 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1893 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1894 ; SSE3-NEXT: movdqa %xmm0, (%rdx)
1895 ; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
1896 ; SSE3-NEXT: movdqa %xmm0, 32(%rdx)
1899 ; SSSE3-ONLY-LABEL: vec384_v2i8:
1900 ; SSSE3-ONLY: # %bb.0:
1901 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
1902 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
1903 ; SSSE3-ONLY-NEXT: movd %xmm0, %eax
1904 ; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
1905 ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1906 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1907 ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
1908 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
1909 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
1910 ; SSSE3-ONLY-NEXT: retq
1912 ; SSE41-LABEL: vec384_v2i8:
1914 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
1915 ; SSE41-NEXT: pxor (%rdi), %xmm0
1916 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
1917 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1918 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1919 ; SSE41-NEXT: movdqa %xmm0, (%rdx)
1920 ; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
1921 ; SSE41-NEXT: movdqa %xmm0, 32(%rdx)
1924 ; SSE42-LABEL: vec384_v2i8:
1926 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
1927 ; SSE42-NEXT: pxor (%rdi), %xmm0
1928 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
1929 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1930 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1931 ; SSE42-NEXT: movdqa %xmm0, (%rdx)
1932 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
1933 ; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
1936 ; AVX1-LABEL: vec384_v2i8:
1938 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1939 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
1940 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
1941 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1942 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1943 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
1944 ; AVX1-NEXT: vmovaps %ymm1, (%rdx)
1945 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
1946 ; AVX1-NEXT: vzeroupper
1949 ; AVX2-LABEL: vec384_v2i8:
1951 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1952 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
1953 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi)
1954 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
1955 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
1956 ; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx)
1957 ; AVX2-NEXT: vzeroupper
1959 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
1960 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
1961 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
1962 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
1963 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
1964 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
1965 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
1966 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
1967 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
1968 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
1969 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
1970 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
1971 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
1972 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
1973 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
1974 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
1975 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
1976 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
1977 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
1978 %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
1979 store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
1980 %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
1981 store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
1982 %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
1983 store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
1984 %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
1985 store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
1986 %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
1987 store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
1988 %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
1989 store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
1990 %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
1991 store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
1992 %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
1993 store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
1994 %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16
1995 store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32
1996 %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17
1997 store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2
1998 %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18
1999 store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4
2000 %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19
2001 store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2
2002 %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20
2003 store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8
2004 %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21
2005 store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2
2006 %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22
2007 store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4
2008 %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23
2009 store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2
2010 ret void
2011 }
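; <2 x i16> subvector: invert and store twelve back-to-back copies into the 384-bit destination.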
2013 define void @vec384_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2014 ; SCALAR-LABEL: vec384_v2i16:
2016 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
2017 ; SCALAR-NEXT: movl (%rdi), %eax
2018 ; SCALAR-NEXT: notl %eax
2019 ; SCALAR-NEXT: notl %ecx
2020 ; SCALAR-NEXT: movw %cx, 2(%rsi)
2021 ; SCALAR-NEXT: movw %ax, (%rsi)
2022 ; SCALAR-NEXT: movw %cx, 2(%rdx)
2023 ; SCALAR-NEXT: movw %ax, (%rdx)
2024 ; SCALAR-NEXT: movw %cx, 6(%rdx)
2025 ; SCALAR-NEXT: movw %ax, 4(%rdx)
2026 ; SCALAR-NEXT: movw %cx, 10(%rdx)
2027 ; SCALAR-NEXT: movw %ax, 8(%rdx)
2028 ; SCALAR-NEXT: movw %cx, 14(%rdx)
2029 ; SCALAR-NEXT: movw %ax, 12(%rdx)
2030 ; SCALAR-NEXT: movw %cx, 18(%rdx)
2031 ; SCALAR-NEXT: movw %ax, 16(%rdx)
2032 ; SCALAR-NEXT: movw %cx, 22(%rdx)
2033 ; SCALAR-NEXT: movw %ax, 20(%rdx)
2034 ; SCALAR-NEXT: movw %cx, 26(%rdx)
2035 ; SCALAR-NEXT: movw %ax, 24(%rdx)
2036 ; SCALAR-NEXT: movw %cx, 30(%rdx)
2037 ; SCALAR-NEXT: movw %ax, 28(%rdx)
2038 ; SCALAR-NEXT: movw %cx, 34(%rdx)
2039 ; SCALAR-NEXT: movw %ax, 32(%rdx)
2040 ; SCALAR-NEXT: movw %cx, 38(%rdx)
2041 ; SCALAR-NEXT: movw %ax, 36(%rdx)
2042 ; SCALAR-NEXT: movw %cx, 42(%rdx)
2043 ; SCALAR-NEXT: movw %ax, 40(%rdx)
2044 ; SCALAR-NEXT: movw %cx, 46(%rdx)
2045 ; SCALAR-NEXT: movw %ax, 44(%rdx)
2048 ; SSE2-LABEL: vec384_v2i16:
2050 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
2051 ; SSE2-NEXT: pxor (%rdi), %xmm0
2052 ; SSE2-NEXT: movd %xmm0, (%rsi)
2053 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2054 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
2055 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
2056 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
2059 ; AVX1-LABEL: vec384_v2i16:
2061 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
2062 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
2063 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
2064 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2065 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
2066 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
2067 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
2070 ; AVX2-LABEL: vec384_v2i16:
2072 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
2073 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
2074 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
2075 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
2076 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
2077 ; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx)
2078 ; AVX2-NEXT: vzeroupper
2080 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
2081 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
2082 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
2083 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
2084 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
2085 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
2086 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
2087 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
2088 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
2089 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
2090 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
2091 %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
2092 store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
2093 %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
2094 store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
2095 %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
2096 store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
2097 %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
2098 store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
2099 %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8
2100 store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32
2101 %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9
2102 store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4
2103 %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10
2104 store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8
2105 %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11
2106 store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4
2107 ret void
2108 }
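; <2 x i32> subvector: invert and store six back-to-back copies into the 384-bit destination.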
2110 define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2111 ; SCALAR-LABEL: vec384_v2i32:
2113 ; SCALAR-NEXT: movl (%rdi), %eax
2114 ; SCALAR-NEXT: movl 4(%rdi), %ecx
2115 ; SCALAR-NEXT: notl %eax
2116 ; SCALAR-NEXT: notl %ecx
2117 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
2118 ; SCALAR-NEXT: movl %eax, (%rsi)
2119 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
2120 ; SCALAR-NEXT: movl %eax, (%rdx)
2121 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
2122 ; SCALAR-NEXT: movl %eax, 8(%rdx)
2123 ; SCALAR-NEXT: movl %ecx, 20(%rdx)
2124 ; SCALAR-NEXT: movl %eax, 16(%rdx)
2125 ; SCALAR-NEXT: movl %ecx, 28(%rdx)
2126 ; SCALAR-NEXT: movl %eax, 24(%rdx)
2127 ; SCALAR-NEXT: movl %ecx, 36(%rdx)
2128 ; SCALAR-NEXT: movl %eax, 32(%rdx)
2129 ; SCALAR-NEXT: movl %ecx, 44(%rdx)
2130 ; SCALAR-NEXT: movl %eax, 40(%rdx)
2133 ; SSE2-LABEL: vec384_v2i32:
2135 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2136 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
2137 ; SSE2-NEXT: pxor %xmm0, %xmm1
2138 ; SSE2-NEXT: movq %xmm1, (%rsi)
2139 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
2140 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
2141 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
2142 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
2145 ; AVX1-LABEL: vec384_v2i32:
2147 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2148 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2149 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2150 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
2151 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2152 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
2153 ; AVX1-NEXT: vmovaps %ymm1, (%rdx)
2154 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
2155 ; AVX1-NEXT: vzeroupper
2158 ; AVX2-ONLY-LABEL: vec384_v2i32:
2159 ; AVX2-ONLY: # %bb.0:
2160 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2161 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2162 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
2163 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
2164 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
2165 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
2166 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
2167 ; AVX2-ONLY-NEXT: vzeroupper
2168 ; AVX2-ONLY-NEXT: retq
2170 ; AVX512-LABEL: vec384_v2i32:
2172 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2173 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
2174 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
2175 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
2176 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
2177 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
2178 ; AVX512-NEXT: vzeroupper
2180 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
2181 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
2182 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
2183 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
2184 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
2185 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
2186 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
2187 %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
2188 store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
2189 %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
2190 store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
2191 %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4
2192 store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32
2193 %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5
2194 store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8
2195 ret void
2196 }
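; <2 x float> variant: the bits are inverted as <2 x i32> before the bitcast, then six copies are stored.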
2198 define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2199 ; SCALAR-LABEL: vec384_v2f32:
2201 ; SCALAR-NEXT: movl (%rdi), %eax
2202 ; SCALAR-NEXT: movl 4(%rdi), %ecx
2203 ; SCALAR-NEXT: notl %eax
2204 ; SCALAR-NEXT: notl %ecx
2205 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
2206 ; SCALAR-NEXT: movl %eax, (%rsi)
2207 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
2208 ; SCALAR-NEXT: movl %eax, (%rdx)
2209 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
2210 ; SCALAR-NEXT: movl %eax, 8(%rdx)
2211 ; SCALAR-NEXT: movl %ecx, 20(%rdx)
2212 ; SCALAR-NEXT: movl %eax, 16(%rdx)
2213 ; SCALAR-NEXT: movl %ecx, 28(%rdx)
2214 ; SCALAR-NEXT: movl %eax, 24(%rdx)
2215 ; SCALAR-NEXT: movl %ecx, 36(%rdx)
2216 ; SCALAR-NEXT: movl %eax, 32(%rdx)
2217 ; SCALAR-NEXT: movl %ecx, 44(%rdx)
2218 ; SCALAR-NEXT: movl %eax, 40(%rdx)
2221 ; SSE2-LABEL: vec384_v2f32:
2223 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2224 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
2225 ; SSE2-NEXT: pxor %xmm0, %xmm1
2226 ; SSE2-NEXT: movq %xmm1, (%rsi)
2227 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
2228 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
2229 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
2230 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
2233 ; AVX1-LABEL: vec384_v2f32:
2235 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2236 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2237 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2238 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
2239 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2240 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
2241 ; AVX1-NEXT: vmovaps %ymm1, (%rdx)
2242 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
2243 ; AVX1-NEXT: vzeroupper
2246 ; AVX2-ONLY-LABEL: vec384_v2f32:
2247 ; AVX2-ONLY: # %bb.0:
2248 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2249 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2250 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
2251 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
2252 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
2253 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
2254 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
2255 ; AVX2-ONLY-NEXT: vzeroupper
2256 ; AVX2-ONLY-NEXT: retq
2258 ; AVX512-LABEL: vec384_v2f32:
2260 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2261 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
2262 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
2263 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
2264 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
2265 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
2266 ; AVX512-NEXT: vzeroupper
2268 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
2269 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
2270 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
2271 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
2272 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
2273 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
2274 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
2275 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
2276 %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
2277 store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
2278 %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
2279 store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
2280 %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4
2281 store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32
2282 %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5
2283 store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8
2284 ret void
2285 }
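; <2 x i64> subvector: invert and store three back-to-back copies into the 384-bit destination.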
2287 define void @vec384_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2288 ; SCALAR-LABEL: vec384_v2i64:
2290 ; SCALAR-NEXT: movq (%rdi), %rax
2291 ; SCALAR-NEXT: movq 8(%rdi), %rcx
2292 ; SCALAR-NEXT: notq %rax
2293 ; SCALAR-NEXT: notq %rcx
2294 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
2295 ; SCALAR-NEXT: movq %rax, (%rsi)
2296 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
2297 ; SCALAR-NEXT: movq %rax, (%rdx)
2298 ; SCALAR-NEXT: movq %rcx, 24(%rdx)
2299 ; SCALAR-NEXT: movq %rax, 16(%rdx)
2300 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
2301 ; SCALAR-NEXT: movq %rax, 32(%rdx)
2304 ; SSE2-LABEL: vec384_v2i64:
2306 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
2307 ; SSE2-NEXT: pxor (%rdi), %xmm0
2308 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
2309 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
2310 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
2311 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
2314 ; AVX-LABEL: vec384_v2i64:
2316 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
2317 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
2318 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
2319 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
2320 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
2321 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
2323 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
2324 %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
2325 store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
2326 %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
2327 store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
2328 %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
2329 store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
2330 %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2
2331 store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32
2332 ret void
2333 }
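; <2 x double> variant: the bits are inverted as <2 x i64> before the bitcast, then three copies are stored.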
2335 define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2336 ; SCALAR-LABEL: vec384_v2f64:
2338 ; SCALAR-NEXT: movq (%rdi), %rax
2339 ; SCALAR-NEXT: movq 8(%rdi), %rcx
2340 ; SCALAR-NEXT: notq %rax
2341 ; SCALAR-NEXT: notq %rcx
2342 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
2343 ; SCALAR-NEXT: movq %rax, (%rsi)
2344 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
2345 ; SCALAR-NEXT: movq %rax, (%rdx)
2346 ; SCALAR-NEXT: movq %rcx, 24(%rdx)
2347 ; SCALAR-NEXT: movq %rax, 16(%rdx)
2348 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
2349 ; SCALAR-NEXT: movq %rax, 32(%rdx)
2352 ; SSE2-LABEL: vec384_v2f64:
2354 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
2355 ; SSE2-NEXT: pxor (%rdi), %xmm0
2356 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
2357 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
2358 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
2359 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
2362 ; AVX-LABEL: vec384_v2f64:
2364 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
2365 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
2366 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
2367 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
2368 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
2369 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
2371 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
2372 %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
2373 %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
2374 store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
2375 %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
2376 store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
2377 %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
2378 store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
2379 %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2
2380 store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32
2381 ret void
2382 }
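; <3 x i8> subvector: invert and store sixteen copies; each copy is written as a 2-byte plus a 1-byte store at 4-byte strides.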
2384 define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2385 ; SCALAR-LABEL: vec384_v3i8:
2387 ; SCALAR-NEXT: movl (%rdi), %eax
2388 ; SCALAR-NEXT: movl %eax, %ecx
2389 ; SCALAR-NEXT: shrl $16, %ecx
2390 ; SCALAR-NEXT: notb %cl
2391 ; SCALAR-NEXT: notl %eax
2392 ; SCALAR-NEXT: movw %ax, (%rsi)
2393 ; SCALAR-NEXT: movb %cl, 2(%rsi)
2394 ; SCALAR-NEXT: movb %cl, 2(%rdx)
2395 ; SCALAR-NEXT: movw %ax, (%rdx)
2396 ; SCALAR-NEXT: movb %cl, 6(%rdx)
2397 ; SCALAR-NEXT: movw %ax, 4(%rdx)
2398 ; SCALAR-NEXT: movb %cl, 10(%rdx)
2399 ; SCALAR-NEXT: movw %ax, 8(%rdx)
2400 ; SCALAR-NEXT: movb %cl, 14(%rdx)
2401 ; SCALAR-NEXT: movw %ax, 12(%rdx)
2402 ; SCALAR-NEXT: movb %cl, 18(%rdx)
2403 ; SCALAR-NEXT: movw %ax, 16(%rdx)
2404 ; SCALAR-NEXT: movb %cl, 22(%rdx)
2405 ; SCALAR-NEXT: movw %ax, 20(%rdx)
2406 ; SCALAR-NEXT: movb %cl, 26(%rdx)
2407 ; SCALAR-NEXT: movw %ax, 24(%rdx)
2408 ; SCALAR-NEXT: movb %cl, 30(%rdx)
2409 ; SCALAR-NEXT: movw %ax, 28(%rdx)
2410 ; SCALAR-NEXT: movb %cl, 34(%rdx)
2411 ; SCALAR-NEXT: movw %ax, 32(%rdx)
2412 ; SCALAR-NEXT: movb %cl, 38(%rdx)
2413 ; SCALAR-NEXT: movw %ax, 36(%rdx)
2414 ; SCALAR-NEXT: movb %cl, 42(%rdx)
2415 ; SCALAR-NEXT: movw %ax, 40(%rdx)
2416 ; SCALAR-NEXT: movb %cl, 46(%rdx)
2417 ; SCALAR-NEXT: movw %ax, 44(%rdx)
2418 ; SCALAR-NEXT: movb %cl, 50(%rdx)
2419 ; SCALAR-NEXT: movw %ax, 48(%rdx)
2420 ; SCALAR-NEXT: movb %cl, 54(%rdx)
2421 ; SCALAR-NEXT: movw %ax, 52(%rdx)
2422 ; SCALAR-NEXT: movb %cl, 58(%rdx)
2423 ; SCALAR-NEXT: movw %ax, 56(%rdx)
2424 ; SCALAR-NEXT: movb %cl, 62(%rdx)
2425 ; SCALAR-NEXT: movw %ax, 60(%rdx)
2428 ; SSE2-ONLY-LABEL: vec384_v3i8:
2429 ; SSE2-ONLY: # %bb.0:
2430 ; SSE2-ONLY-NEXT: movl (%rdi), %eax
2431 ; SSE2-ONLY-NEXT: notl %eax
2432 ; SSE2-ONLY-NEXT: movw %ax, (%rsi)
2433 ; SSE2-ONLY-NEXT: movl %eax, %ecx
2434 ; SSE2-ONLY-NEXT: shrl $16, %ecx
2435 ; SSE2-ONLY-NEXT: movb %cl, 2(%rsi)
2436 ; SSE2-ONLY-NEXT: movb %cl, 2(%rdx)
2437 ; SSE2-ONLY-NEXT: movw %ax, (%rdx)
2438 ; SSE2-ONLY-NEXT: movb %cl, 6(%rdx)
2439 ; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
2440 ; SSE2-ONLY-NEXT: movb %cl, 10(%rdx)
2441 ; SSE2-ONLY-NEXT: movw %ax, 8(%rdx)
2442 ; SSE2-ONLY-NEXT: movb %cl, 14(%rdx)
2443 ; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
2444 ; SSE2-ONLY-NEXT: movb %cl, 18(%rdx)
2445 ; SSE2-ONLY-NEXT: movw %ax, 16(%rdx)
2446 ; SSE2-ONLY-NEXT: movb %cl, 22(%rdx)
2447 ; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
2448 ; SSE2-ONLY-NEXT: movb %cl, 26(%rdx)
2449 ; SSE2-ONLY-NEXT: movw %ax, 24(%rdx)
2450 ; SSE2-ONLY-NEXT: movb %cl, 30(%rdx)
2451 ; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
2452 ; SSE2-ONLY-NEXT: movb %cl, 34(%rdx)
2453 ; SSE2-ONLY-NEXT: movw %ax, 32(%rdx)
2454 ; SSE2-ONLY-NEXT: movb %cl, 38(%rdx)
2455 ; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
2456 ; SSE2-ONLY-NEXT: movb %cl, 42(%rdx)
2457 ; SSE2-ONLY-NEXT: movw %ax, 40(%rdx)
2458 ; SSE2-ONLY-NEXT: movb %cl, 46(%rdx)
2459 ; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
2460 ; SSE2-ONLY-NEXT: movb %cl, 50(%rdx)
2461 ; SSE2-ONLY-NEXT: movw %ax, 48(%rdx)
2462 ; SSE2-ONLY-NEXT: movb %cl, 54(%rdx)
2463 ; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
2464 ; SSE2-ONLY-NEXT: movb %cl, 58(%rdx)
2465 ; SSE2-ONLY-NEXT: movw %ax, 56(%rdx)
2466 ; SSE2-ONLY-NEXT: movb %cl, 62(%rdx)
2467 ; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
2468 ; SSE2-ONLY-NEXT: retq
2470 ; SSE3-LABEL: vec384_v3i8:
2472 ; SSE3-NEXT: movl (%rdi), %eax
2473 ; SSE3-NEXT: notl %eax
2474 ; SSE3-NEXT: movw %ax, (%rsi)
2475 ; SSE3-NEXT: movl %eax, %ecx
2476 ; SSE3-NEXT: shrl $16, %ecx
2477 ; SSE3-NEXT: movb %cl, 2(%rsi)
2478 ; SSE3-NEXT: movb %cl, 2(%rdx)
2479 ; SSE3-NEXT: movw %ax, (%rdx)
2480 ; SSE3-NEXT: movb %cl, 6(%rdx)
2481 ; SSE3-NEXT: movw %ax, 4(%rdx)
2482 ; SSE3-NEXT: movb %cl, 10(%rdx)
2483 ; SSE3-NEXT: movw %ax, 8(%rdx)
2484 ; SSE3-NEXT: movb %cl, 14(%rdx)
2485 ; SSE3-NEXT: movw %ax, 12(%rdx)
2486 ; SSE3-NEXT: movb %cl, 18(%rdx)
2487 ; SSE3-NEXT: movw %ax, 16(%rdx)
2488 ; SSE3-NEXT: movb %cl, 22(%rdx)
2489 ; SSE3-NEXT: movw %ax, 20(%rdx)
2490 ; SSE3-NEXT: movb %cl, 26(%rdx)
2491 ; SSE3-NEXT: movw %ax, 24(%rdx)
2492 ; SSE3-NEXT: movb %cl, 30(%rdx)
2493 ; SSE3-NEXT: movw %ax, 28(%rdx)
2494 ; SSE3-NEXT: movb %cl, 34(%rdx)
2495 ; SSE3-NEXT: movw %ax, 32(%rdx)
2496 ; SSE3-NEXT: movb %cl, 38(%rdx)
2497 ; SSE3-NEXT: movw %ax, 36(%rdx)
2498 ; SSE3-NEXT: movb %cl, 42(%rdx)
2499 ; SSE3-NEXT: movw %ax, 40(%rdx)
2500 ; SSE3-NEXT: movb %cl, 46(%rdx)
2501 ; SSE3-NEXT: movw %ax, 44(%rdx)
2502 ; SSE3-NEXT: movb %cl, 50(%rdx)
2503 ; SSE3-NEXT: movw %ax, 48(%rdx)
2504 ; SSE3-NEXT: movb %cl, 54(%rdx)
2505 ; SSE3-NEXT: movw %ax, 52(%rdx)
2506 ; SSE3-NEXT: movb %cl, 58(%rdx)
2507 ; SSE3-NEXT: movw %ax, 56(%rdx)
2508 ; SSE3-NEXT: movb %cl, 62(%rdx)
2509 ; SSE3-NEXT: movw %ax, 60(%rdx)
2512 ; SSSE3-ONLY-LABEL: vec384_v3i8:
2513 ; SSSE3-ONLY: # %bb.0:
2514 ; SSSE3-ONLY-NEXT: movl (%rdi), %eax
2515 ; SSSE3-ONLY-NEXT: notl %eax
2516 ; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
2517 ; SSSE3-ONLY-NEXT: movl %eax, %ecx
2518 ; SSSE3-ONLY-NEXT: shrl $16, %ecx
2519 ; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi)
2520 ; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx)
2521 ; SSSE3-ONLY-NEXT: movw %ax, (%rdx)
2522 ; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx)
2523 ; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
2524 ; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx)
2525 ; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx)
2526 ; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx)
2527 ; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
2528 ; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx)
2529 ; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx)
2530 ; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx)
2531 ; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
2532 ; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx)
2533 ; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx)
2534 ; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx)
2535 ; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
2536 ; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx)
2537 ; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx)
2538 ; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx)
2539 ; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
2540 ; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx)
2541 ; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx)
2542 ; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx)
2543 ; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
2544 ; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx)
2545 ; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx)
2546 ; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx)
2547 ; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
2548 ; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx)
2549 ; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx)
2550 ; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx)
2551 ; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
2552 ; SSSE3-ONLY-NEXT: retq
2554 ; SSE41-LABEL: vec384_v3i8:
2556 ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2557 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
2558 ; SSE41-NEXT: pxor %xmm1, %xmm0
2559 ; SSE41-NEXT: pextrb $2, %xmm0, 2(%rsi)
2560 ; SSE41-NEXT: movd %xmm0, %eax
2561 ; SSE41-NEXT: movw %ax, (%rsi)
2562 ; SSE41-NEXT: pextrb $2, %xmm0, 2(%rdx)
2563 ; SSE41-NEXT: movw %ax, (%rdx)
2564 ; SSE41-NEXT: pextrb $2, %xmm0, 6(%rdx)
2565 ; SSE41-NEXT: movw %ax, 4(%rdx)
2566 ; SSE41-NEXT: pextrb $2, %xmm0, 10(%rdx)
2567 ; SSE41-NEXT: movw %ax, 8(%rdx)
2568 ; SSE41-NEXT: pextrb $2, %xmm0, 14(%rdx)
2569 ; SSE41-NEXT: movw %ax, 12(%rdx)
2570 ; SSE41-NEXT: pextrb $2, %xmm0, 18(%rdx)
2571 ; SSE41-NEXT: movw %ax, 16(%rdx)
2572 ; SSE41-NEXT: pextrb $2, %xmm0, 22(%rdx)
2573 ; SSE41-NEXT: movw %ax, 20(%rdx)
2574 ; SSE41-NEXT: pextrb $2, %xmm0, 26(%rdx)
2575 ; SSE41-NEXT: movw %ax, 24(%rdx)
2576 ; SSE41-NEXT: pextrb $2, %xmm0, 30(%rdx)
2577 ; SSE41-NEXT: movw %ax, 28(%rdx)
2578 ; SSE41-NEXT: pextrb $2, %xmm0, 34(%rdx)
2579 ; SSE41-NEXT: movw %ax, 32(%rdx)
2580 ; SSE41-NEXT: pextrb $2, %xmm0, 38(%rdx)
2581 ; SSE41-NEXT: movw %ax, 36(%rdx)
2582 ; SSE41-NEXT: pextrb $2, %xmm0, 42(%rdx)
2583 ; SSE41-NEXT: movw %ax, 40(%rdx)
2584 ; SSE41-NEXT: pextrb $2, %xmm0, 46(%rdx)
2585 ; SSE41-NEXT: movw %ax, 44(%rdx)
2586 ; SSE41-NEXT: pextrb $2, %xmm0, 50(%rdx)
2587 ; SSE41-NEXT: movw %ax, 48(%rdx)
2588 ; SSE41-NEXT: pextrb $2, %xmm0, 54(%rdx)
2589 ; SSE41-NEXT: movw %ax, 52(%rdx)
2590 ; SSE41-NEXT: pextrb $2, %xmm0, 58(%rdx)
2591 ; SSE41-NEXT: movw %ax, 56(%rdx)
2592 ; SSE41-NEXT: pextrb $2, %xmm0, 62(%rdx)
2593 ; SSE41-NEXT: movw %ax, 60(%rdx)
2596 ; SSE42-LABEL: vec384_v3i8:
2598 ; SSE42-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2599 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
2600 ; SSE42-NEXT: pxor %xmm1, %xmm0
2601 ; SSE42-NEXT: pextrb $2, %xmm0, 2(%rsi)
2602 ; SSE42-NEXT: movd %xmm0, %eax
2603 ; SSE42-NEXT: movw %ax, (%rsi)
2604 ; SSE42-NEXT: pextrb $2, %xmm0, 2(%rdx)
2605 ; SSE42-NEXT: movw %ax, (%rdx)
2606 ; SSE42-NEXT: pextrb $2, %xmm0, 6(%rdx)
2607 ; SSE42-NEXT: movw %ax, 4(%rdx)
2608 ; SSE42-NEXT: pextrb $2, %xmm0, 10(%rdx)
2609 ; SSE42-NEXT: movw %ax, 8(%rdx)
2610 ; SSE42-NEXT: pextrb $2, %xmm0, 14(%rdx)
2611 ; SSE42-NEXT: movw %ax, 12(%rdx)
2612 ; SSE42-NEXT: pextrb $2, %xmm0, 18(%rdx)
2613 ; SSE42-NEXT: movw %ax, 16(%rdx)
2614 ; SSE42-NEXT: pextrb $2, %xmm0, 22(%rdx)
2615 ; SSE42-NEXT: movw %ax, 20(%rdx)
2616 ; SSE42-NEXT: pextrb $2, %xmm0, 26(%rdx)
2617 ; SSE42-NEXT: movw %ax, 24(%rdx)
2618 ; SSE42-NEXT: pextrb $2, %xmm0, 30(%rdx)
2619 ; SSE42-NEXT: movw %ax, 28(%rdx)
2620 ; SSE42-NEXT: pextrb $2, %xmm0, 34(%rdx)
2621 ; SSE42-NEXT: movw %ax, 32(%rdx)
2622 ; SSE42-NEXT: pextrb $2, %xmm0, 38(%rdx)
2623 ; SSE42-NEXT: movw %ax, 36(%rdx)
2624 ; SSE42-NEXT: pextrb $2, %xmm0, 42(%rdx)
2625 ; SSE42-NEXT: movw %ax, 40(%rdx)
2626 ; SSE42-NEXT: pextrb $2, %xmm0, 46(%rdx)
2627 ; SSE42-NEXT: movw %ax, 44(%rdx)
2628 ; SSE42-NEXT: pextrb $2, %xmm0, 50(%rdx)
2629 ; SSE42-NEXT: movw %ax, 48(%rdx)
2630 ; SSE42-NEXT: pextrb $2, %xmm0, 54(%rdx)
2631 ; SSE42-NEXT: movw %ax, 52(%rdx)
2632 ; SSE42-NEXT: pextrb $2, %xmm0, 58(%rdx)
2633 ; SSE42-NEXT: movw %ax, 56(%rdx)
2634 ; SSE42-NEXT: pextrb $2, %xmm0, 62(%rdx)
2635 ; SSE42-NEXT: movw %ax, 60(%rdx)
2638 ; AVX1-LABEL: vec384_v3i8:
2640 ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2641 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2642 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2643 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rsi)
2644 ; AVX1-NEXT: vmovd %xmm0, %eax
2645 ; AVX1-NEXT: movw %ax, (%rsi)
2646 ; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdx)
2647 ; AVX1-NEXT: movw %ax, (%rdx)
2648 ; AVX1-NEXT: vpextrb $2, %xmm0, 6(%rdx)
2649 ; AVX1-NEXT: movw %ax, 4(%rdx)
2650 ; AVX1-NEXT: vpextrb $2, %xmm0, 10(%rdx)
2651 ; AVX1-NEXT: movw %ax, 8(%rdx)
2652 ; AVX1-NEXT: vpextrb $2, %xmm0, 14(%rdx)
2653 ; AVX1-NEXT: movw %ax, 12(%rdx)
2654 ; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdx)
2655 ; AVX1-NEXT: movw %ax, 16(%rdx)
2656 ; AVX1-NEXT: vpextrb $2, %xmm0, 22(%rdx)
2657 ; AVX1-NEXT: movw %ax, 20(%rdx)
2658 ; AVX1-NEXT: vpextrb $2, %xmm0, 26(%rdx)
2659 ; AVX1-NEXT: movw %ax, 24(%rdx)
2660 ; AVX1-NEXT: vpextrb $2, %xmm0, 30(%rdx)
2661 ; AVX1-NEXT: movw %ax, 28(%rdx)
2662 ; AVX1-NEXT: vpextrb $2, %xmm0, 34(%rdx)
2663 ; AVX1-NEXT: movw %ax, 32(%rdx)
2664 ; AVX1-NEXT: vpextrb $2, %xmm0, 38(%rdx)
2665 ; AVX1-NEXT: movw %ax, 36(%rdx)
2666 ; AVX1-NEXT: vpextrb $2, %xmm0, 42(%rdx)
2667 ; AVX1-NEXT: movw %ax, 40(%rdx)
2668 ; AVX1-NEXT: vpextrb $2, %xmm0, 46(%rdx)
2669 ; AVX1-NEXT: movw %ax, 44(%rdx)
2670 ; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rdx)
2671 ; AVX1-NEXT: movw %ax, 48(%rdx)
2672 ; AVX1-NEXT: vpextrb $2, %xmm0, 54(%rdx)
2673 ; AVX1-NEXT: movw %ax, 52(%rdx)
2674 ; AVX1-NEXT: vpextrb $2, %xmm0, 58(%rdx)
2675 ; AVX1-NEXT: movw %ax, 56(%rdx)
2676 ; AVX1-NEXT: vpextrb $2, %xmm0, 62(%rdx)
2677 ; AVX1-NEXT: movw %ax, 60(%rdx)
2680 ; AVX2-ONLY-LABEL: vec384_v3i8:
2681 ; AVX2-ONLY: # %bb.0:
2682 ; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2683 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2684 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
2685 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rsi)
2686 ; AVX2-ONLY-NEXT: vmovd %xmm0, %eax
2687 ; AVX2-ONLY-NEXT: movw %ax, (%rsi)
2688 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rdx)
2689 ; AVX2-ONLY-NEXT: movw %ax, (%rdx)
2690 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 6(%rdx)
2691 ; AVX2-ONLY-NEXT: movw %ax, 4(%rdx)
2692 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 10(%rdx)
2693 ; AVX2-ONLY-NEXT: movw %ax, 8(%rdx)
2694 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 14(%rdx)
2695 ; AVX2-ONLY-NEXT: movw %ax, 12(%rdx)
2696 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 18(%rdx)
2697 ; AVX2-ONLY-NEXT: movw %ax, 16(%rdx)
2698 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 22(%rdx)
2699 ; AVX2-ONLY-NEXT: movw %ax, 20(%rdx)
2700 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 26(%rdx)
2701 ; AVX2-ONLY-NEXT: movw %ax, 24(%rdx)
2702 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 30(%rdx)
2703 ; AVX2-ONLY-NEXT: movw %ax, 28(%rdx)
2704 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 34(%rdx)
2705 ; AVX2-ONLY-NEXT: movw %ax, 32(%rdx)
2706 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 38(%rdx)
2707 ; AVX2-ONLY-NEXT: movw %ax, 36(%rdx)
2708 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 42(%rdx)
2709 ; AVX2-ONLY-NEXT: movw %ax, 40(%rdx)
2710 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 46(%rdx)
2711 ; AVX2-ONLY-NEXT: movw %ax, 44(%rdx)
2712 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 50(%rdx)
2713 ; AVX2-ONLY-NEXT: movw %ax, 48(%rdx)
2714 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 54(%rdx)
2715 ; AVX2-ONLY-NEXT: movw %ax, 52(%rdx)
2716 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 58(%rdx)
2717 ; AVX2-ONLY-NEXT: movw %ax, 56(%rdx)
2718 ; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 62(%rdx)
2719 ; AVX2-ONLY-NEXT: movw %ax, 60(%rdx)
2720 ; AVX2-ONLY-NEXT: retq
2722 ; AVX512-LABEL: vec384_v3i8:
2724 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2725 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
2726 ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi)
2727 ; AVX512-NEXT: vmovd %xmm0, %eax
2728 ; AVX512-NEXT: movw %ax, (%rsi)
2729 ; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rdx)
2730 ; AVX512-NEXT: movw %ax, (%rdx)
2731 ; AVX512-NEXT: vpextrb $2, %xmm0, 6(%rdx)
2732 ; AVX512-NEXT: movw %ax, 4(%rdx)
2733 ; AVX512-NEXT: vpextrb $2, %xmm0, 10(%rdx)
2734 ; AVX512-NEXT: movw %ax, 8(%rdx)
2735 ; AVX512-NEXT: vpextrb $2, %xmm0, 14(%rdx)
2736 ; AVX512-NEXT: movw %ax, 12(%rdx)
2737 ; AVX512-NEXT: vpextrb $2, %xmm0, 18(%rdx)
2738 ; AVX512-NEXT: movw %ax, 16(%rdx)
2739 ; AVX512-NEXT: vpextrb $2, %xmm0, 22(%rdx)
2740 ; AVX512-NEXT: movw %ax, 20(%rdx)
2741 ; AVX512-NEXT: vpextrb $2, %xmm0, 26(%rdx)
2742 ; AVX512-NEXT: movw %ax, 24(%rdx)
2743 ; AVX512-NEXT: vpextrb $2, %xmm0, 30(%rdx)
2744 ; AVX512-NEXT: movw %ax, 28(%rdx)
2745 ; AVX512-NEXT: vpextrb $2, %xmm0, 34(%rdx)
2746 ; AVX512-NEXT: movw %ax, 32(%rdx)
2747 ; AVX512-NEXT: vpextrb $2, %xmm0, 38(%rdx)
2748 ; AVX512-NEXT: movw %ax, 36(%rdx)
2749 ; AVX512-NEXT: vpextrb $2, %xmm0, 42(%rdx)
2750 ; AVX512-NEXT: movw %ax, 40(%rdx)
2751 ; AVX512-NEXT: vpextrb $2, %xmm0, 46(%rdx)
2752 ; AVX512-NEXT: movw %ax, 44(%rdx)
2753 ; AVX512-NEXT: vpextrb $2, %xmm0, 50(%rdx)
2754 ; AVX512-NEXT: movw %ax, 48(%rdx)
2755 ; AVX512-NEXT: vpextrb $2, %xmm0, 54(%rdx)
2756 ; AVX512-NEXT: movw %ax, 52(%rdx)
2757 ; AVX512-NEXT: vpextrb $2, %xmm0, 58(%rdx)
2758 ; AVX512-NEXT: movw %ax, 56(%rdx)
2759 ; AVX512-NEXT: vpextrb $2, %xmm0, 62(%rdx)
2760 ; AVX512-NEXT: movw %ax, 60(%rdx)
2762 %in.subvec.not = load <3 x i8>, ptr %in.subvec.ptr, align 64
2763 %in.subvec = xor <3 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1>
2764 store <3 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
2765 %out.subvec0.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 0
2766 store <3 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
2767 %out.subvec1.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 1
2768 store <3 x i8> %in.subvec, ptr %out.subvec1.ptr, align 1
2769 %out.subvec2.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 2
2770 store <3 x i8> %in.subvec, ptr %out.subvec2.ptr, align 2
2771 %out.subvec3.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 3
2772 store <3 x i8> %in.subvec, ptr %out.subvec3.ptr, align 1
2773 %out.subvec4.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 4
2774 store <3 x i8> %in.subvec, ptr %out.subvec4.ptr, align 4
2775 %out.subvec5.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 5
2776 store <3 x i8> %in.subvec, ptr %out.subvec5.ptr, align 1
2777 %out.subvec6.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 6
2778 store <3 x i8> %in.subvec, ptr %out.subvec6.ptr, align 2
2779 %out.subvec7.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 7
2780 store <3 x i8> %in.subvec, ptr %out.subvec7.ptr, align 1
2781 %out.subvec8.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 8
2782 store <3 x i8> %in.subvec, ptr %out.subvec8.ptr, align 8
2783 %out.subvec9.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 9
2784 store <3 x i8> %in.subvec, ptr %out.subvec9.ptr, align 1
2785 %out.subvec10.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 10
2786 store <3 x i8> %in.subvec, ptr %out.subvec10.ptr, align 2
2787 %out.subvec11.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 11
2788 store <3 x i8> %in.subvec, ptr %out.subvec11.ptr, align 1
2789 %out.subvec12.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 12
2790 store <3 x i8> %in.subvec, ptr %out.subvec12.ptr, align 4
2791 %out.subvec13.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 13
2792 store <3 x i8> %in.subvec, ptr %out.subvec13.ptr, align 1
2793 %out.subvec14.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 14
2794 store <3 x i8> %in.subvec, ptr %out.subvec14.ptr, align 2
2795 %out.subvec15.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 15
2796 store <3 x i8> %in.subvec, ptr %out.subvec15.ptr, align 1
2797 ret void
2798 }
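; <3 x i16> subvector: invert and store eight copies; each copy is written as a 4-byte plus a 2-byte store at 8-byte strides.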
2800 define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
2801 ; SCALAR-LABEL: vec384_v3i16:
2803 ; SCALAR-NEXT: movq (%rdi), %rax
2804 ; SCALAR-NEXT: movq %rax, %rcx
2805 ; SCALAR-NEXT: shrq $32, %rcx
2806 ; SCALAR-NEXT: notl %ecx
2807 ; SCALAR-NEXT: notl %eax
2808 ; SCALAR-NEXT: movl %eax, (%rsi)
2809 ; SCALAR-NEXT: movw %cx, 4(%rsi)
2810 ; SCALAR-NEXT: movw %cx, 4(%rdx)
2811 ; SCALAR-NEXT: movl %eax, (%rdx)
2812 ; SCALAR-NEXT: movw %cx, 12(%rdx)
2813 ; SCALAR-NEXT: movl %eax, 8(%rdx)
2814 ; SCALAR-NEXT: movw %cx, 20(%rdx)
2815 ; SCALAR-NEXT: movl %eax, 16(%rdx)
2816 ; SCALAR-NEXT: movw %cx, 28(%rdx)
2817 ; SCALAR-NEXT: movl %eax, 24(%rdx)
2818 ; SCALAR-NEXT: movw %cx, 36(%rdx)
2819 ; SCALAR-NEXT: movl %eax, 32(%rdx)
2820 ; SCALAR-NEXT: movw %cx, 44(%rdx)
2821 ; SCALAR-NEXT: movl %eax, 40(%rdx)
2822 ; SCALAR-NEXT: movw %cx, 52(%rdx)
2823 ; SCALAR-NEXT: movl %eax, 48(%rdx)
2824 ; SCALAR-NEXT: movw %cx, 60(%rdx)
2825 ; SCALAR-NEXT: movl %eax, 56(%rdx)
2828 ; SSE2-ONLY-LABEL: vec384_v3i16:
2829 ; SSE2-ONLY: # %bb.0:
2830 ; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2831 ; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
2832 ; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1
2833 ; SSE2-ONLY-NEXT: movd %xmm1, (%rsi)
2834 ; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax
2835 ; SSE2-ONLY-NEXT: movw %ax, 4(%rsi)
2836 ; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
2837 ; SSE2-ONLY-NEXT: movd %xmm1, (%rdx)
2838 ; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
2839 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
2840 ; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
2841 ; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx)
2842 ; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
2843 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
2844 ; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
2845 ; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx)
2846 ; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
2847 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
2848 ; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
2849 ; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx)
2850 ; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
2851 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
2852 ; SSE2-ONLY-NEXT: retq
2854 ; SSE3-LABEL: vec384_v3i16:
2856 ; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2857 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
2858 ; SSE3-NEXT: pxor %xmm0, %xmm1
2859 ; SSE3-NEXT: movd %xmm1, (%rsi)
2860 ; SSE3-NEXT: pextrw $2, %xmm1, %eax
2861 ; SSE3-NEXT: movw %ax, 4(%rsi)
2862 ; SSE3-NEXT: movw %ax, 4(%rdx)
2863 ; SSE3-NEXT: movd %xmm1, (%rdx)
2864 ; SSE3-NEXT: movw %ax, 12(%rdx)
2865 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
2866 ; SSE3-NEXT: movw %ax, 20(%rdx)
2867 ; SSE3-NEXT: movd %xmm1, 16(%rdx)
2868 ; SSE3-NEXT: movw %ax, 28(%rdx)
2869 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
2870 ; SSE3-NEXT: movw %ax, 36(%rdx)
2871 ; SSE3-NEXT: movd %xmm1, 32(%rdx)
2872 ; SSE3-NEXT: movw %ax, 44(%rdx)
2873 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
2874 ; SSE3-NEXT: movw %ax, 52(%rdx)
2875 ; SSE3-NEXT: movd %xmm1, 48(%rdx)
2876 ; SSE3-NEXT: movw %ax, 60(%rdx)
2877 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
2880 ; SSSE3-ONLY-LABEL: vec384_v3i16:
2881 ; SSSE3-ONLY: # %bb.0:
2882 ; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2883 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
2884 ; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1
2885 ; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi)
2886 ; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax
2887 ; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi)
2888 ; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
2889 ; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx)
2890 ; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
2891 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
2892 ; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
2893 ; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx)
2894 ; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
2895 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
2896 ; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
2897 ; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx)
2898 ; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
2899 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
2900 ; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
2901 ; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx)
2902 ; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
2903 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
2904 ; SSSE3-ONLY-NEXT: retq
2906 ; SSE41-LABEL: vec384_v3i16:
2908 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2909 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
2910 ; SSE41-NEXT: pxor %xmm0, %xmm1
2911 ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi)
2912 ; SSE41-NEXT: movd %xmm1, (%rsi)
2913 ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx)
2914 ; SSE41-NEXT: movd %xmm1, (%rdx)
2915 ; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx)
2916 ; SSE41-NEXT: movd %xmm1, 8(%rdx)
2917 ; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx)
2918 ; SSE41-NEXT: movd %xmm1, 16(%rdx)
2919 ; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx)
2920 ; SSE41-NEXT: movd %xmm1, 24(%rdx)
2921 ; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx)
2922 ; SSE41-NEXT: movd %xmm1, 32(%rdx)
2923 ; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx)
2924 ; SSE41-NEXT: movd %xmm1, 40(%rdx)
2925 ; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx)
2926 ; SSE41-NEXT: movd %xmm1, 48(%rdx)
2927 ; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx)
2928 ; SSE41-NEXT: movd %xmm1, 56(%rdx)
2931 ; SSE42-LABEL: vec384_v3i16:
2933 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2934 ; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
2935 ; SSE42-NEXT: pxor %xmm0, %xmm1
2936 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi)
2937 ; SSE42-NEXT: movd %xmm1, (%rsi)
2938 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx)
2939 ; SSE42-NEXT: movd %xmm1, (%rdx)
2940 ; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx)
2941 ; SSE42-NEXT: movd %xmm1, 8(%rdx)
2942 ; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx)
2943 ; SSE42-NEXT: movd %xmm1, 16(%rdx)
2944 ; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx)
2945 ; SSE42-NEXT: movd %xmm1, 24(%rdx)
2946 ; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx)
2947 ; SSE42-NEXT: movd %xmm1, 32(%rdx)
2948 ; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx)
2949 ; SSE42-NEXT: movd %xmm1, 40(%rdx)
2950 ; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx)
2951 ; SSE42-NEXT: movd %xmm1, 48(%rdx)
2952 ; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx)
2953 ; SSE42-NEXT: movd %xmm1, 56(%rdx)
2956 ; AVX1-LABEL: vec384_v3i16:
2958 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2959 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2960 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
2961 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi)
2962 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
2963 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx)
2964 ; AVX1-NEXT: vmovd %xmm0, (%rdx)
2965 ; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx)
2966 ; AVX1-NEXT: vmovd %xmm0, 8(%rdx)
2967 ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx)
2968 ; AVX1-NEXT: vmovd %xmm0, 16(%rdx)
2969 ; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx)
2970 ; AVX1-NEXT: vmovd %xmm0, 24(%rdx)
2971 ; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx)
2972 ; AVX1-NEXT: vmovd %xmm0, 32(%rdx)
2973 ; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx)
2974 ; AVX1-NEXT: vmovd %xmm0, 40(%rdx)
2975 ; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx)
2976 ; AVX1-NEXT: vmovd %xmm0, 48(%rdx)
2977 ; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx)
2978 ; AVX1-NEXT: vmovd %xmm0, 56(%rdx)
2981 ; AVX2-ONLY-LABEL: vec384_v3i16:
2982 ; AVX2-ONLY: # %bb.0:
2983 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
2984 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
2985 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
2986 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi)
2987 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
2988 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx)
2989 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx)
2990 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx)
2991 ; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx)
2992 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx)
2993 ; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx)
2994 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx)
2995 ; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx)
2996 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx)
2997 ; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx)
2998 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx)
2999 ; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx)
3000 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx)
3001 ; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx)
3002 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx)
3003 ; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx)
3004 ; AVX2-ONLY-NEXT: retq
3006 ; AVX512-LABEL: vec384_v3i16:
3008 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3009 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
3010 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi)
3011 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
3012 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx)
3013 ; AVX512-NEXT: vmovd %xmm0, (%rdx)
3014 ; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx)
3015 ; AVX512-NEXT: vmovd %xmm0, 8(%rdx)
3016 ; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx)
3017 ; AVX512-NEXT: vmovd %xmm0, 16(%rdx)
3018 ; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx)
3019 ; AVX512-NEXT: vmovd %xmm0, 24(%rdx)
3020 ; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx)
3021 ; AVX512-NEXT: vmovd %xmm0, 32(%rdx)
3022 ; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx)
3023 ; AVX512-NEXT: vmovd %xmm0, 40(%rdx)
3024 ; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx)
3025 ; AVX512-NEXT: vmovd %xmm0, 48(%rdx)
3026 ; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx)
3027 ; AVX512-NEXT: vmovd %xmm0, 56(%rdx)
3029 %in.subvec.not = load <3 x i16>, ptr %in.subvec.ptr, align 64
3030 %in.subvec = xor <3 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1>
3031 store <3 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
3032 %out.subvec0.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 0
3033 store <3 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
3034 %out.subvec1.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 1
3035 store <3 x i16> %in.subvec, ptr %out.subvec1.ptr, align 2
3036 %out.subvec2.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 2
3037 store <3 x i16> %in.subvec, ptr %out.subvec2.ptr, align 4
3038 %out.subvec3.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 3
3039 store <3 x i16> %in.subvec, ptr %out.subvec3.ptr, align 2
3040 %out.subvec4.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 4
3041 store <3 x i16> %in.subvec, ptr %out.subvec4.ptr, align 8
3042 %out.subvec5.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 5
3043 store <3 x i16> %in.subvec, ptr %out.subvec5.ptr, align 2
3044 %out.subvec6.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 6
3045 store <3 x i16> %in.subvec, ptr %out.subvec6.ptr, align 4
3046 %out.subvec7.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 7
3047 store <3 x i16> %in.subvec, ptr %out.subvec7.ptr, align 2
3048 ret void
3049 }
3051 define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3052 ; SCALAR-LABEL: vec384_v3i32:
3054 ; SCALAR-NEXT: movl 8(%rdi), %eax
3055 ; SCALAR-NEXT: movq (%rdi), %rcx
3056 ; SCALAR-NEXT: notq %rcx
3057 ; SCALAR-NEXT: notl %eax
3058 ; SCALAR-NEXT: movl %eax, 8(%rsi)
3059 ; SCALAR-NEXT: movq %rcx, (%rsi)
3060 ; SCALAR-NEXT: movl %eax, 8(%rdx)
3061 ; SCALAR-NEXT: movq %rcx, (%rdx)
3062 ; SCALAR-NEXT: movl %eax, 24(%rdx)
3063 ; SCALAR-NEXT: movq %rcx, 16(%rdx)
3064 ; SCALAR-NEXT: movl %eax, 40(%rdx)
3065 ; SCALAR-NEXT: movq %rcx, 32(%rdx)
3066 ; SCALAR-NEXT: movl %eax, 56(%rdx)
3067 ; SCALAR-NEXT: movq %rcx, 48(%rdx)
3070 ; SSE2-ONLY-LABEL: vec384_v3i32:
3071 ; SSE2-ONLY: # %bb.0:
3072 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3073 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
3074 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
3075 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3076 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
3077 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
3078 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
3079 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
3080 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
3081 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
3082 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
3083 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
3084 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
3085 ; SSE2-ONLY-NEXT: retq
3087 ; SSE3-LABEL: vec384_v3i32:
3089 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
3090 ; SSE3-NEXT: pxor (%rdi), %xmm0
3091 ; SSE3-NEXT: movq %xmm0, (%rsi)
3092 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3093 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
3094 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
3095 ; SSE3-NEXT: movq %xmm0, (%rdx)
3096 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
3097 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
3098 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
3099 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
3100 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
3101 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
3104 ; SSSE3-ONLY-LABEL: vec384_v3i32:
3105 ; SSSE3-ONLY: # %bb.0:
3106 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3107 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
3108 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
3109 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3110 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
3111 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
3112 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
3113 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
3114 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
3115 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
3116 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
3117 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
3118 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
3119 ; SSSE3-ONLY-NEXT: retq
3121 ; SSE41-LABEL: vec384_v3i32:
3123 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
3124 ; SSE41-NEXT: pxor (%rdi), %xmm0
3125 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
3126 ; SSE41-NEXT: movq %xmm0, (%rsi)
3127 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
3128 ; SSE41-NEXT: movq %xmm0, (%rdx)
3129 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
3130 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
3131 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
3132 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
3133 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
3134 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
3137 ; SSE42-LABEL: vec384_v3i32:
3139 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
3140 ; SSE42-NEXT: pxor (%rdi), %xmm0
3141 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
3142 ; SSE42-NEXT: movq %xmm0, (%rsi)
3143 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
3144 ; SSE42-NEXT: movq %xmm0, (%rdx)
3145 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
3146 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
3147 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
3148 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
3149 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
3150 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
3153 ; AVX-LABEL: vec384_v3i32:
3155 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3156 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3157 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
3158 ; AVX-NEXT: vmovq %xmm0, (%rsi)
3159 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
3160 ; AVX-NEXT: vmovq %xmm0, (%rdx)
3161 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
3162 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
3163 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
3164 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
3165 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
3166 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
3168 %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64
3169 %in.subvec = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1>
3170 store <3 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
3171 %out.subvec0.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 0
3172 store <3 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
3173 %out.subvec1.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 1
3174 store <3 x i32> %in.subvec, ptr %out.subvec1.ptr, align 4
3175 %out.subvec2.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 2
3176 store <3 x i32> %in.subvec, ptr %out.subvec2.ptr, align 8
3177 %out.subvec3.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 3
3178 store <3 x i32> %in.subvec, ptr %out.subvec3.ptr, align 4
3179 ret void
3180 }
3182 define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3183 ; SCALAR-LABEL: vec384_v3f32:
3185 ; SCALAR-NEXT: movl 8(%rdi), %eax
3186 ; SCALAR-NEXT: movq (%rdi), %rcx
3187 ; SCALAR-NEXT: notq %rcx
3188 ; SCALAR-NEXT: notl %eax
3189 ; SCALAR-NEXT: movl %eax, 8(%rsi)
3190 ; SCALAR-NEXT: movq %rcx, (%rsi)
3191 ; SCALAR-NEXT: movl %eax, 8(%rdx)
3192 ; SCALAR-NEXT: movq %rcx, (%rdx)
3193 ; SCALAR-NEXT: movl %eax, 24(%rdx)
3194 ; SCALAR-NEXT: movq %rcx, 16(%rdx)
3195 ; SCALAR-NEXT: movl %eax, 40(%rdx)
3196 ; SCALAR-NEXT: movq %rcx, 32(%rdx)
3197 ; SCALAR-NEXT: movl %eax, 56(%rdx)
3198 ; SCALAR-NEXT: movq %rcx, 48(%rdx)
3201 ; SSE2-ONLY-LABEL: vec384_v3f32:
3202 ; SSE2-ONLY: # %bb.0:
3203 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3204 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
3205 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
3206 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3207 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
3208 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
3209 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
3210 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
3211 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
3212 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
3213 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
3214 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
3215 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
3216 ; SSE2-ONLY-NEXT: retq
3218 ; SSE3-LABEL: vec384_v3f32:
3220 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
3221 ; SSE3-NEXT: pxor (%rdi), %xmm0
3222 ; SSE3-NEXT: movq %xmm0, (%rsi)
3223 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3224 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
3225 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
3226 ; SSE3-NEXT: movq %xmm0, (%rdx)
3227 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
3228 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
3229 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
3230 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
3231 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
3232 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
3235 ; SSSE3-ONLY-LABEL: vec384_v3f32:
3236 ; SSSE3-ONLY: # %bb.0:
3237 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3238 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
3239 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
3240 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3241 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
3242 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
3243 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
3244 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
3245 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
3246 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
3247 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
3248 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
3249 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
3250 ; SSSE3-ONLY-NEXT: retq
3252 ; SSE41-LABEL: vec384_v3f32:
3254 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
3255 ; SSE41-NEXT: pxor (%rdi), %xmm0
3256 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
3257 ; SSE41-NEXT: movq %xmm0, (%rsi)
3258 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
3259 ; SSE41-NEXT: movq %xmm0, (%rdx)
3260 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
3261 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
3262 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
3263 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
3264 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
3265 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
3268 ; SSE42-LABEL: vec384_v3f32:
3270 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
3271 ; SSE42-NEXT: pxor (%rdi), %xmm0
3272 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
3273 ; SSE42-NEXT: movq %xmm0, (%rsi)
3274 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
3275 ; SSE42-NEXT: movq %xmm0, (%rdx)
3276 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
3277 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
3278 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
3279 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
3280 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
3281 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
3284 ; AVX-LABEL: vec384_v3f32:
3286 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3287 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3288 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
3289 ; AVX-NEXT: vmovq %xmm0, (%rsi)
3290 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
3291 ; AVX-NEXT: vmovq %xmm0, (%rdx)
3292 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
3293 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
3294 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
3295 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
3296 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
3297 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
3299 %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64
3300 %in.subvec.int = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1>
3301 %in.subvec = bitcast <3 x i32> %in.subvec.int to <3 x float>
3302 store <3 x float> %in.subvec, ptr %out.subvec.ptr, align 64
3303 %out.subvec0.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 0
3304 store <3 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
3305 %out.subvec1.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 1
3306 store <3 x float> %in.subvec, ptr %out.subvec1.ptr, align 4
3307 %out.subvec2.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 2
3308 store <3 x float> %in.subvec, ptr %out.subvec2.ptr, align 8
3309 %out.subvec3.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 3
3310 store <3 x float> %in.subvec, ptr %out.subvec3.ptr, align 4
3311 ret void
3312 }
3314 define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3315 ; SCALAR-LABEL: vec384_v3i64:
3317 ; SCALAR-NEXT: movq (%rdi), %rax
3318 ; SCALAR-NEXT: movq 8(%rdi), %rcx
3319 ; SCALAR-NEXT: movq 16(%rdi), %rdi
3320 ; SCALAR-NEXT: notq %rdi
3321 ; SCALAR-NEXT: notq %rcx
3322 ; SCALAR-NEXT: notq %rax
3323 ; SCALAR-NEXT: movq %rax, (%rsi)
3324 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
3325 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
3326 ; SCALAR-NEXT: movq %rax, (%rdx)
3327 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
3328 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
3329 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
3330 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
3331 ; SCALAR-NEXT: movq %rax, 32(%rdx)
3334 ; SSE2-LABEL: vec384_v3i64:
3336 ; SSE2-NEXT: movq 16(%rdi), %rax
3337 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3338 ; SSE2-NEXT: pxor (%rdi), %xmm0
3339 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3340 ; SSE2-NEXT: notq %rax
3341 ; SSE2-NEXT: movq %rax, 16(%rsi)
3342 ; SSE2-NEXT: movq %rax, 16(%rdx)
3343 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3344 ; SSE2-NEXT: movq %rax, 48(%rdx)
3345 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
3348 ; AVX1-LABEL: vec384_v3i64:
3350 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3351 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
3352 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
3353 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3354 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
3355 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
3356 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
3357 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
3358 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
3359 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
3360 ; AVX1-NEXT: vzeroupper
3363 ; AVX2-LABEL: vec384_v3i64:
3365 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
3366 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
3367 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3368 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
3369 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
3370 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
3371 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
3372 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
3373 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
3374 ; AVX2-NEXT: vzeroupper
3376 %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64
3377 %in.subvec = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1>
3378 store <3 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
3379 %out.subvec0.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 0
3380 store <3 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
3381 %out.subvec1.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 1
3382 store <3 x i64> %in.subvec, ptr %out.subvec1.ptr, align 8
3383 ret void
3384 }
3386 define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3387 ; SCALAR-LABEL: vec384_v3f64:
3389 ; SCALAR-NEXT: movq (%rdi), %rax
3390 ; SCALAR-NEXT: movq 8(%rdi), %rcx
3391 ; SCALAR-NEXT: movq 16(%rdi), %rdi
3392 ; SCALAR-NEXT: notq %rdi
3393 ; SCALAR-NEXT: notq %rcx
3394 ; SCALAR-NEXT: notq %rax
3395 ; SCALAR-NEXT: movq %rax, (%rsi)
3396 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
3397 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
3398 ; SCALAR-NEXT: movq %rax, (%rdx)
3399 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
3400 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
3401 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
3402 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
3403 ; SCALAR-NEXT: movq %rax, 32(%rdx)
3406 ; SSE2-LABEL: vec384_v3f64:
3408 ; SSE2-NEXT: movq 16(%rdi), %rax
3409 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3410 ; SSE2-NEXT: pxor (%rdi), %xmm0
3411 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3412 ; SSE2-NEXT: notq %rax
3413 ; SSE2-NEXT: movq %rax, 16(%rsi)
3414 ; SSE2-NEXT: movq %rax, 16(%rdx)
3415 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3416 ; SSE2-NEXT: movq %rax, 48(%rdx)
3417 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
3420 ; AVX1-LABEL: vec384_v3f64:
3422 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3423 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
3424 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
3425 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3426 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
3427 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
3428 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
3429 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
3430 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
3431 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
3432 ; AVX1-NEXT: vzeroupper
3435 ; AVX2-LABEL: vec384_v3f64:
3437 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
3438 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
3439 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3440 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
3441 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
3442 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
3443 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
3444 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
3445 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
3446 ; AVX2-NEXT: vzeroupper
3448 %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64
3449 %in.subvec.int = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1>
3450 %in.subvec = bitcast <3 x i64> %in.subvec.int to <3 x double>
3451 store <3 x double> %in.subvec, ptr %out.subvec.ptr, align 64
3452 %out.subvec0.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 0
3453 store <3 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
3454 %out.subvec1.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 1
3455 store <3 x double> %in.subvec, ptr %out.subvec1.ptr, align 8
3456 ret void
3457 }
3459 define void @vec384_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3460 ; SCALAR-LABEL: vec384_v4i8:
3462 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
3463 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
3464 ; SCALAR-NEXT: movzbl (%rdi), %eax
3465 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
3466 ; SCALAR-NEXT: notb %al
3467 ; SCALAR-NEXT: notb %dil
3468 ; SCALAR-NEXT: notb %cl
3469 ; SCALAR-NEXT: notb %r8b
3470 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
3471 ; SCALAR-NEXT: movb %cl, 2(%rsi)
3472 ; SCALAR-NEXT: movb %dil, 1(%rsi)
3473 ; SCALAR-NEXT: movb %al, (%rsi)
3474 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
3475 ; SCALAR-NEXT: movb %cl, 2(%rdx)
3476 ; SCALAR-NEXT: movb %dil, 1(%rdx)
3477 ; SCALAR-NEXT: movb %al, (%rdx)
3478 ; SCALAR-NEXT: movb %r8b, 7(%rdx)
3479 ; SCALAR-NEXT: movb %cl, 6(%rdx)
3480 ; SCALAR-NEXT: movb %dil, 5(%rdx)
3481 ; SCALAR-NEXT: movb %al, 4(%rdx)
3482 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
3483 ; SCALAR-NEXT: movb %cl, 10(%rdx)
3484 ; SCALAR-NEXT: movb %dil, 9(%rdx)
3485 ; SCALAR-NEXT: movb %al, 8(%rdx)
3486 ; SCALAR-NEXT: movb %r8b, 15(%rdx)
3487 ; SCALAR-NEXT: movb %cl, 14(%rdx)
3488 ; SCALAR-NEXT: movb %dil, 13(%rdx)
3489 ; SCALAR-NEXT: movb %al, 12(%rdx)
3490 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
3491 ; SCALAR-NEXT: movb %cl, 18(%rdx)
3492 ; SCALAR-NEXT: movb %dil, 17(%rdx)
3493 ; SCALAR-NEXT: movb %al, 16(%rdx)
3494 ; SCALAR-NEXT: movb %r8b, 23(%rdx)
3495 ; SCALAR-NEXT: movb %cl, 22(%rdx)
3496 ; SCALAR-NEXT: movb %dil, 21(%rdx)
3497 ; SCALAR-NEXT: movb %al, 20(%rdx)
3498 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
3499 ; SCALAR-NEXT: movb %cl, 26(%rdx)
3500 ; SCALAR-NEXT: movb %dil, 25(%rdx)
3501 ; SCALAR-NEXT: movb %al, 24(%rdx)
3502 ; SCALAR-NEXT: movb %r8b, 31(%rdx)
3503 ; SCALAR-NEXT: movb %cl, 30(%rdx)
3504 ; SCALAR-NEXT: movb %dil, 29(%rdx)
3505 ; SCALAR-NEXT: movb %al, 28(%rdx)
3506 ; SCALAR-NEXT: movb %r8b, 35(%rdx)
3507 ; SCALAR-NEXT: movb %cl, 34(%rdx)
3508 ; SCALAR-NEXT: movb %dil, 33(%rdx)
3509 ; SCALAR-NEXT: movb %al, 32(%rdx)
3510 ; SCALAR-NEXT: movb %r8b, 39(%rdx)
3511 ; SCALAR-NEXT: movb %cl, 38(%rdx)
3512 ; SCALAR-NEXT: movb %dil, 37(%rdx)
3513 ; SCALAR-NEXT: movb %al, 36(%rdx)
3514 ; SCALAR-NEXT: movb %r8b, 43(%rdx)
3515 ; SCALAR-NEXT: movb %cl, 42(%rdx)
3516 ; SCALAR-NEXT: movb %dil, 41(%rdx)
3517 ; SCALAR-NEXT: movb %al, 40(%rdx)
3518 ; SCALAR-NEXT: movb %r8b, 47(%rdx)
3519 ; SCALAR-NEXT: movb %cl, 46(%rdx)
3520 ; SCALAR-NEXT: movb %dil, 45(%rdx)
3521 ; SCALAR-NEXT: movb %al, 44(%rdx)
3524 ; SSE2-LABEL: vec384_v4i8:
3526 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3527 ; SSE2-NEXT: pxor (%rdi), %xmm0
3528 ; SSE2-NEXT: movd %xmm0, (%rsi)
3529 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3530 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3531 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3532 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3535 ; AVX1-LABEL: vec384_v4i8:
3537 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3538 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
3539 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
3540 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3541 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
3542 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
3543 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
3546 ; AVX2-LABEL: vec384_v4i8:
3548 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3549 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
3550 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
3551 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
3552 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
3553 ; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx)
3554 ; AVX2-NEXT: vzeroupper
3556 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
3557 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
3558 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
3559 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
3560 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
3561 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
3562 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
3563 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
3564 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
3565 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
3566 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
3567 %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
3568 store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
3569 %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
3570 store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
3571 %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
3572 store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
3573 %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
3574 store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
3575 %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8
3576 store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32
3577 %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9
3578 store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4
3579 %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10
3580 store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8
3581 %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11
3582 store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4
3583 ret void
3584 }
3586 define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3587 ; SCALAR-LABEL: vec384_v4i16:
3589 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
3590 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
3591 ; SCALAR-NEXT: movl (%rdi), %eax
3592 ; SCALAR-NEXT: movl 4(%rdi), %edi
3593 ; SCALAR-NEXT: notl %eax
3594 ; SCALAR-NEXT: notl %ecx
3595 ; SCALAR-NEXT: notl %edi
3596 ; SCALAR-NEXT: notl %r8d
3597 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
3598 ; SCALAR-NEXT: movw %di, 4(%rsi)
3599 ; SCALAR-NEXT: movw %cx, 2(%rsi)
3600 ; SCALAR-NEXT: movw %ax, (%rsi)
3601 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
3602 ; SCALAR-NEXT: movw %di, 4(%rdx)
3603 ; SCALAR-NEXT: movw %cx, 2(%rdx)
3604 ; SCALAR-NEXT: movw %ax, (%rdx)
3605 ; SCALAR-NEXT: movw %r8w, 14(%rdx)
3606 ; SCALAR-NEXT: movw %di, 12(%rdx)
3607 ; SCALAR-NEXT: movw %cx, 10(%rdx)
3608 ; SCALAR-NEXT: movw %ax, 8(%rdx)
3609 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
3610 ; SCALAR-NEXT: movw %di, 20(%rdx)
3611 ; SCALAR-NEXT: movw %cx, 18(%rdx)
3612 ; SCALAR-NEXT: movw %ax, 16(%rdx)
3613 ; SCALAR-NEXT: movw %r8w, 30(%rdx)
3614 ; SCALAR-NEXT: movw %di, 28(%rdx)
3615 ; SCALAR-NEXT: movw %cx, 26(%rdx)
3616 ; SCALAR-NEXT: movw %ax, 24(%rdx)
3617 ; SCALAR-NEXT: movw %r8w, 38(%rdx)
3618 ; SCALAR-NEXT: movw %di, 36(%rdx)
3619 ; SCALAR-NEXT: movw %cx, 34(%rdx)
3620 ; SCALAR-NEXT: movw %ax, 32(%rdx)
3621 ; SCALAR-NEXT: movw %r8w, 46(%rdx)
3622 ; SCALAR-NEXT: movw %di, 44(%rdx)
3623 ; SCALAR-NEXT: movw %cx, 42(%rdx)
3624 ; SCALAR-NEXT: movw %ax, 40(%rdx)
3627 ; SSE2-LABEL: vec384_v4i16:
3629 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3630 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
3631 ; SSE2-NEXT: pxor %xmm0, %xmm1
3632 ; SSE2-NEXT: movq %xmm1, (%rsi)
3633 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
3634 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3635 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3636 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3639 ; AVX1-LABEL: vec384_v4i16:
3641 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3642 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
3643 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
3644 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
3645 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3646 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
3647 ; AVX1-NEXT: vmovaps %ymm1, (%rdx)
3648 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
3649 ; AVX1-NEXT: vzeroupper
3652 ; AVX2-ONLY-LABEL: vec384_v4i16:
3653 ; AVX2-ONLY: # %bb.0:
3654 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3655 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
3656 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
3657 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
3658 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
3659 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
3660 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
3661 ; AVX2-ONLY-NEXT: vzeroupper
3662 ; AVX2-ONLY-NEXT: retq
3664 ; AVX512-LABEL: vec384_v4i16:
3666 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3667 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
3668 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
3669 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
3670 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
3671 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
3672 ; AVX512-NEXT: vzeroupper
3674 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
3675 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
3676 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
3677 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
3678 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
3679 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
3680 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
3681 %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
3682 store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
3683 %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
3684 store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
3685 %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4
3686 store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32
3687 %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5
3688 store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8
3689 ret void
3690 }
3692 define void @vec384_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3693 ; SCALAR-LABEL: vec384_v4i32:
3695 ; SCALAR-NEXT: movaps (%rdi), %xmm0
3696 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3697 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
3698 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
3699 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
3700 ; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
3703 ; SSE2-LABEL: vec384_v4i32:
3705 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3706 ; SSE2-NEXT: pxor (%rdi), %xmm0
3707 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3708 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3709 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3710 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3713 ; AVX-LABEL: vec384_v4i32:
3715 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3716 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3717 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
3718 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3719 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3720 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
3722 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
3723 %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
3724 store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
3725 %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
3726 store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
3727 %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
3728 store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
3729 %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2
3730 store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32
3731 ret void
3732 }
3734 define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3735 ; SCALAR-LABEL: vec384_v4f32:
3737 ; SCALAR-NEXT: movaps (%rdi), %xmm0
3738 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3739 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
3740 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
3741 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
3742 ; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
3745 ; SSE2-LABEL: vec384_v4f32:
3747 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3748 ; SSE2-NEXT: pxor (%rdi), %xmm0
3749 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3750 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3751 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3752 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3755 ; AVX-LABEL: vec384_v4f32:
3757 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3758 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3759 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
3760 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3761 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3762 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
3764 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
3765 %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
3766 %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
3767 store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
3768 %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
3769 store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
3770 %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
3771 store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
3772 %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2
3773 store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32
3774 ret void
3775 }
3777 define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3778 ; SCALAR-LABEL: vec384_v6i8:
3780 ; SCALAR-NEXT: movq (%rdi), %rax
3781 ; SCALAR-NEXT: movq %rax, %rcx
3782 ; SCALAR-NEXT: shrq $32, %rcx
3783 ; SCALAR-NEXT: notl %ecx
3784 ; SCALAR-NEXT: notl %eax
3785 ; SCALAR-NEXT: movl %eax, (%rsi)
3786 ; SCALAR-NEXT: movw %cx, 4(%rsi)
3787 ; SCALAR-NEXT: movw %cx, 4(%rdx)
3788 ; SCALAR-NEXT: movl %eax, (%rdx)
3789 ; SCALAR-NEXT: movw %cx, 12(%rdx)
3790 ; SCALAR-NEXT: movl %eax, 8(%rdx)
3791 ; SCALAR-NEXT: movw %cx, 20(%rdx)
3792 ; SCALAR-NEXT: movl %eax, 16(%rdx)
3793 ; SCALAR-NEXT: movw %cx, 28(%rdx)
3794 ; SCALAR-NEXT: movl %eax, 24(%rdx)
3795 ; SCALAR-NEXT: movw %cx, 36(%rdx)
3796 ; SCALAR-NEXT: movl %eax, 32(%rdx)
3797 ; SCALAR-NEXT: movw %cx, 44(%rdx)
3798 ; SCALAR-NEXT: movl %eax, 40(%rdx)
3799 ; SCALAR-NEXT: movw %cx, 52(%rdx)
3800 ; SCALAR-NEXT: movl %eax, 48(%rdx)
3801 ; SCALAR-NEXT: movw %cx, 60(%rdx)
3802 ; SCALAR-NEXT: movl %eax, 56(%rdx)
3805 ; SSE2-ONLY-LABEL: vec384_v6i8:
3806 ; SSE2-ONLY: # %bb.0:
3807 ; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3808 ; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
3809 ; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1
3810 ; SSE2-ONLY-NEXT: movd %xmm1, (%rsi)
3811 ; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax
3812 ; SSE2-ONLY-NEXT: movw %ax, 4(%rsi)
3813 ; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
3814 ; SSE2-ONLY-NEXT: movd %xmm1, (%rdx)
3815 ; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
3816 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
3817 ; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
3818 ; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx)
3819 ; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
3820 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
3821 ; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
3822 ; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx)
3823 ; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
3824 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
3825 ; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
3826 ; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx)
3827 ; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
3828 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
3829 ; SSE2-ONLY-NEXT: retq
3831 ; SSE3-LABEL: vec384_v6i8:
3833 ; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3834 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
3835 ; SSE3-NEXT: pxor %xmm0, %xmm1
3836 ; SSE3-NEXT: movd %xmm1, (%rsi)
3837 ; SSE3-NEXT: pextrw $2, %xmm1, %eax
3838 ; SSE3-NEXT: movw %ax, 4(%rsi)
3839 ; SSE3-NEXT: movw %ax, 4(%rdx)
3840 ; SSE3-NEXT: movd %xmm1, (%rdx)
3841 ; SSE3-NEXT: movw %ax, 12(%rdx)
3842 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
3843 ; SSE3-NEXT: movw %ax, 20(%rdx)
3844 ; SSE3-NEXT: movd %xmm1, 16(%rdx)
3845 ; SSE3-NEXT: movw %ax, 28(%rdx)
3846 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
3847 ; SSE3-NEXT: movw %ax, 36(%rdx)
3848 ; SSE3-NEXT: movd %xmm1, 32(%rdx)
3849 ; SSE3-NEXT: movw %ax, 44(%rdx)
3850 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
3851 ; SSE3-NEXT: movw %ax, 52(%rdx)
3852 ; SSE3-NEXT: movd %xmm1, 48(%rdx)
3853 ; SSE3-NEXT: movw %ax, 60(%rdx)
3854 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
3857 ; SSSE3-ONLY-LABEL: vec384_v6i8:
3858 ; SSSE3-ONLY: # %bb.0:
3859 ; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3860 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
3861 ; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1
3862 ; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi)
3863 ; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax
3864 ; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi)
3865 ; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
3866 ; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx)
3867 ; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
3868 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
3869 ; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
3870 ; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx)
3871 ; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
3872 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
3873 ; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
3874 ; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx)
3875 ; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
3876 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
3877 ; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
3878 ; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx)
3879 ; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
3880 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
3881 ; SSSE3-ONLY-NEXT: retq
3883 ; SSE41-LABEL: vec384_v6i8:
3885 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3886 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
3887 ; SSE41-NEXT: pxor %xmm0, %xmm1
3888 ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi)
3889 ; SSE41-NEXT: movd %xmm1, (%rsi)
3890 ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx)
3891 ; SSE41-NEXT: movd %xmm1, (%rdx)
3892 ; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx)
3893 ; SSE41-NEXT: movd %xmm1, 8(%rdx)
3894 ; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx)
3895 ; SSE41-NEXT: movd %xmm1, 16(%rdx)
3896 ; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx)
3897 ; SSE41-NEXT: movd %xmm1, 24(%rdx)
3898 ; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx)
3899 ; SSE41-NEXT: movd %xmm1, 32(%rdx)
3900 ; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx)
3901 ; SSE41-NEXT: movd %xmm1, 40(%rdx)
3902 ; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx)
3903 ; SSE41-NEXT: movd %xmm1, 48(%rdx)
3904 ; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx)
3905 ; SSE41-NEXT: movd %xmm1, 56(%rdx)
3908 ; SSE42-LABEL: vec384_v6i8:
3910 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3911 ; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
3912 ; SSE42-NEXT: pxor %xmm0, %xmm1
3913 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi)
3914 ; SSE42-NEXT: movd %xmm1, (%rsi)
3915 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx)
3916 ; SSE42-NEXT: movd %xmm1, (%rdx)
3917 ; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx)
3918 ; SSE42-NEXT: movd %xmm1, 8(%rdx)
3919 ; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx)
3920 ; SSE42-NEXT: movd %xmm1, 16(%rdx)
3921 ; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx)
3922 ; SSE42-NEXT: movd %xmm1, 24(%rdx)
3923 ; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx)
3924 ; SSE42-NEXT: movd %xmm1, 32(%rdx)
3925 ; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx)
3926 ; SSE42-NEXT: movd %xmm1, 40(%rdx)
3927 ; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx)
3928 ; SSE42-NEXT: movd %xmm1, 48(%rdx)
3929 ; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx)
3930 ; SSE42-NEXT: movd %xmm1, 56(%rdx)
3933 ; AVX1-LABEL: vec384_v6i8:
3935 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3936 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
3937 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
3938 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi)
3939 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
3940 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx)
3941 ; AVX1-NEXT: vmovd %xmm0, (%rdx)
3942 ; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx)
3943 ; AVX1-NEXT: vmovd %xmm0, 8(%rdx)
3944 ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx)
3945 ; AVX1-NEXT: vmovd %xmm0, 16(%rdx)
3946 ; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx)
3947 ; AVX1-NEXT: vmovd %xmm0, 24(%rdx)
3948 ; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx)
3949 ; AVX1-NEXT: vmovd %xmm0, 32(%rdx)
3950 ; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx)
3951 ; AVX1-NEXT: vmovd %xmm0, 40(%rdx)
3952 ; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx)
3953 ; AVX1-NEXT: vmovd %xmm0, 48(%rdx)
3954 ; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx)
3955 ; AVX1-NEXT: vmovd %xmm0, 56(%rdx)
3958 ; AVX2-ONLY-LABEL: vec384_v6i8:
3959 ; AVX2-ONLY: # %bb.0:
3960 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3961 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
3962 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
3963 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi)
3964 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
3965 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx)
3966 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx)
3967 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx)
3968 ; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx)
3969 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx)
3970 ; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx)
3971 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx)
3972 ; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx)
3973 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx)
3974 ; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx)
3975 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx)
3976 ; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx)
3977 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx)
3978 ; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx)
3979 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx)
3980 ; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx)
3981 ; AVX2-ONLY-NEXT: retq
3983 ; AVX512-LABEL: vec384_v6i8:
3985 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3986 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
3987 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi)
3988 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
3989 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx)
3990 ; AVX512-NEXT: vmovd %xmm0, (%rdx)
3991 ; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx)
3992 ; AVX512-NEXT: vmovd %xmm0, 8(%rdx)
3993 ; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx)
3994 ; AVX512-NEXT: vmovd %xmm0, 16(%rdx)
3995 ; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx)
3996 ; AVX512-NEXT: vmovd %xmm0, 24(%rdx)
3997 ; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx)
3998 ; AVX512-NEXT: vmovd %xmm0, 32(%rdx)
3999 ; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx)
4000 ; AVX512-NEXT: vmovd %xmm0, 40(%rdx)
4001 ; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx)
4002 ; AVX512-NEXT: vmovd %xmm0, 48(%rdx)
4003 ; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx)
4004 ; AVX512-NEXT: vmovd %xmm0, 56(%rdx)
4006 %in.subvec.not = load <6 x i8>, ptr %in.subvec.ptr, align 64
4007 %in.subvec = xor <6 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4008 store <6 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4009 %out.subvec0.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 0
4010 store <6 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4011 %out.subvec1.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 1
4012 store <6 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
4013 %out.subvec2.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 2
4014 store <6 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
4015 %out.subvec3.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 3
4016 store <6 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
4017 %out.subvec4.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 4
4018 store <6 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
4019 %out.subvec5.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 5
4020 store <6 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
4021 %out.subvec6.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 6
4022 store <6 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
4023 %out.subvec7.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 7
4024 store <6 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
4025 ret void
4026 }
4028 define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4029 ; SCALAR-LABEL: vec384_v6i16:
4031 ; SCALAR-NEXT: movq (%rdi), %rax
4032 ; SCALAR-NEXT: movl 8(%rdi), %ecx
4033 ; SCALAR-NEXT: notl %ecx
4034 ; SCALAR-NEXT: notq %rax
4035 ; SCALAR-NEXT: movq %rax, (%rsi)
4036 ; SCALAR-NEXT: movl %ecx, 8(%rsi)
4037 ; SCALAR-NEXT: movl %ecx, 8(%rdx)
4038 ; SCALAR-NEXT: movq %rax, (%rdx)
4039 ; SCALAR-NEXT: movl %ecx, 24(%rdx)
4040 ; SCALAR-NEXT: movq %rax, 16(%rdx)
4041 ; SCALAR-NEXT: movl %ecx, 40(%rdx)
4042 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4043 ; SCALAR-NEXT: movl %ecx, 56(%rdx)
4044 ; SCALAR-NEXT: movq %rax, 48(%rdx)
4047 ; SSE2-ONLY-LABEL: vec384_v6i16:
4048 ; SSE2-ONLY: # %bb.0:
4049 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4050 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
4051 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
4052 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4053 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
4054 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
4055 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
4056 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
4057 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
4058 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
4059 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
4060 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
4061 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
4062 ; SSE2-ONLY-NEXT: retq
4064 ; SSE3-LABEL: vec384_v6i16:
4066 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
4067 ; SSE3-NEXT: pxor (%rdi), %xmm0
4068 ; SSE3-NEXT: movq %xmm0, (%rsi)
4069 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4070 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
4071 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
4072 ; SSE3-NEXT: movq %xmm0, (%rdx)
4073 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
4074 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
4075 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
4076 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
4077 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
4078 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
4081 ; SSSE3-ONLY-LABEL: vec384_v6i16:
4082 ; SSSE3-ONLY: # %bb.0:
4083 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4084 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
4085 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
4086 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4087 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
4088 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
4089 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
4090 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
4091 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
4092 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
4093 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
4094 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
4095 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
4096 ; SSSE3-ONLY-NEXT: retq
4098 ; SSE41-LABEL: vec384_v6i16:
4100 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
4101 ; SSE41-NEXT: pxor (%rdi), %xmm0
4102 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
4103 ; SSE41-NEXT: movq %xmm0, (%rsi)
4104 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
4105 ; SSE41-NEXT: movq %xmm0, (%rdx)
4106 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
4107 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
4108 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
4109 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
4110 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
4111 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
4114 ; SSE42-LABEL: vec384_v6i16:
4116 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
4117 ; SSE42-NEXT: pxor (%rdi), %xmm0
4118 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
4119 ; SSE42-NEXT: movq %xmm0, (%rsi)
4120 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
4121 ; SSE42-NEXT: movq %xmm0, (%rdx)
4122 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
4123 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
4124 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
4125 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
4126 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
4127 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
4130 ; AVX-LABEL: vec384_v6i16:
4132 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
4133 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
4134 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
4135 ; AVX-NEXT: vmovq %xmm0, (%rsi)
4136 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
4137 ; AVX-NEXT: vmovq %xmm0, (%rdx)
4138 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
4139 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
4140 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
4141 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
4142 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
4143 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
4145 %in.subvec.not = load <6 x i16>, ptr %in.subvec.ptr, align 64
4146 %in.subvec = xor <6 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4147 store <6 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4148 %out.subvec0.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 0
4149 store <6 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4150 %out.subvec1.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 1
4151 store <6 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
4152 %out.subvec2.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 2
4153 store <6 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
4154 %out.subvec3.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 3
4155 store <6 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
4156 ret void
4157 }
4159 define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4160 ; SCALAR-LABEL: vec384_v6i32:
4162 ; SCALAR-NEXT: movq (%rdi), %rax
4163 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4164 ; SCALAR-NEXT: movq 16(%rdi), %rdi
4165 ; SCALAR-NEXT: notq %rdi
4166 ; SCALAR-NEXT: notq %rcx
4167 ; SCALAR-NEXT: notq %rax
4168 ; SCALAR-NEXT: movq %rax, (%rsi)
4169 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
4170 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
4171 ; SCALAR-NEXT: movq %rax, (%rdx)
4172 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
4173 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
4174 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
4175 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
4176 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4179 ; SSE2-LABEL: vec384_v6i32:
4181 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4182 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4183 ; SSE2-NEXT: pxor %xmm0, %xmm1
4184 ; SSE2-NEXT: pxor (%rdi), %xmm0
4185 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4186 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
4187 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
4188 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4189 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
4190 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
4193 ; AVX1-LABEL: vec384_v6i32:
4195 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
4196 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
4197 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
4198 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4199 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
4200 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
4201 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
4202 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
4203 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
4204 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
4205 ; AVX1-NEXT: vzeroupper
4208 ; AVX2-LABEL: vec384_v6i32:
4210 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
4211 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
4212 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4213 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
4214 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
4215 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
4216 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
4217 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
4218 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
4219 ; AVX2-NEXT: vzeroupper
4221 %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64
4222 %in.subvec = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
4223 store <6 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
4224 %out.subvec0.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 0
4225 store <6 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
4226 %out.subvec1.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 1
4227 store <6 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
4228 ret void
4229 }
4231 define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4232 ; SCALAR-LABEL: vec384_v6f32:
4234 ; SCALAR-NEXT: movq (%rdi), %rax
4235 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4236 ; SCALAR-NEXT: movq 16(%rdi), %rdi
4237 ; SCALAR-NEXT: notq %rdi
4238 ; SCALAR-NEXT: notq %rcx
4239 ; SCALAR-NEXT: notq %rax
4240 ; SCALAR-NEXT: movq %rax, (%rsi)
4241 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
4242 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
4243 ; SCALAR-NEXT: movq %rax, (%rdx)
4244 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
4245 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
4246 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
4247 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
4248 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4251 ; SSE2-LABEL: vec384_v6f32:
4253 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4254 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4255 ; SSE2-NEXT: pxor %xmm0, %xmm1
4256 ; SSE2-NEXT: pxor (%rdi), %xmm0
4257 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4258 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
4259 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
4260 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4261 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
4262 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
4265 ; AVX1-LABEL: vec384_v6f32:
4267 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
4268 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
4269 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
4270 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4271 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
4272 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
4273 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
4274 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
4275 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
4276 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
4277 ; AVX1-NEXT: vzeroupper
4280 ; AVX2-LABEL: vec384_v6f32:
4282 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
4283 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
4284 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4285 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
4286 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
4287 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
4288 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
4289 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
4290 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
4291 ; AVX2-NEXT: vzeroupper
4293 %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64
4294 %in.subvec.int = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
4295 %in.subvec = bitcast <6 x i32> %in.subvec.int to <6 x float>
4296 store <6 x float> %in.subvec, ptr %out.subvec.ptr, align 64
4297 %out.subvec0.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 0
4298 store <6 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
4299 %out.subvec1.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 1
4300 store <6 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
4301 ret void
4302 }
4304 define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4305 ; SCALAR-LABEL: vec384_v8i8:
4307 ; SCALAR-NEXT: pushq %rbx
4308 ; SCALAR-NEXT: movzbl 7(%rdi), %ebx
4309 ; SCALAR-NEXT: movzbl 6(%rdi), %r11d
4310 ; SCALAR-NEXT: movzbl 5(%rdi), %r10d
4311 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
4312 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
4313 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
4314 ; SCALAR-NEXT: movzbl (%rdi), %eax
4315 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
4316 ; SCALAR-NEXT: notb %al
4317 ; SCALAR-NEXT: notb %dil
4318 ; SCALAR-NEXT: notb %cl
4319 ; SCALAR-NEXT: notb %r8b
4320 ; SCALAR-NEXT: notb %r9b
4321 ; SCALAR-NEXT: notb %r10b
4322 ; SCALAR-NEXT: notb %r11b
4323 ; SCALAR-NEXT: notb %bl
4324 ; SCALAR-NEXT: movb %bl, 7(%rsi)
4325 ; SCALAR-NEXT: movb %r11b, 6(%rsi)
4326 ; SCALAR-NEXT: movb %r10b, 5(%rsi)
4327 ; SCALAR-NEXT: movb %r9b, 4(%rsi)
4328 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
4329 ; SCALAR-NEXT: movb %cl, 2(%rsi)
4330 ; SCALAR-NEXT: movb %dil, 1(%rsi)
4331 ; SCALAR-NEXT: movb %al, (%rsi)
4332 ; SCALAR-NEXT: movb %bl, 7(%rdx)
4333 ; SCALAR-NEXT: movb %r11b, 6(%rdx)
4334 ; SCALAR-NEXT: movb %r10b, 5(%rdx)
4335 ; SCALAR-NEXT: movb %r9b, 4(%rdx)
4336 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
4337 ; SCALAR-NEXT: movb %cl, 2(%rdx)
4338 ; SCALAR-NEXT: movb %dil, 1(%rdx)
4339 ; SCALAR-NEXT: movb %al, (%rdx)
4340 ; SCALAR-NEXT: movb %bl, 15(%rdx)
4341 ; SCALAR-NEXT: movb %r11b, 14(%rdx)
4342 ; SCALAR-NEXT: movb %r10b, 13(%rdx)
4343 ; SCALAR-NEXT: movb %r9b, 12(%rdx)
4344 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
4345 ; SCALAR-NEXT: movb %cl, 10(%rdx)
4346 ; SCALAR-NEXT: movb %dil, 9(%rdx)
4347 ; SCALAR-NEXT: movb %al, 8(%rdx)
4348 ; SCALAR-NEXT: movb %bl, 23(%rdx)
4349 ; SCALAR-NEXT: movb %r11b, 22(%rdx)
4350 ; SCALAR-NEXT: movb %r10b, 21(%rdx)
4351 ; SCALAR-NEXT: movb %r9b, 20(%rdx)
4352 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
4353 ; SCALAR-NEXT: movb %cl, 18(%rdx)
4354 ; SCALAR-NEXT: movb %dil, 17(%rdx)
4355 ; SCALAR-NEXT: movb %al, 16(%rdx)
4356 ; SCALAR-NEXT: movb %bl, 31(%rdx)
4357 ; SCALAR-NEXT: movb %r11b, 30(%rdx)
4358 ; SCALAR-NEXT: movb %r10b, 29(%rdx)
4359 ; SCALAR-NEXT: movb %r9b, 28(%rdx)
4360 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
4361 ; SCALAR-NEXT: movb %cl, 26(%rdx)
4362 ; SCALAR-NEXT: movb %dil, 25(%rdx)
4363 ; SCALAR-NEXT: movb %al, 24(%rdx)
4364 ; SCALAR-NEXT: movb %bl, 39(%rdx)
4365 ; SCALAR-NEXT: movb %r11b, 38(%rdx)
4366 ; SCALAR-NEXT: movb %r10b, 37(%rdx)
4367 ; SCALAR-NEXT: movb %r9b, 36(%rdx)
4368 ; SCALAR-NEXT: movb %r8b, 35(%rdx)
4369 ; SCALAR-NEXT: movb %cl, 34(%rdx)
4370 ; SCALAR-NEXT: movb %dil, 33(%rdx)
4371 ; SCALAR-NEXT: movb %al, 32(%rdx)
4372 ; SCALAR-NEXT: movb %bl, 47(%rdx)
4373 ; SCALAR-NEXT: movb %r11b, 46(%rdx)
4374 ; SCALAR-NEXT: movb %r10b, 45(%rdx)
4375 ; SCALAR-NEXT: movb %r9b, 44(%rdx)
4376 ; SCALAR-NEXT: movb %r8b, 43(%rdx)
4377 ; SCALAR-NEXT: movb %cl, 42(%rdx)
4378 ; SCALAR-NEXT: movb %dil, 41(%rdx)
4379 ; SCALAR-NEXT: movb %al, 40(%rdx)
4380 ; SCALAR-NEXT: popq %rbx
4383 ; SSE2-LABEL: vec384_v8i8:
4385 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
4386 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
4387 ; SSE2-NEXT: pxor %xmm0, %xmm1
4388 ; SSE2-NEXT: movq %xmm1, (%rsi)
4389 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
4390 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4391 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
4392 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
4395 ; AVX1-LABEL: vec384_v8i8:
4397 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4398 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
4399 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
4400 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
4401 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4402 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
4403 ; AVX1-NEXT: vmovaps %ymm1, (%rdx)
4404 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
4405 ; AVX1-NEXT: vzeroupper
4408 ; AVX2-ONLY-LABEL: vec384_v8i8:
4409 ; AVX2-ONLY: # %bb.0:
4410 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4411 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
4412 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
4413 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
4414 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
4415 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
4416 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
4417 ; AVX2-ONLY-NEXT: vzeroupper
4418 ; AVX2-ONLY-NEXT: retq
4420 ; AVX512-LABEL: vec384_v8i8:
4422 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4423 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
4424 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
4425 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
4426 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
4427 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
4428 ; AVX512-NEXT: vzeroupper
4430 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
4431 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4432 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4433 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
4434 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4435 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
4436 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
4437 %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
4438 store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
4439 %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
4440 store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
4441 %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4
4442 store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32
4443 %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5
4444 store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8
4445 ret void
4446 }
4448 define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4449 ; SCALAR-LABEL: vec384_v8i16:
4451 ; SCALAR-NEXT: pushq %rbx
4452 ; SCALAR-NEXT: movzwl 14(%rdi), %ebx
4453 ; SCALAR-NEXT: movl 12(%rdi), %r11d
4454 ; SCALAR-NEXT: movzwl 10(%rdi), %r10d
4455 ; SCALAR-NEXT: movl 8(%rdi), %r9d
4456 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
4457 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
4458 ; SCALAR-NEXT: movl (%rdi), %eax
4459 ; SCALAR-NEXT: movl 4(%rdi), %edi
4460 ; SCALAR-NEXT: notl %eax
4461 ; SCALAR-NEXT: notl %ecx
4462 ; SCALAR-NEXT: notl %edi
4463 ; SCALAR-NEXT: notl %r8d
4464 ; SCALAR-NEXT: notl %r9d
4465 ; SCALAR-NEXT: notl %r10d
4466 ; SCALAR-NEXT: notl %r11d
4467 ; SCALAR-NEXT: notl %ebx
4468 ; SCALAR-NEXT: movw %bx, 14(%rsi)
4469 ; SCALAR-NEXT: movw %r11w, 12(%rsi)
4470 ; SCALAR-NEXT: movw %r10w, 10(%rsi)
4471 ; SCALAR-NEXT: movw %r9w, 8(%rsi)
4472 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
4473 ; SCALAR-NEXT: movw %di, 4(%rsi)
4474 ; SCALAR-NEXT: movw %cx, 2(%rsi)
4475 ; SCALAR-NEXT: movw %ax, (%rsi)
4476 ; SCALAR-NEXT: movw %bx, 14(%rdx)
4477 ; SCALAR-NEXT: movw %r11w, 12(%rdx)
4478 ; SCALAR-NEXT: movw %r10w, 10(%rdx)
4479 ; SCALAR-NEXT: movw %r9w, 8(%rdx)
4480 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
4481 ; SCALAR-NEXT: movw %di, 4(%rdx)
4482 ; SCALAR-NEXT: movw %cx, 2(%rdx)
4483 ; SCALAR-NEXT: movw %ax, (%rdx)
4484 ; SCALAR-NEXT: movw %bx, 30(%rdx)
4485 ; SCALAR-NEXT: movw %r11w, 28(%rdx)
4486 ; SCALAR-NEXT: movw %r10w, 26(%rdx)
4487 ; SCALAR-NEXT: movw %r9w, 24(%rdx)
4488 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
4489 ; SCALAR-NEXT: movw %di, 20(%rdx)
4490 ; SCALAR-NEXT: movw %cx, 18(%rdx)
4491 ; SCALAR-NEXT: movw %ax, 16(%rdx)
4492 ; SCALAR-NEXT: movw %bx, 46(%rdx)
4493 ; SCALAR-NEXT: movw %r11w, 44(%rdx)
4494 ; SCALAR-NEXT: movw %r10w, 42(%rdx)
4495 ; SCALAR-NEXT: movw %r9w, 40(%rdx)
4496 ; SCALAR-NEXT: movw %r8w, 38(%rdx)
4497 ; SCALAR-NEXT: movw %di, 36(%rdx)
4498 ; SCALAR-NEXT: movw %cx, 34(%rdx)
4499 ; SCALAR-NEXT: movw %ax, 32(%rdx)
4500 ; SCALAR-NEXT: popq %rbx
4503 ; SSE2-LABEL: vec384_v8i16:
4505 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4506 ; SSE2-NEXT: pxor (%rdi), %xmm0
4507 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4508 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4509 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
4510 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
4513 ; AVX-LABEL: vec384_v8i16:
4515 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
4516 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
4517 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
4518 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
4519 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
4520 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
4522 %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
4523 %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4524 store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4525 %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
4526 store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4527 %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
4528 store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
4529 %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2
4530 store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32
4531 ret void
4532 }
4534 define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4535 ; SCALAR-LABEL: vec384_v12i8:
4537 ; SCALAR-NEXT: movq (%rdi), %rax
4538 ; SCALAR-NEXT: movl 8(%rdi), %ecx
4539 ; SCALAR-NEXT: notl %ecx
4540 ; SCALAR-NEXT: notq %rax
4541 ; SCALAR-NEXT: movq %rax, (%rsi)
4542 ; SCALAR-NEXT: movl %ecx, 8(%rsi)
4543 ; SCALAR-NEXT: movl %ecx, 8(%rdx)
4544 ; SCALAR-NEXT: movq %rax, (%rdx)
4545 ; SCALAR-NEXT: movl %ecx, 24(%rdx)
4546 ; SCALAR-NEXT: movq %rax, 16(%rdx)
4547 ; SCALAR-NEXT: movl %ecx, 40(%rdx)
4548 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4549 ; SCALAR-NEXT: movl %ecx, 56(%rdx)
4550 ; SCALAR-NEXT: movq %rax, 48(%rdx)
4553 ; SSE2-ONLY-LABEL: vec384_v12i8:
4554 ; SSE2-ONLY: # %bb.0:
4555 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4556 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
4557 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
4558 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4559 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
4560 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
4561 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
4562 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
4563 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
4564 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
4565 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
4566 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
4567 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
4568 ; SSE2-ONLY-NEXT: retq
4570 ; SSE3-LABEL: vec384_v12i8:
4572 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
4573 ; SSE3-NEXT: pxor (%rdi), %xmm0
4574 ; SSE3-NEXT: movq %xmm0, (%rsi)
4575 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4576 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
4577 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
4578 ; SSE3-NEXT: movq %xmm0, (%rdx)
4579 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
4580 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
4581 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
4582 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
4583 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
4584 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
4587 ; SSSE3-ONLY-LABEL: vec384_v12i8:
4588 ; SSSE3-ONLY: # %bb.0:
4589 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4590 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
4591 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
4592 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4593 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
4594 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
4595 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
4596 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
4597 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
4598 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
4599 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
4600 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
4601 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
4602 ; SSSE3-ONLY-NEXT: retq
4604 ; SSE41-LABEL: vec384_v12i8:
4606 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
4607 ; SSE41-NEXT: pxor (%rdi), %xmm0
4608 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
4609 ; SSE41-NEXT: movq %xmm0, (%rsi)
4610 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
4611 ; SSE41-NEXT: movq %xmm0, (%rdx)
4612 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
4613 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
4614 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
4615 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
4616 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
4617 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
4620 ; SSE42-LABEL: vec384_v12i8:
4622 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
4623 ; SSE42-NEXT: pxor (%rdi), %xmm0
4624 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
4625 ; SSE42-NEXT: movq %xmm0, (%rsi)
4626 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
4627 ; SSE42-NEXT: movq %xmm0, (%rdx)
4628 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
4629 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
4630 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
4631 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
4632 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
4633 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
4636 ; AVX-LABEL: vec384_v12i8:
4638 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
4639 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
4640 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
4641 ; AVX-NEXT: vmovq %xmm0, (%rsi)
4642 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
4643 ; AVX-NEXT: vmovq %xmm0, (%rdx)
4644 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
4645 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
4646 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
4647 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
4648 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
4649 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
4651 %in.subvec.not = load <12 x i8>, ptr %in.subvec.ptr, align 64
4652 %in.subvec = xor <12 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4653 store <12 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4654 %out.subvec0.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 0
4655 store <12 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4656 %out.subvec1.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 1
4657 store <12 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
4658 %out.subvec2.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 2
4659 store <12 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
4660 %out.subvec3.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 3
4661 store <12 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
4662 ret void
4663 }
4665 define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4666 ; SCALAR-LABEL: vec384_v12i16:
4668 ; SCALAR-NEXT: movq (%rdi), %rax
4669 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4670 ; SCALAR-NEXT: movq 16(%rdi), %rdi
4671 ; SCALAR-NEXT: notq %rdi
4672 ; SCALAR-NEXT: notq %rcx
4673 ; SCALAR-NEXT: notq %rax
4674 ; SCALAR-NEXT: movq %rax, (%rsi)
4675 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
4676 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
4677 ; SCALAR-NEXT: movq %rax, (%rdx)
4678 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
4679 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
4680 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
4681 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
4682 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4685 ; SSE2-LABEL: vec384_v12i16:
4687 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4688 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4689 ; SSE2-NEXT: pxor %xmm0, %xmm1
4690 ; SSE2-NEXT: pxor (%rdi), %xmm0
4691 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4692 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
4693 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
4694 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4695 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
4696 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
4699 ; AVX1-LABEL: vec384_v12i16:
4701 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
4702 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
4703 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
4704 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4705 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
4706 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
4707 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
4708 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
4709 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
4710 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
4711 ; AVX1-NEXT: vzeroupper
4714 ; AVX2-LABEL: vec384_v12i16:
4716 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
4717 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
4718 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4719 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
4720 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
4721 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
4722 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
4723 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
4724 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
4725 ; AVX2-NEXT: vzeroupper
4727 %in.subvec.not = load <12 x i16>, ptr %in.subvec.ptr, align 64
4728 %in.subvec = xor <12 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4729 store <12 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4730 %out.subvec0.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 0
4731 store <12 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4732 %out.subvec1.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 1
4733 store <12 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
4734 ret void
4735 }
4737 define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4738 ; SCALAR-LABEL: vec384_v16i8:
4740 ; SCALAR-NEXT: pushq %rbp
4741 ; SCALAR-NEXT: pushq %r15
4742 ; SCALAR-NEXT: pushq %r14
4743 ; SCALAR-NEXT: pushq %r13
4744 ; SCALAR-NEXT: pushq %r12
4745 ; SCALAR-NEXT: pushq %rbx
4746 ; SCALAR-NEXT: movzbl 15(%rdi), %eax
4747 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4748 ; SCALAR-NEXT: movzbl 14(%rdi), %eax
4749 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4750 ; SCALAR-NEXT: movzbl 13(%rdi), %eax
4751 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4752 ; SCALAR-NEXT: movzbl 12(%rdi), %r11d
4753 ; SCALAR-NEXT: movzbl 11(%rdi), %r13d
4754 ; SCALAR-NEXT: movzbl 10(%rdi), %r12d
4755 ; SCALAR-NEXT: movzbl 9(%rdi), %ebp
4756 ; SCALAR-NEXT: movzbl 8(%rdi), %r14d
4757 ; SCALAR-NEXT: movzbl 7(%rdi), %ebx
4758 ; SCALAR-NEXT: movzbl 6(%rdi), %r10d
4759 ; SCALAR-NEXT: movzbl 5(%rdi), %r15d
4760 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
4761 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
4762 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
4763 ; SCALAR-NEXT: movzbl (%rdi), %eax
4764 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
4765 ; SCALAR-NEXT: notb %al
4766 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4767 ; SCALAR-NEXT: notb %dil
4768 ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4769 ; SCALAR-NEXT: notb %cl
4770 ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4771 ; SCALAR-NEXT: notb %r8b
4772 ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4773 ; SCALAR-NEXT: notb %r9b
4774 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4775 ; SCALAR-NEXT: movl %r15d, %r9d
4776 ; SCALAR-NEXT: notb %r9b
4777 ; SCALAR-NEXT: notb %r10b
4778 ; SCALAR-NEXT: notb %bl
4779 ; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4780 ; SCALAR-NEXT: notb %r14b
4781 ; SCALAR-NEXT: notb %bpl
4782 ; SCALAR-NEXT: movl %ebp, %r15d
4783 ; SCALAR-NEXT: notb %r12b
4784 ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4785 ; SCALAR-NEXT: notb %r13b
4786 ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4787 ; SCALAR-NEXT: notb %r11b
4788 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
4789 ; SCALAR-NEXT: notb %dil
4790 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
4791 ; SCALAR-NEXT: notb %cl
4792 ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4793 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
4794 ; SCALAR-NEXT: notb %r8b
4795 ; SCALAR-NEXT: movb %r8b, 15(%rsi)
4796 ; SCALAR-NEXT: movb %cl, 14(%rsi)
4797 ; SCALAR-NEXT: movl %edi, %eax
4798 ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4799 ; SCALAR-NEXT: movb %dil, 13(%rsi)
4800 ; SCALAR-NEXT: movb %r11b, 12(%rsi)
4801 ; SCALAR-NEXT: movl %r11d, %ebp
4802 ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4803 ; SCALAR-NEXT: movb %r13b, 11(%rsi)
4804 ; SCALAR-NEXT: movb %r12b, 10(%rsi)
4805 ; SCALAR-NEXT: movb %r15b, 9(%rsi)
4806 ; SCALAR-NEXT: movb %r14b, 8(%rsi)
4807 ; SCALAR-NEXT: movb %bl, 7(%rsi)
4808 ; SCALAR-NEXT: movb %r10b, 6(%rsi)
4809 ; SCALAR-NEXT: movl %r10d, %ebx
4810 ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4811 ; SCALAR-NEXT: movb %r9b, 5(%rsi)
4812 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
4813 ; SCALAR-NEXT: movb %r11b, 4(%rsi)
4814 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
4815 ; SCALAR-NEXT: movb %r12b, 3(%rsi)
4816 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
4817 ; SCALAR-NEXT: movb %cl, 2(%rsi)
4818 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
4819 ; SCALAR-NEXT: movb %r13b, 1(%rsi)
4820 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
4821 ; SCALAR-NEXT: movb %r10b, (%rsi)
4822 ; SCALAR-NEXT: movb %r8b, 15(%rdx)
4823 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
4824 ; SCALAR-NEXT: movb %dil, 14(%rdx)
4825 ; SCALAR-NEXT: movb %al, 13(%rdx)
4826 ; SCALAR-NEXT: movb %bpl, 12(%rdx)
4827 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4828 ; SCALAR-NEXT: movb %al, 11(%rdx)
4829 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4830 ; SCALAR-NEXT: movb %al, 10(%rdx)
4831 ; SCALAR-NEXT: movb %r15b, 9(%rdx)
4832 ; SCALAR-NEXT: movb %r14b, 8(%rdx)
4833 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
4834 ; SCALAR-NEXT: movb %bpl, 7(%rdx)
4835 ; SCALAR-NEXT: movb %bl, 6(%rdx)
4836 ; SCALAR-NEXT: movb %r9b, 5(%rdx)
4837 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4838 ; SCALAR-NEXT: movb %r11b, 4(%rdx)
4839 ; SCALAR-NEXT: movb %r12b, 3(%rdx)
4840 ; SCALAR-NEXT: movb %cl, 2(%rdx)
4841 ; SCALAR-NEXT: movl %r13d, %ebx
4842 ; SCALAR-NEXT: movb %r13b, 1(%rdx)
4843 ; SCALAR-NEXT: movl %r10d, %esi
4844 ; SCALAR-NEXT: movb %r10b, (%rdx)
4845 ; SCALAR-NEXT: movb %r8b, 31(%rdx)
4846 ; SCALAR-NEXT: movb %dil, 30(%rdx)
4847 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4848 ; SCALAR-NEXT: movb %al, 29(%rdx)
4849 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
4850 ; SCALAR-NEXT: movb %r11b, 28(%rdx)
4851 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
4852 ; SCALAR-NEXT: movb %r13b, 27(%rdx)
4853 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
4854 ; SCALAR-NEXT: movb %r12b, 26(%rdx)
4855 ; SCALAR-NEXT: movb %r15b, 25(%rdx)
4856 ; SCALAR-NEXT: movb %r14b, 24(%rdx)
4857 ; SCALAR-NEXT: movb %bpl, 23(%rdx)
4858 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
4859 ; SCALAR-NEXT: movb %r10b, 22(%rdx)
4860 ; SCALAR-NEXT: movb %r9b, 21(%rdx)
4861 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
4862 ; SCALAR-NEXT: movb %r9b, 20(%rdx)
4863 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
4864 ; SCALAR-NEXT: movb %dil, 19(%rdx)
4865 ; SCALAR-NEXT: movb %cl, 18(%rdx)
4866 ; SCALAR-NEXT: movb %bl, 17(%rdx)
4867 ; SCALAR-NEXT: movb %sil, 16(%rdx)
4868 ; SCALAR-NEXT: movb %r8b, 47(%rdx)
4869 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
4870 ; SCALAR-NEXT: movb %r8b, 46(%rdx)
4871 ; SCALAR-NEXT: movb %al, 45(%rdx)
4872 ; SCALAR-NEXT: movb %r11b, 44(%rdx)
4873 ; SCALAR-NEXT: movb %r13b, 43(%rdx)
4874 ; SCALAR-NEXT: movb %r12b, 42(%rdx)
4875 ; SCALAR-NEXT: movb %r15b, 41(%rdx)
4876 ; SCALAR-NEXT: movb %r14b, 40(%rdx)
4877 ; SCALAR-NEXT: movb %bpl, 39(%rdx)
4878 ; SCALAR-NEXT: movb %r10b, 38(%rdx)
4879 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
4880 ; SCALAR-NEXT: movb %al, 37(%rdx)
4881 ; SCALAR-NEXT: movb %r9b, 36(%rdx)
4882 ; SCALAR-NEXT: movb %dil, 35(%rdx)
4883 ; SCALAR-NEXT: movb %cl, 34(%rdx)
4884 ; SCALAR-NEXT: movb %bl, 33(%rdx)
4885 ; SCALAR-NEXT: movb %sil, 32(%rdx)
4886 ; SCALAR-NEXT: popq %rbx
4887 ; SCALAR-NEXT: popq %r12
4888 ; SCALAR-NEXT: popq %r13
4889 ; SCALAR-NEXT: popq %r14
4890 ; SCALAR-NEXT: popq %r15
4891 ; SCALAR-NEXT: popq %rbp
4894 ; SSE2-LABEL: vec384_v16i8:
4896 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4897 ; SSE2-NEXT: pxor (%rdi), %xmm0
4898 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4899 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4900 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
4901 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
4904 ; AVX-LABEL: vec384_v16i8:
4906 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
4907 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
4908 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
4909 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
4910 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
4911 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
4913 %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
4914 %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4915 store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4916 %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
4917 store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4918 %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
4919 store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
4920 %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2
4921 store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32
4922 ret void
4923 }
4925 define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4926 ; SCALAR-LABEL: vec384_v24i8:
4928 ; SCALAR-NEXT: movq (%rdi), %rax
4929 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4930 ; SCALAR-NEXT: movq 16(%rdi), %rdi
4931 ; SCALAR-NEXT: notq %rdi
4932 ; SCALAR-NEXT: notq %rcx
4933 ; SCALAR-NEXT: notq %rax
4934 ; SCALAR-NEXT: movq %rax, (%rsi)
4935 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
4936 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
4937 ; SCALAR-NEXT: movq %rax, (%rdx)
4938 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
4939 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
4940 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
4941 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
4942 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4945 ; SSE2-LABEL: vec384_v24i8:
4947 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4948 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4949 ; SSE2-NEXT: pxor %xmm0, %xmm1
4950 ; SSE2-NEXT: pxor (%rdi), %xmm0
4951 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4952 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
4953 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
4954 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4955 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
4956 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
4959 ; AVX1-LABEL: vec384_v24i8:
4961 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
4962 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
4963 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
4964 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4965 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
4966 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
4967 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
4968 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
4969 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
4970 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
4971 ; AVX1-NEXT: vzeroupper
4974 ; AVX2-LABEL: vec384_v24i8:
4976 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
4977 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
4978 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4979 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
4980 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
4981 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
4982 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
4983 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
4984 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
4985 ; AVX2-NEXT: vzeroupper
4987 %in.subvec.not = load <24 x i8>, ptr %in.subvec.ptr, align 64
4988 %in.subvec = xor <24 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4989 store <24 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4990 %out.subvec0.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 0
4991 store <24 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4992 %out.subvec1.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 1
4993 store <24 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
4994 ret void
4995 }
4997 define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4998 ; SCALAR-LABEL: vec512_v2i8:
5000 ; SCALAR-NEXT: movzbl (%rdi), %eax
5001 ; SCALAR-NEXT: movzbl 1(%rdi), %ecx
5002 ; SCALAR-NEXT: notb %al
5003 ; SCALAR-NEXT: notb %cl
5004 ; SCALAR-NEXT: movb %cl, 1(%rsi)
5005 ; SCALAR-NEXT: movb %al, (%rsi)
5006 ; SCALAR-NEXT: movb %cl, 1(%rdx)
5007 ; SCALAR-NEXT: movb %al, (%rdx)
5008 ; SCALAR-NEXT: movb %cl, 3(%rdx)
5009 ; SCALAR-NEXT: movb %al, 2(%rdx)
5010 ; SCALAR-NEXT: movb %cl, 5(%rdx)
5011 ; SCALAR-NEXT: movb %al, 4(%rdx)
5012 ; SCALAR-NEXT: movb %cl, 7(%rdx)
5013 ; SCALAR-NEXT: movb %al, 6(%rdx)
5014 ; SCALAR-NEXT: movb %cl, 9(%rdx)
5015 ; SCALAR-NEXT: movb %al, 8(%rdx)
5016 ; SCALAR-NEXT: movb %cl, 11(%rdx)
5017 ; SCALAR-NEXT: movb %al, 10(%rdx)
5018 ; SCALAR-NEXT: movb %cl, 13(%rdx)
5019 ; SCALAR-NEXT: movb %al, 12(%rdx)
5020 ; SCALAR-NEXT: movb %cl, 15(%rdx)
5021 ; SCALAR-NEXT: movb %al, 14(%rdx)
5022 ; SCALAR-NEXT: movb %cl, 17(%rdx)
5023 ; SCALAR-NEXT: movb %al, 16(%rdx)
5024 ; SCALAR-NEXT: movb %cl, 19(%rdx)
5025 ; SCALAR-NEXT: movb %al, 18(%rdx)
5026 ; SCALAR-NEXT: movb %cl, 21(%rdx)
5027 ; SCALAR-NEXT: movb %al, 20(%rdx)
5028 ; SCALAR-NEXT: movb %cl, 23(%rdx)
5029 ; SCALAR-NEXT: movb %al, 22(%rdx)
5030 ; SCALAR-NEXT: movb %cl, 25(%rdx)
5031 ; SCALAR-NEXT: movb %al, 24(%rdx)
5032 ; SCALAR-NEXT: movb %cl, 27(%rdx)
5033 ; SCALAR-NEXT: movb %al, 26(%rdx)
5034 ; SCALAR-NEXT: movb %cl, 29(%rdx)
5035 ; SCALAR-NEXT: movb %al, 28(%rdx)
5036 ; SCALAR-NEXT: movb %cl, 31(%rdx)
5037 ; SCALAR-NEXT: movb %al, 30(%rdx)
5038 ; SCALAR-NEXT: movb %cl, 33(%rdx)
5039 ; SCALAR-NEXT: movb %al, 32(%rdx)
5040 ; SCALAR-NEXT: movb %cl, 35(%rdx)
5041 ; SCALAR-NEXT: movb %al, 34(%rdx)
5042 ; SCALAR-NEXT: movb %cl, 37(%rdx)
5043 ; SCALAR-NEXT: movb %al, 36(%rdx)
5044 ; SCALAR-NEXT: movb %cl, 39(%rdx)
5045 ; SCALAR-NEXT: movb %al, 38(%rdx)
5046 ; SCALAR-NEXT: movb %cl, 41(%rdx)
5047 ; SCALAR-NEXT: movb %al, 40(%rdx)
5048 ; SCALAR-NEXT: movb %cl, 43(%rdx)
5049 ; SCALAR-NEXT: movb %al, 42(%rdx)
5050 ; SCALAR-NEXT: movb %cl, 45(%rdx)
5051 ; SCALAR-NEXT: movb %al, 44(%rdx)
5052 ; SCALAR-NEXT: movb %cl, 47(%rdx)
5053 ; SCALAR-NEXT: movb %al, 46(%rdx)
5054 ; SCALAR-NEXT: movb %cl, 49(%rdx)
5055 ; SCALAR-NEXT: movb %al, 48(%rdx)
5056 ; SCALAR-NEXT: movb %cl, 51(%rdx)
5057 ; SCALAR-NEXT: movb %al, 50(%rdx)
5058 ; SCALAR-NEXT: movb %cl, 53(%rdx)
5059 ; SCALAR-NEXT: movb %al, 52(%rdx)
5060 ; SCALAR-NEXT: movb %cl, 55(%rdx)
5061 ; SCALAR-NEXT: movb %al, 54(%rdx)
5062 ; SCALAR-NEXT: movb %cl, 57(%rdx)
5063 ; SCALAR-NEXT: movb %al, 56(%rdx)
5064 ; SCALAR-NEXT: movb %cl, 59(%rdx)
5065 ; SCALAR-NEXT: movb %al, 58(%rdx)
5066 ; SCALAR-NEXT: movb %cl, 61(%rdx)
5067 ; SCALAR-NEXT: movb %al, 60(%rdx)
5068 ; SCALAR-NEXT: movb %cl, 63(%rdx)
5069 ; SCALAR-NEXT: movb %al, 62(%rdx)
5072 ; SSE2-ONLY-LABEL: vec512_v2i8:
5073 ; SSE2-ONLY: # %bb.0:
5074 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
5075 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
5076 ; SSE2-ONLY-NEXT: movd %xmm0, %eax
5077 ; SSE2-ONLY-NEXT: movw %ax, (%rsi)
5078 ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5079 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5080 ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
5081 ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
5082 ; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
5083 ; SSE2-ONLY-NEXT: movdqa %xmm0, 48(%rdx)
5084 ; SSE2-ONLY-NEXT: retq
5086 ; SSE3-LABEL: vec512_v2i8:
5088 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
5089 ; SSE3-NEXT: pxor (%rdi), %xmm0
5090 ; SSE3-NEXT: movd %xmm0, %eax
5091 ; SSE3-NEXT: movw %ax, (%rsi)
5092 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5093 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5094 ; SSE3-NEXT: movdqa %xmm0, (%rdx)
5095 ; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
5096 ; SSE3-NEXT: movdqa %xmm0, 32(%rdx)
5097 ; SSE3-NEXT: movdqa %xmm0, 48(%rdx)
5100 ; SSSE3-ONLY-LABEL: vec512_v2i8:
5101 ; SSSE3-ONLY: # %bb.0:
5102 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
5103 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
5104 ; SSSE3-ONLY-NEXT: movd %xmm0, %eax
5105 ; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
5106 ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5107 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5108 ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
5109 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
5110 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
5111 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 48(%rdx)
5112 ; SSSE3-ONLY-NEXT: retq
5114 ; SSE41-LABEL: vec512_v2i8:
5116 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
5117 ; SSE41-NEXT: pxor (%rdi), %xmm0
5118 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
5119 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5120 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5121 ; SSE41-NEXT: movdqa %xmm0, (%rdx)
5122 ; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
5123 ; SSE41-NEXT: movdqa %xmm0, 32(%rdx)
5124 ; SSE41-NEXT: movdqa %xmm0, 48(%rdx)
5127 ; SSE42-LABEL: vec512_v2i8:
5129 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
5130 ; SSE42-NEXT: pxor (%rdi), %xmm0
5131 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
5132 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5133 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5134 ; SSE42-NEXT: movdqa %xmm0, (%rdx)
5135 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
5136 ; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
5137 ; SSE42-NEXT: movdqa %xmm0, 48(%rdx)
5140 ; AVX1-LABEL: vec512_v2i8:
5142 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5143 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
5144 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
5145 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5146 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5147 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5148 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5149 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5150 ; AVX1-NEXT: vzeroupper
5153 ; AVX2-ONLY-LABEL: vec512_v2i8:
5154 ; AVX2-ONLY: # %bb.0:
5155 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5156 ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0
5157 ; AVX2-ONLY-NEXT: vpextrw $0, %xmm0, (%rsi)
5158 ; AVX2-ONLY-NEXT: vpbroadcastw %xmm0, %ymm0
5159 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5160 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5161 ; AVX2-ONLY-NEXT: vzeroupper
5162 ; AVX2-ONLY-NEXT: retq
5164 ; AVX512F-LABEL: vec512_v2i8:
5166 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5167 ; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0
5168 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
5169 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
5170 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5171 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx)
5172 ; AVX512F-NEXT: vzeroupper
5173 ; AVX512F-NEXT: retq
5175 ; AVX512BW-LABEL: vec512_v2i8:
5176 ; AVX512BW: # %bb.0:
5177 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5178 ; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0
5179 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
5180 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0
5181 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
5182 ; AVX512BW-NEXT: vzeroupper
5183 ; AVX512BW-NEXT: retq
5184 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
5185 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
5186 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
5187 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
5188 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
5189 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
5190 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
5191 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
5192 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
5193 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
5194 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
5195 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
5196 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
5197 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
5198 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
5199 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
5200 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
5201 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
5202 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
5203 %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
5204 store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
5205 %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
5206 store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
5207 %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
5208 store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
5209 %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
5210 store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
5211 %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
5212 store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
5213 %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
5214 store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
5215 %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
5216 store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
5217 %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
5218 store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
5219 %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16
5220 store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32
5221 %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17
5222 store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2
5223 %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18
5224 store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4
5225 %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19
5226 store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2
5227 %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20
5228 store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8
5229 %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21
5230 store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2
5231 %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22
5232 store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4
5233 %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23
5234 store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2
5235 %out.subvec24.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 24
5236 store <2 x i8> %in.subvec, ptr %out.subvec24.ptr, align 16
5237 %out.subvec25.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 25
5238 store <2 x i8> %in.subvec, ptr %out.subvec25.ptr, align 2
5239 %out.subvec26.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 26
5240 store <2 x i8> %in.subvec, ptr %out.subvec26.ptr, align 4
5241 %out.subvec27.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 27
5242 store <2 x i8> %in.subvec, ptr %out.subvec27.ptr, align 2
5243 %out.subvec28.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 28
5244 store <2 x i8> %in.subvec, ptr %out.subvec28.ptr, align 8
5245 %out.subvec29.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 29
5246 store <2 x i8> %in.subvec, ptr %out.subvec29.ptr, align 2
5247 %out.subvec30.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 30
5248 store <2 x i8> %in.subvec, ptr %out.subvec30.ptr, align 4
5249 %out.subvec31.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 31
5250 store <2 x i8> %in.subvec, ptr %out.subvec31.ptr, align 2
5251 ret void
5252 }
5254 define void @vec512_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5255 ; SCALAR-LABEL: vec512_v2i16:
5257 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
5258 ; SCALAR-NEXT: movl (%rdi), %eax
5259 ; SCALAR-NEXT: notl %eax
5260 ; SCALAR-NEXT: notl %ecx
5261 ; SCALAR-NEXT: movw %cx, 2(%rsi)
5262 ; SCALAR-NEXT: movw %ax, (%rsi)
5263 ; SCALAR-NEXT: movw %cx, 2(%rdx)
5264 ; SCALAR-NEXT: movw %ax, (%rdx)
5265 ; SCALAR-NEXT: movw %cx, 6(%rdx)
5266 ; SCALAR-NEXT: movw %ax, 4(%rdx)
5267 ; SCALAR-NEXT: movw %cx, 10(%rdx)
5268 ; SCALAR-NEXT: movw %ax, 8(%rdx)
5269 ; SCALAR-NEXT: movw %cx, 14(%rdx)
5270 ; SCALAR-NEXT: movw %ax, 12(%rdx)
5271 ; SCALAR-NEXT: movw %cx, 18(%rdx)
5272 ; SCALAR-NEXT: movw %ax, 16(%rdx)
5273 ; SCALAR-NEXT: movw %cx, 22(%rdx)
5274 ; SCALAR-NEXT: movw %ax, 20(%rdx)
5275 ; SCALAR-NEXT: movw %cx, 26(%rdx)
5276 ; SCALAR-NEXT: movw %ax, 24(%rdx)
5277 ; SCALAR-NEXT: movw %cx, 30(%rdx)
5278 ; SCALAR-NEXT: movw %ax, 28(%rdx)
5279 ; SCALAR-NEXT: movw %cx, 34(%rdx)
5280 ; SCALAR-NEXT: movw %ax, 32(%rdx)
5281 ; SCALAR-NEXT: movw %cx, 38(%rdx)
5282 ; SCALAR-NEXT: movw %ax, 36(%rdx)
5283 ; SCALAR-NEXT: movw %cx, 42(%rdx)
5284 ; SCALAR-NEXT: movw %ax, 40(%rdx)
5285 ; SCALAR-NEXT: movw %cx, 46(%rdx)
5286 ; SCALAR-NEXT: movw %ax, 44(%rdx)
5287 ; SCALAR-NEXT: movw %cx, 50(%rdx)
5288 ; SCALAR-NEXT: movw %ax, 48(%rdx)
5289 ; SCALAR-NEXT: movw %cx, 54(%rdx)
5290 ; SCALAR-NEXT: movw %ax, 52(%rdx)
5291 ; SCALAR-NEXT: movw %cx, 58(%rdx)
5292 ; SCALAR-NEXT: movw %ax, 56(%rdx)
5293 ; SCALAR-NEXT: movw %cx, 62(%rdx)
5294 ; SCALAR-NEXT: movw %ax, 60(%rdx)
5297 ; SSE2-LABEL: vec512_v2i16:
5299 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
5300 ; SSE2-NEXT: pxor (%rdi), %xmm0
5301 ; SSE2-NEXT: movd %xmm0, (%rsi)
5302 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5303 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5304 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5305 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5306 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5309 ; AVX1-LABEL: vec512_v2i16:
5311 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5312 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
5313 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
5314 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5315 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5316 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5317 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5318 ; AVX1-NEXT: vzeroupper
5321 ; AVX2-ONLY-LABEL: vec512_v2i16:
5322 ; AVX2-ONLY: # %bb.0:
5323 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5324 ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0
5325 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
5326 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0
5327 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5328 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5329 ; AVX2-ONLY-NEXT: vzeroupper
5330 ; AVX2-ONLY-NEXT: retq
5332 ; AVX512-LABEL: vec512_v2i16:
5334 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5335 ; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
5336 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
5337 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
5338 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
5339 ; AVX512-NEXT: vzeroupper
5341 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
5342 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
5343 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
5344 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
5345 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
5346 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
5347 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
5348 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
5349 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
5350 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
5351 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
5352 %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
5353 store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
5354 %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
5355 store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
5356 %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
5357 store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
5358 %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
5359 store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
5360 %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8
5361 store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32
5362 %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9
5363 store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4
5364 %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10
5365 store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8
5366 %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11
5367 store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4
5368 %out.subvec12.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 12
5369 store <2 x i16> %in.subvec, ptr %out.subvec12.ptr, align 16
5370 %out.subvec13.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 13
5371 store <2 x i16> %in.subvec, ptr %out.subvec13.ptr, align 4
5372 %out.subvec14.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 14
5373 store <2 x i16> %in.subvec, ptr %out.subvec14.ptr, align 8
5374 %out.subvec15.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 15
5375 store <2 x i16> %in.subvec, ptr %out.subvec15.ptr, align 4
5376 ret void
5377 }
5379 define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5380 ; SCALAR-LABEL: vec512_v2i32:
5382 ; SCALAR-NEXT: movl (%rdi), %eax
5383 ; SCALAR-NEXT: movl 4(%rdi), %ecx
5384 ; SCALAR-NEXT: notl %eax
5385 ; SCALAR-NEXT: notl %ecx
5386 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
5387 ; SCALAR-NEXT: movl %eax, (%rsi)
5388 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
5389 ; SCALAR-NEXT: movl %eax, (%rdx)
5390 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
5391 ; SCALAR-NEXT: movl %eax, 8(%rdx)
5392 ; SCALAR-NEXT: movl %ecx, 20(%rdx)
5393 ; SCALAR-NEXT: movl %eax, 16(%rdx)
5394 ; SCALAR-NEXT: movl %ecx, 28(%rdx)
5395 ; SCALAR-NEXT: movl %eax, 24(%rdx)
5396 ; SCALAR-NEXT: movl %ecx, 36(%rdx)
5397 ; SCALAR-NEXT: movl %eax, 32(%rdx)
5398 ; SCALAR-NEXT: movl %ecx, 44(%rdx)
5399 ; SCALAR-NEXT: movl %eax, 40(%rdx)
5400 ; SCALAR-NEXT: movl %ecx, 52(%rdx)
5401 ; SCALAR-NEXT: movl %eax, 48(%rdx)
5402 ; SCALAR-NEXT: movl %ecx, 60(%rdx)
5403 ; SCALAR-NEXT: movl %eax, 56(%rdx)
5406 ; SSE2-LABEL: vec512_v2i32:
5408 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
5409 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
5410 ; SSE2-NEXT: pxor %xmm0, %xmm1
5411 ; SSE2-NEXT: movq %xmm1, (%rsi)
5412 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
5413 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5414 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5415 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5416 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5419 ; AVX1-LABEL: vec512_v2i32:
5421 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5422 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5423 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
5424 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
5425 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5426 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5427 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5428 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5429 ; AVX1-NEXT: vzeroupper
5432 ; AVX2-ONLY-LABEL: vec512_v2i32:
5433 ; AVX2-ONLY: # %bb.0:
5434 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5435 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5436 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
5437 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
5438 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
5439 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5440 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5441 ; AVX2-ONLY-NEXT: vzeroupper
5442 ; AVX2-ONLY-NEXT: retq
5444 ; AVX512-LABEL: vec512_v2i32:
5446 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5447 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
5448 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
5449 ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
5450 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
5451 ; AVX512-NEXT: vzeroupper
5453 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
5454 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
5455 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
5456 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
5457 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
5458 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
5459 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
5460 %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
5461 store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
5462 %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
5463 store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
5464 %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4
5465 store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32
5466 %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5
5467 store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8
5468 %out.subvec6.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 6
5469 store <2 x i32> %in.subvec, ptr %out.subvec6.ptr, align 16
5470 %out.subvec7.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 7
5471 store <2 x i32> %in.subvec, ptr %out.subvec7.ptr, align 8
5472 ret void
5473 }
5475 define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5476 ; SCALAR-LABEL: vec512_v2f32:
5478 ; SCALAR-NEXT: movl (%rdi), %eax
5479 ; SCALAR-NEXT: movl 4(%rdi), %ecx
5480 ; SCALAR-NEXT: notl %eax
5481 ; SCALAR-NEXT: notl %ecx
5482 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
5483 ; SCALAR-NEXT: movl %eax, (%rsi)
5484 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
5485 ; SCALAR-NEXT: movl %eax, (%rdx)
5486 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
5487 ; SCALAR-NEXT: movl %eax, 8(%rdx)
5488 ; SCALAR-NEXT: movl %ecx, 20(%rdx)
5489 ; SCALAR-NEXT: movl %eax, 16(%rdx)
5490 ; SCALAR-NEXT: movl %ecx, 28(%rdx)
5491 ; SCALAR-NEXT: movl %eax, 24(%rdx)
5492 ; SCALAR-NEXT: movl %ecx, 36(%rdx)
5493 ; SCALAR-NEXT: movl %eax, 32(%rdx)
5494 ; SCALAR-NEXT: movl %ecx, 44(%rdx)
5495 ; SCALAR-NEXT: movl %eax, 40(%rdx)
5496 ; SCALAR-NEXT: movl %ecx, 52(%rdx)
5497 ; SCALAR-NEXT: movl %eax, 48(%rdx)
5498 ; SCALAR-NEXT: movl %ecx, 60(%rdx)
5499 ; SCALAR-NEXT: movl %eax, 56(%rdx)
5502 ; SSE2-LABEL: vec512_v2f32:
5504 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
5505 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
5506 ; SSE2-NEXT: pxor %xmm0, %xmm1
5507 ; SSE2-NEXT: movq %xmm1, (%rsi)
5508 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
5509 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5510 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5511 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5512 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5515 ; AVX1-LABEL: vec512_v2f32:
5517 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5518 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5519 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
5520 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
5521 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5522 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5523 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5524 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5525 ; AVX1-NEXT: vzeroupper
5528 ; AVX2-ONLY-LABEL: vec512_v2f32:
5529 ; AVX2-ONLY: # %bb.0:
5530 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5531 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5532 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
5533 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
5534 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
5535 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5536 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5537 ; AVX2-ONLY-NEXT: vzeroupper
5538 ; AVX2-ONLY-NEXT: retq
5540 ; AVX512-LABEL: vec512_v2f32:
5542 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5543 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
5544 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
5545 ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
5546 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
5547 ; AVX512-NEXT: vzeroupper
5549 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
5550 %in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
5551 %in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
5552 store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
5553 %out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
5554 store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
5555 %out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
5556 store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
5557 %out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
5558 store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
5559 %out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
5560 store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
5561 %out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4
5562 store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32
5563 %out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5
5564 store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8
5565 %out.subvec6.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 6
5566 store <2 x float> %in.subvec, ptr %out.subvec6.ptr, align 16
5567 %out.subvec7.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 7
5568 store <2 x float> %in.subvec, ptr %out.subvec7.ptr, align 8
5569 ret void
5570 }
5572 define void @vec512_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5573 ; SCALAR-LABEL: vec512_v2i64:
5575 ; SCALAR-NEXT: movq (%rdi), %rax
5576 ; SCALAR-NEXT: movq 8(%rdi), %rcx
5577 ; SCALAR-NEXT: notq %rax
5578 ; SCALAR-NEXT: notq %rcx
5579 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
5580 ; SCALAR-NEXT: movq %rax, (%rsi)
5581 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
5582 ; SCALAR-NEXT: movq %rax, (%rdx)
5583 ; SCALAR-NEXT: movq %rcx, 24(%rdx)
5584 ; SCALAR-NEXT: movq %rax, 16(%rdx)
5585 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
5586 ; SCALAR-NEXT: movq %rax, 32(%rdx)
5587 ; SCALAR-NEXT: movq %rcx, 56(%rdx)
5588 ; SCALAR-NEXT: movq %rax, 48(%rdx)
5591 ; SSE2-LABEL: vec512_v2i64:
5593 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
5594 ; SSE2-NEXT: pxor (%rdi), %xmm0
5595 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
5596 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5597 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5598 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5599 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5602 ; AVX-LABEL: vec512_v2i64:
5604 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5605 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
5606 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
5607 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
5608 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
5609 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
5610 ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
5612 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
5613 %in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
5614 store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
5615 %out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
5616 store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
5617 %out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
5618 store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
5619 %out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2
5620 store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32
5621 %out.subvec3.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 3
5622 store <2 x i64> %in.subvec, ptr %out.subvec3.ptr, align 16
5623 ret void
5624 }
5626 define void @vec512_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5627 ; SCALAR-LABEL: vec512_v2f64:
5629 ; SCALAR-NEXT: movq (%rdi), %rax
5630 ; SCALAR-NEXT: movq 8(%rdi), %rcx
5631 ; SCALAR-NEXT: notq %rax
5632 ; SCALAR-NEXT: notq %rcx
5633 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
5634 ; SCALAR-NEXT: movq %rax, (%rsi)
5635 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
5636 ; SCALAR-NEXT: movq %rax, (%rdx)
5637 ; SCALAR-NEXT: movq %rcx, 24(%rdx)
5638 ; SCALAR-NEXT: movq %rax, 16(%rdx)
5639 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
5640 ; SCALAR-NEXT: movq %rax, 32(%rdx)
5641 ; SCALAR-NEXT: movq %rcx, 56(%rdx)
5642 ; SCALAR-NEXT: movq %rax, 48(%rdx)
5645 ; SSE2-LABEL: vec512_v2f64:
5647 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
5648 ; SSE2-NEXT: pxor (%rdi), %xmm0
5649 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
5650 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5651 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5652 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5653 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5656 ; AVX-LABEL: vec512_v2f64:
5658 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5659 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
5660 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
5661 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
5662 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
5663 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
5664 ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
5666 %in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
5667 %in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
5668 %in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
5669 store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
5670 %out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
5671 store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
5672 %out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
5673 store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
5674 %out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2
5675 store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32
5676 %out.subvec3.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 3
5677 store <2 x double> %in.subvec, ptr %out.subvec3.ptr, align 16
5678 ret void
5679 }
5681 define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5682 ; ALL-LABEL: vec512_v2i128:
5684 ; ALL-NEXT: movq 16(%rdi), %rax
5685 ; ALL-NEXT: movq 24(%rdi), %rcx
5686 ; ALL-NEXT: movq (%rdi), %r8
5687 ; ALL-NEXT: movq 8(%rdi), %rdi
5688 ; ALL-NEXT: notq %rdi
5689 ; ALL-NEXT: notq %r8
5690 ; ALL-NEXT: notq %rcx
5691 ; ALL-NEXT: notq %rax
5692 ; ALL-NEXT: movq %rax, 16(%rsi)
5693 ; ALL-NEXT: movq %rcx, 24(%rsi)
5694 ; ALL-NEXT: movq %r8, (%rsi)
5695 ; ALL-NEXT: movq %rdi, 8(%rsi)
5696 ; ALL-NEXT: movq %rax, 16(%rdx)
5697 ; ALL-NEXT: movq %rcx, 24(%rdx)
5698 ; ALL-NEXT: movq %r8, (%rdx)
5699 ; ALL-NEXT: movq %rdi, 8(%rdx)
5700 ; ALL-NEXT: movq %rax, 48(%rdx)
5701 ; ALL-NEXT: movq %rcx, 56(%rdx)
5702 ; ALL-NEXT: movq %r8, 32(%rdx)
5703 ; ALL-NEXT: movq %rdi, 40(%rdx)
5705 %in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64
5706 %in.subvec = xor <2 x i128> %in.subvec.not, <i128 -1, i128 -1>
5707 store <2 x i128> %in.subvec, ptr %out.subvec.ptr, align 64
5708 %out.subvec0.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 0
5709 store <2 x i128> %in.subvec, ptr %out.subvec0.ptr, align 64
5710 %out.subvec1.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 1
5711 store <2 x i128> %in.subvec, ptr %out.subvec1.ptr, align 32
5712 ret void
5713 }
5715 define void @vec512_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5716 ; SCALAR-LABEL: vec512_v4i8:
5718 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
5719 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
5720 ; SCALAR-NEXT: movzbl (%rdi), %eax
5721 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
5722 ; SCALAR-NEXT: notb %al
5723 ; SCALAR-NEXT: notb %dil
5724 ; SCALAR-NEXT: notb %cl
5725 ; SCALAR-NEXT: notb %r8b
5726 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
5727 ; SCALAR-NEXT: movb %cl, 2(%rsi)
5728 ; SCALAR-NEXT: movb %dil, 1(%rsi)
5729 ; SCALAR-NEXT: movb %al, (%rsi)
5730 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
5731 ; SCALAR-NEXT: movb %cl, 2(%rdx)
5732 ; SCALAR-NEXT: movb %dil, 1(%rdx)
5733 ; SCALAR-NEXT: movb %al, (%rdx)
5734 ; SCALAR-NEXT: movb %r8b, 7(%rdx)
5735 ; SCALAR-NEXT: movb %cl, 6(%rdx)
5736 ; SCALAR-NEXT: movb %dil, 5(%rdx)
5737 ; SCALAR-NEXT: movb %al, 4(%rdx)
5738 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
5739 ; SCALAR-NEXT: movb %cl, 10(%rdx)
5740 ; SCALAR-NEXT: movb %dil, 9(%rdx)
5741 ; SCALAR-NEXT: movb %al, 8(%rdx)
5742 ; SCALAR-NEXT: movb %r8b, 15(%rdx)
5743 ; SCALAR-NEXT: movb %cl, 14(%rdx)
5744 ; SCALAR-NEXT: movb %dil, 13(%rdx)
5745 ; SCALAR-NEXT: movb %al, 12(%rdx)
5746 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
5747 ; SCALAR-NEXT: movb %cl, 18(%rdx)
5748 ; SCALAR-NEXT: movb %dil, 17(%rdx)
5749 ; SCALAR-NEXT: movb %al, 16(%rdx)
5750 ; SCALAR-NEXT: movb %r8b, 23(%rdx)
5751 ; SCALAR-NEXT: movb %cl, 22(%rdx)
5752 ; SCALAR-NEXT: movb %dil, 21(%rdx)
5753 ; SCALAR-NEXT: movb %al, 20(%rdx)
5754 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
5755 ; SCALAR-NEXT: movb %cl, 26(%rdx)
5756 ; SCALAR-NEXT: movb %dil, 25(%rdx)
5757 ; SCALAR-NEXT: movb %al, 24(%rdx)
5758 ; SCALAR-NEXT: movb %r8b, 31(%rdx)
5759 ; SCALAR-NEXT: movb %cl, 30(%rdx)
5760 ; SCALAR-NEXT: movb %dil, 29(%rdx)
5761 ; SCALAR-NEXT: movb %al, 28(%rdx)
5762 ; SCALAR-NEXT: movb %r8b, 35(%rdx)
5763 ; SCALAR-NEXT: movb %cl, 34(%rdx)
5764 ; SCALAR-NEXT: movb %dil, 33(%rdx)
5765 ; SCALAR-NEXT: movb %al, 32(%rdx)
5766 ; SCALAR-NEXT: movb %r8b, 39(%rdx)
5767 ; SCALAR-NEXT: movb %cl, 38(%rdx)
5768 ; SCALAR-NEXT: movb %dil, 37(%rdx)
5769 ; SCALAR-NEXT: movb %al, 36(%rdx)
5770 ; SCALAR-NEXT: movb %r8b, 43(%rdx)
5771 ; SCALAR-NEXT: movb %cl, 42(%rdx)
5772 ; SCALAR-NEXT: movb %dil, 41(%rdx)
5773 ; SCALAR-NEXT: movb %al, 40(%rdx)
5774 ; SCALAR-NEXT: movb %r8b, 47(%rdx)
5775 ; SCALAR-NEXT: movb %cl, 46(%rdx)
5776 ; SCALAR-NEXT: movb %dil, 45(%rdx)
5777 ; SCALAR-NEXT: movb %al, 44(%rdx)
5778 ; SCALAR-NEXT: movb %r8b, 51(%rdx)
5779 ; SCALAR-NEXT: movb %cl, 50(%rdx)
5780 ; SCALAR-NEXT: movb %dil, 49(%rdx)
5781 ; SCALAR-NEXT: movb %al, 48(%rdx)
5782 ; SCALAR-NEXT: movb %r8b, 55(%rdx)
5783 ; SCALAR-NEXT: movb %cl, 54(%rdx)
5784 ; SCALAR-NEXT: movb %dil, 53(%rdx)
5785 ; SCALAR-NEXT: movb %al, 52(%rdx)
5786 ; SCALAR-NEXT: movb %r8b, 59(%rdx)
5787 ; SCALAR-NEXT: movb %cl, 58(%rdx)
5788 ; SCALAR-NEXT: movb %dil, 57(%rdx)
5789 ; SCALAR-NEXT: movb %al, 56(%rdx)
5790 ; SCALAR-NEXT: movb %r8b, 63(%rdx)
5791 ; SCALAR-NEXT: movb %cl, 62(%rdx)
5792 ; SCALAR-NEXT: movb %dil, 61(%rdx)
5793 ; SCALAR-NEXT: movb %al, 60(%rdx)
5796 ; SSE2-LABEL: vec512_v4i8:
5798 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
5799 ; SSE2-NEXT: pxor (%rdi), %xmm0
5800 ; SSE2-NEXT: movd %xmm0, (%rsi)
5801 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5802 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5803 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5804 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5805 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5808 ; AVX1-LABEL: vec512_v4i8:
5810 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5811 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
5812 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
5813 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5814 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5815 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5816 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5817 ; AVX1-NEXT: vzeroupper
5820 ; AVX2-ONLY-LABEL: vec512_v4i8:
5821 ; AVX2-ONLY: # %bb.0:
5822 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5823 ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0
5824 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
5825 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0
5826 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5827 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5828 ; AVX2-ONLY-NEXT: vzeroupper
5829 ; AVX2-ONLY-NEXT: retq
5831 ; AVX512-LABEL: vec512_v4i8:
5833 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5834 ; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
5835 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
5836 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
5837 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
5838 ; AVX512-NEXT: vzeroupper
5840 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
5841 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
5842 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
5843 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
5844 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
5845 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
5846 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
5847 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
5848 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
5849 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
5850 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
5851 %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
5852 store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
5853 %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
5854 store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
5855 %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
5856 store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
5857 %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
5858 store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
5859 %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8
5860 store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32
5861 %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9
5862 store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4
5863 %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10
5864 store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8
5865 %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11
5866 store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4
5867 %out.subvec12.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 12
5868 store <4 x i8> %in.subvec, ptr %out.subvec12.ptr, align 16
5869 %out.subvec13.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 13
5870 store <4 x i8> %in.subvec, ptr %out.subvec13.ptr, align 4
5871 %out.subvec14.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 14
5872 store <4 x i8> %in.subvec, ptr %out.subvec14.ptr, align 8
5873 %out.subvec15.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 15
5874 store <4 x i8> %in.subvec, ptr %out.subvec15.ptr, align 4
5878 define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5879 ; SCALAR-LABEL: vec512_v4i16:
5881 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
5882 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
5883 ; SCALAR-NEXT: movl (%rdi), %eax
5884 ; SCALAR-NEXT: movl 4(%rdi), %edi
5885 ; SCALAR-NEXT: notl %eax
5886 ; SCALAR-NEXT: notl %ecx
5887 ; SCALAR-NEXT: notl %edi
5888 ; SCALAR-NEXT: notl %r8d
5889 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
5890 ; SCALAR-NEXT: movw %di, 4(%rsi)
5891 ; SCALAR-NEXT: movw %cx, 2(%rsi)
5892 ; SCALAR-NEXT: movw %ax, (%rsi)
5893 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
5894 ; SCALAR-NEXT: movw %di, 4(%rdx)
5895 ; SCALAR-NEXT: movw %cx, 2(%rdx)
5896 ; SCALAR-NEXT: movw %ax, (%rdx)
5897 ; SCALAR-NEXT: movw %r8w, 14(%rdx)
5898 ; SCALAR-NEXT: movw %di, 12(%rdx)
5899 ; SCALAR-NEXT: movw %cx, 10(%rdx)
5900 ; SCALAR-NEXT: movw %ax, 8(%rdx)
5901 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
5902 ; SCALAR-NEXT: movw %di, 20(%rdx)
5903 ; SCALAR-NEXT: movw %cx, 18(%rdx)
5904 ; SCALAR-NEXT: movw %ax, 16(%rdx)
5905 ; SCALAR-NEXT: movw %r8w, 30(%rdx)
5906 ; SCALAR-NEXT: movw %di, 28(%rdx)
5907 ; SCALAR-NEXT: movw %cx, 26(%rdx)
5908 ; SCALAR-NEXT: movw %ax, 24(%rdx)
5909 ; SCALAR-NEXT: movw %r8w, 38(%rdx)
5910 ; SCALAR-NEXT: movw %di, 36(%rdx)
5911 ; SCALAR-NEXT: movw %cx, 34(%rdx)
5912 ; SCALAR-NEXT: movw %ax, 32(%rdx)
5913 ; SCALAR-NEXT: movw %r8w, 46(%rdx)
5914 ; SCALAR-NEXT: movw %di, 44(%rdx)
5915 ; SCALAR-NEXT: movw %cx, 42(%rdx)
5916 ; SCALAR-NEXT: movw %ax, 40(%rdx)
5917 ; SCALAR-NEXT: movw %r8w, 54(%rdx)
5918 ; SCALAR-NEXT: movw %di, 52(%rdx)
5919 ; SCALAR-NEXT: movw %cx, 50(%rdx)
5920 ; SCALAR-NEXT: movw %ax, 48(%rdx)
5921 ; SCALAR-NEXT: movw %r8w, 62(%rdx)
5922 ; SCALAR-NEXT: movw %di, 60(%rdx)
5923 ; SCALAR-NEXT: movw %cx, 58(%rdx)
5924 ; SCALAR-NEXT: movw %ax, 56(%rdx)
5927 ; SSE2-LABEL: vec512_v4i16:
5929 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
5930 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
5931 ; SSE2-NEXT: pxor %xmm0, %xmm1
5932 ; SSE2-NEXT: movq %xmm1, (%rsi)
5933 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
5934 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5935 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5936 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5937 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5940 ; AVX1-LABEL: vec512_v4i16:
5942 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5943 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5944 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
5945 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
5946 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5947 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5948 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5949 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5950 ; AVX1-NEXT: vzeroupper
5953 ; AVX2-ONLY-LABEL: vec512_v4i16:
5954 ; AVX2-ONLY: # %bb.0:
5955 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5956 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5957 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
5958 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
5959 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
5960 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5961 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5962 ; AVX2-ONLY-NEXT: vzeroupper
5963 ; AVX2-ONLY-NEXT: retq
5965 ; AVX512-LABEL: vec512_v4i16:
5967 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5968 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
5969 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
5970 ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
5971 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
5972 ; AVX512-NEXT: vzeroupper
5974 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
5975 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
5976 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
5977 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
5978 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
5979 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
5980 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
5981 %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
5982 store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
5983 %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
5984 store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
5985 %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4
5986 store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32
5987 %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5
5988 store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8
5989 %out.subvec6.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 6
5990 store <4 x i16> %in.subvec, ptr %out.subvec6.ptr, align 16
5991 %out.subvec7.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 7
5992 store <4 x i16> %in.subvec, ptr %out.subvec7.ptr, align 8
5996 define void @vec512_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5997 ; SCALAR-LABEL: vec512_v4i32:
5999 ; SCALAR-NEXT: movaps (%rdi), %xmm0
6000 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6001 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
6002 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
6003 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
6004 ; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
6005 ; SCALAR-NEXT: movaps %xmm0, 48(%rdx)
6008 ; SSE2-LABEL: vec512_v4i32:
6010 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6011 ; SSE2-NEXT: pxor (%rdi), %xmm0
6012 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6013 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6014 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
6015 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6016 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
6019 ; AVX-LABEL: vec512_v4i32:
6021 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
6022 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
6023 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
6024 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
6025 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
6026 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
6027 ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
6029 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
6030 %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
6031 store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
6032 %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
6033 store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
6034 %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
6035 store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
6036 %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2
6037 store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32
6038 %out.subvec3.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 3
6039 store <4 x i32> %in.subvec, ptr %out.subvec3.ptr, align 16
6043 define void @vec512_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6044 ; SCALAR-LABEL: vec512_v4f32:
6046 ; SCALAR-NEXT: movaps (%rdi), %xmm0
6047 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6048 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
6049 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
6050 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
6051 ; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
6052 ; SCALAR-NEXT: movaps %xmm0, 48(%rdx)
6055 ; SSE2-LABEL: vec512_v4f32:
6057 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6058 ; SSE2-NEXT: pxor (%rdi), %xmm0
6059 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6060 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6061 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
6062 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6063 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
6066 ; AVX-LABEL: vec512_v4f32:
6068 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
6069 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
6070 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
6071 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
6072 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
6073 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
6074 ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
6076 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
6077 %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
6078 %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
6079 store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
6080 %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
6081 store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
6082 %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
6083 store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
6084 %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2
6085 store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32
6086 %out.subvec3.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 3
6087 store <4 x float> %in.subvec, ptr %out.subvec3.ptr, align 16
6091 define void @vec512_v4i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6092 ; SCALAR-LABEL: vec512_v4i64:
6094 ; SCALAR-NEXT: movq 24(%rdi), %rax
6095 ; SCALAR-NEXT: movq 16(%rdi), %rcx
6096 ; SCALAR-NEXT: movq (%rdi), %r8
6097 ; SCALAR-NEXT: movq 8(%rdi), %rdi
6098 ; SCALAR-NEXT: notq %r8
6099 ; SCALAR-NEXT: notq %rdi
6100 ; SCALAR-NEXT: notq %rcx
6101 ; SCALAR-NEXT: notq %rax
6102 ; SCALAR-NEXT: movq %rax, 24(%rsi)
6103 ; SCALAR-NEXT: movq %rcx, 16(%rsi)
6104 ; SCALAR-NEXT: movq %rdi, 8(%rsi)
6105 ; SCALAR-NEXT: movq %r8, (%rsi)
6106 ; SCALAR-NEXT: movq %rax, 24(%rdx)
6107 ; SCALAR-NEXT: movq %rcx, 16(%rdx)
6108 ; SCALAR-NEXT: movq %rdi, 8(%rdx)
6109 ; SCALAR-NEXT: movq %r8, (%rdx)
6110 ; SCALAR-NEXT: movq %rax, 56(%rdx)
6111 ; SCALAR-NEXT: movq %rcx, 48(%rdx)
6112 ; SCALAR-NEXT: movq %rdi, 40(%rdx)
6113 ; SCALAR-NEXT: movq %r8, 32(%rdx)
6116 ; SSE2-LABEL: vec512_v4i64:
6118 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6119 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
6120 ; SSE2-NEXT: pxor %xmm0, %xmm1
6121 ; SSE2-NEXT: pxor (%rdi), %xmm0
6122 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6123 ; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
6124 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6125 ; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
6126 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
6127 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6130 ; AVX1-LABEL: vec512_v4i64:
6132 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
6133 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
6134 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
6135 ; AVX1-NEXT: vmovaps %ymm0, (%rsi)
6136 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
6137 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
6138 ; AVX1-NEXT: vzeroupper
6141 ; AVX2-LABEL: vec512_v4i64:
6143 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
6144 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
6145 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
6146 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
6147 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
6148 ; AVX2-NEXT: vzeroupper
6150 %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64
6151 %in.subvec = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1>
6152 store <4 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
6153 %out.subvec0.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 0
6154 store <4 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
6155 %out.subvec1.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 1
6156 store <4 x i64> %in.subvec, ptr %out.subvec1.ptr, align 32
6160 define void @vec512_v4f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6161 ; SCALAR-LABEL: vec512_v4f64:
6163 ; SCALAR-NEXT: movq 24(%rdi), %rax
6164 ; SCALAR-NEXT: movq 16(%rdi), %rcx
6165 ; SCALAR-NEXT: movq (%rdi), %r8
6166 ; SCALAR-NEXT: movq 8(%rdi), %rdi
6167 ; SCALAR-NEXT: notq %r8
6168 ; SCALAR-NEXT: notq %rdi
6169 ; SCALAR-NEXT: notq %rcx
6170 ; SCALAR-NEXT: notq %rax
6171 ; SCALAR-NEXT: movq %rax, 24(%rsi)
6172 ; SCALAR-NEXT: movq %rcx, 16(%rsi)
6173 ; SCALAR-NEXT: movq %rdi, 8(%rsi)
6174 ; SCALAR-NEXT: movq %r8, (%rsi)
6175 ; SCALAR-NEXT: movq %rax, 24(%rdx)
6176 ; SCALAR-NEXT: movq %rcx, 16(%rdx)
6177 ; SCALAR-NEXT: movq %rdi, 8(%rdx)
6178 ; SCALAR-NEXT: movq %r8, (%rdx)
6179 ; SCALAR-NEXT: movq %rax, 56(%rdx)
6180 ; SCALAR-NEXT: movq %rcx, 48(%rdx)
6181 ; SCALAR-NEXT: movq %rdi, 40(%rdx)
6182 ; SCALAR-NEXT: movq %r8, 32(%rdx)
6185 ; SSE2-LABEL: vec512_v4f64:
6187 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6188 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
6189 ; SSE2-NEXT: pxor %xmm0, %xmm1
6190 ; SSE2-NEXT: pxor (%rdi), %xmm0
6191 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6192 ; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
6193 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6194 ; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
6195 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
6196 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6199 ; AVX1-LABEL: vec512_v4f64:
6201 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
6202 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
6203 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
6204 ; AVX1-NEXT: vmovaps %ymm0, (%rsi)
6205 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
6206 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
6207 ; AVX1-NEXT: vzeroupper
6210 ; AVX2-LABEL: vec512_v4f64:
6212 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
6213 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
6214 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
6215 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
6216 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
6217 ; AVX2-NEXT: vzeroupper
6219 %in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64
6220 %in.subvec.int = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1>
6221 %in.subvec = bitcast <4 x i64> %in.subvec.int to <4 x double>
6222 store <4 x double> %in.subvec, ptr %out.subvec.ptr, align 64
6223 %out.subvec0.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 0
6224 store <4 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
6225 %out.subvec1.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 1
6226 store <4 x double> %in.subvec, ptr %out.subvec1.ptr, align 32
6230 define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6231 ; SCALAR-LABEL: vec512_v8i8:
6233 ; SCALAR-NEXT: pushq %rbx
6234 ; SCALAR-NEXT: movzbl 7(%rdi), %ebx
6235 ; SCALAR-NEXT: movzbl 6(%rdi), %r11d
6236 ; SCALAR-NEXT: movzbl 5(%rdi), %r10d
6237 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
6238 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
6239 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
6240 ; SCALAR-NEXT: movzbl (%rdi), %eax
6241 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
6242 ; SCALAR-NEXT: notb %al
6243 ; SCALAR-NEXT: notb %dil
6244 ; SCALAR-NEXT: notb %cl
6245 ; SCALAR-NEXT: notb %r8b
6246 ; SCALAR-NEXT: notb %r9b
6247 ; SCALAR-NEXT: notb %r10b
6248 ; SCALAR-NEXT: notb %r11b
6249 ; SCALAR-NEXT: notb %bl
6250 ; SCALAR-NEXT: movb %bl, 7(%rsi)
6251 ; SCALAR-NEXT: movb %r11b, 6(%rsi)
6252 ; SCALAR-NEXT: movb %r10b, 5(%rsi)
6253 ; SCALAR-NEXT: movb %r9b, 4(%rsi)
6254 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
6255 ; SCALAR-NEXT: movb %cl, 2(%rsi)
6256 ; SCALAR-NEXT: movb %dil, 1(%rsi)
6257 ; SCALAR-NEXT: movb %al, (%rsi)
6258 ; SCALAR-NEXT: movb %bl, 7(%rdx)
6259 ; SCALAR-NEXT: movb %r11b, 6(%rdx)
6260 ; SCALAR-NEXT: movb %r10b, 5(%rdx)
6261 ; SCALAR-NEXT: movb %r9b, 4(%rdx)
6262 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
6263 ; SCALAR-NEXT: movb %cl, 2(%rdx)
6264 ; SCALAR-NEXT: movb %dil, 1(%rdx)
6265 ; SCALAR-NEXT: movb %al, (%rdx)
6266 ; SCALAR-NEXT: movb %bl, 15(%rdx)
6267 ; SCALAR-NEXT: movb %r11b, 14(%rdx)
6268 ; SCALAR-NEXT: movb %r10b, 13(%rdx)
6269 ; SCALAR-NEXT: movb %r9b, 12(%rdx)
6270 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
6271 ; SCALAR-NEXT: movb %cl, 10(%rdx)
6272 ; SCALAR-NEXT: movb %dil, 9(%rdx)
6273 ; SCALAR-NEXT: movb %al, 8(%rdx)
6274 ; SCALAR-NEXT: movb %bl, 23(%rdx)
6275 ; SCALAR-NEXT: movb %r11b, 22(%rdx)
6276 ; SCALAR-NEXT: movb %r10b, 21(%rdx)
6277 ; SCALAR-NEXT: movb %r9b, 20(%rdx)
6278 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
6279 ; SCALAR-NEXT: movb %cl, 18(%rdx)
6280 ; SCALAR-NEXT: movb %dil, 17(%rdx)
6281 ; SCALAR-NEXT: movb %al, 16(%rdx)
6282 ; SCALAR-NEXT: movb %bl, 31(%rdx)
6283 ; SCALAR-NEXT: movb %r11b, 30(%rdx)
6284 ; SCALAR-NEXT: movb %r10b, 29(%rdx)
6285 ; SCALAR-NEXT: movb %r9b, 28(%rdx)
6286 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
6287 ; SCALAR-NEXT: movb %cl, 26(%rdx)
6288 ; SCALAR-NEXT: movb %dil, 25(%rdx)
6289 ; SCALAR-NEXT: movb %al, 24(%rdx)
6290 ; SCALAR-NEXT: movb %bl, 39(%rdx)
6291 ; SCALAR-NEXT: movb %r11b, 38(%rdx)
6292 ; SCALAR-NEXT: movb %r10b, 37(%rdx)
6293 ; SCALAR-NEXT: movb %r9b, 36(%rdx)
6294 ; SCALAR-NEXT: movb %r8b, 35(%rdx)
6295 ; SCALAR-NEXT: movb %cl, 34(%rdx)
6296 ; SCALAR-NEXT: movb %dil, 33(%rdx)
6297 ; SCALAR-NEXT: movb %al, 32(%rdx)
6298 ; SCALAR-NEXT: movb %bl, 47(%rdx)
6299 ; SCALAR-NEXT: movb %r11b, 46(%rdx)
6300 ; SCALAR-NEXT: movb %r10b, 45(%rdx)
6301 ; SCALAR-NEXT: movb %r9b, 44(%rdx)
6302 ; SCALAR-NEXT: movb %r8b, 43(%rdx)
6303 ; SCALAR-NEXT: movb %cl, 42(%rdx)
6304 ; SCALAR-NEXT: movb %dil, 41(%rdx)
6305 ; SCALAR-NEXT: movb %al, 40(%rdx)
6306 ; SCALAR-NEXT: movb %bl, 55(%rdx)
6307 ; SCALAR-NEXT: movb %r11b, 54(%rdx)
6308 ; SCALAR-NEXT: movb %r10b, 53(%rdx)
6309 ; SCALAR-NEXT: movb %r9b, 52(%rdx)
6310 ; SCALAR-NEXT: movb %r8b, 51(%rdx)
6311 ; SCALAR-NEXT: movb %cl, 50(%rdx)
6312 ; SCALAR-NEXT: movb %dil, 49(%rdx)
6313 ; SCALAR-NEXT: movb %al, 48(%rdx)
6314 ; SCALAR-NEXT: movb %bl, 63(%rdx)
6315 ; SCALAR-NEXT: movb %r11b, 62(%rdx)
6316 ; SCALAR-NEXT: movb %r10b, 61(%rdx)
6317 ; SCALAR-NEXT: movb %r9b, 60(%rdx)
6318 ; SCALAR-NEXT: movb %r8b, 59(%rdx)
6319 ; SCALAR-NEXT: movb %cl, 58(%rdx)
6320 ; SCALAR-NEXT: movb %dil, 57(%rdx)
6321 ; SCALAR-NEXT: movb %al, 56(%rdx)
6322 ; SCALAR-NEXT: popq %rbx
6325 ; SSE2-LABEL: vec512_v8i8:
6327 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
6328 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
6329 ; SSE2-NEXT: pxor %xmm0, %xmm1
6330 ; SSE2-NEXT: movq %xmm1, (%rsi)
6331 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
6332 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6333 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
6334 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6335 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
6338 ; AVX1-LABEL: vec512_v8i8:
6340 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
6341 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
6342 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
6343 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
6344 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
6345 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6346 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
6347 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
6348 ; AVX1-NEXT: vzeroupper
6351 ; AVX2-ONLY-LABEL: vec512_v8i8:
6352 ; AVX2-ONLY: # %bb.0:
6353 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
6354 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
6355 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
6356 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
6357 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
6358 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
6359 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
6360 ; AVX2-ONLY-NEXT: vzeroupper
6361 ; AVX2-ONLY-NEXT: retq
6363 ; AVX512-LABEL: vec512_v8i8:
6365 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
6366 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
6367 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
6368 ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
6369 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
6370 ; AVX512-NEXT: vzeroupper
6372 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
6373 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
6374 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
6375 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
6376 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
6377 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
6378 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
6379 %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
6380 store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
6381 %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
6382 store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
6383 %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4
6384 store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32
6385 %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5
6386 store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8
6387 %out.subvec6.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 6
6388 store <8 x i8> %in.subvec, ptr %out.subvec6.ptr, align 16
6389 %out.subvec7.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 7
6390 store <8 x i8> %in.subvec, ptr %out.subvec7.ptr, align 8
6394 define void @vec512_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6395 ; SCALAR-LABEL: vec512_v8i16:
6397 ; SCALAR-NEXT: pushq %rbx
6398 ; SCALAR-NEXT: movzwl 14(%rdi), %ebx
6399 ; SCALAR-NEXT: movl 12(%rdi), %r11d
6400 ; SCALAR-NEXT: movzwl 10(%rdi), %r10d
6401 ; SCALAR-NEXT: movl 8(%rdi), %r9d
6402 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
6403 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
6404 ; SCALAR-NEXT: movl (%rdi), %eax
6405 ; SCALAR-NEXT: movl 4(%rdi), %edi
6406 ; SCALAR-NEXT: notl %eax
6407 ; SCALAR-NEXT: notl %ecx
6408 ; SCALAR-NEXT: notl %edi
6409 ; SCALAR-NEXT: notl %r8d
6410 ; SCALAR-NEXT: notl %r9d
6411 ; SCALAR-NEXT: notl %r10d
6412 ; SCALAR-NEXT: notl %r11d
6413 ; SCALAR-NEXT: notl %ebx
6414 ; SCALAR-NEXT: movw %bx, 14(%rsi)
6415 ; SCALAR-NEXT: movw %r11w, 12(%rsi)
6416 ; SCALAR-NEXT: movw %r10w, 10(%rsi)
6417 ; SCALAR-NEXT: movw %r9w, 8(%rsi)
6418 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
6419 ; SCALAR-NEXT: movw %di, 4(%rsi)
6420 ; SCALAR-NEXT: movw %cx, 2(%rsi)
6421 ; SCALAR-NEXT: movw %ax, (%rsi)
6422 ; SCALAR-NEXT: movw %bx, 14(%rdx)
6423 ; SCALAR-NEXT: movw %r11w, 12(%rdx)
6424 ; SCALAR-NEXT: movw %r10w, 10(%rdx)
6425 ; SCALAR-NEXT: movw %r9w, 8(%rdx)
6426 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
6427 ; SCALAR-NEXT: movw %di, 4(%rdx)
6428 ; SCALAR-NEXT: movw %cx, 2(%rdx)
6429 ; SCALAR-NEXT: movw %ax, (%rdx)
6430 ; SCALAR-NEXT: movw %bx, 30(%rdx)
6431 ; SCALAR-NEXT: movw %r11w, 28(%rdx)
6432 ; SCALAR-NEXT: movw %r10w, 26(%rdx)
6433 ; SCALAR-NEXT: movw %r9w, 24(%rdx)
6434 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
6435 ; SCALAR-NEXT: movw %di, 20(%rdx)
6436 ; SCALAR-NEXT: movw %cx, 18(%rdx)
6437 ; SCALAR-NEXT: movw %ax, 16(%rdx)
6438 ; SCALAR-NEXT: movw %bx, 46(%rdx)
6439 ; SCALAR-NEXT: movw %r11w, 44(%rdx)
6440 ; SCALAR-NEXT: movw %r10w, 42(%rdx)
6441 ; SCALAR-NEXT: movw %r9w, 40(%rdx)
6442 ; SCALAR-NEXT: movw %r8w, 38(%rdx)
6443 ; SCALAR-NEXT: movw %di, 36(%rdx)
6444 ; SCALAR-NEXT: movw %cx, 34(%rdx)
6445 ; SCALAR-NEXT: movw %ax, 32(%rdx)
6446 ; SCALAR-NEXT: movw %bx, 62(%rdx)
6447 ; SCALAR-NEXT: movw %r11w, 60(%rdx)
6448 ; SCALAR-NEXT: movw %r10w, 58(%rdx)
6449 ; SCALAR-NEXT: movw %r9w, 56(%rdx)
6450 ; SCALAR-NEXT: movw %r8w, 54(%rdx)
6451 ; SCALAR-NEXT: movw %di, 52(%rdx)
6452 ; SCALAR-NEXT: movw %cx, 50(%rdx)
6453 ; SCALAR-NEXT: movw %ax, 48(%rdx)
6454 ; SCALAR-NEXT: popq %rbx
6457 ; SSE2-LABEL: vec512_v8i16:
6459 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6460 ; SSE2-NEXT: pxor (%rdi), %xmm0
6461 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6462 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6463 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
6464 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6465 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
6468 ; AVX-LABEL: vec512_v8i16:
6470 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
6471 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
6472 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
6473 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
6474 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
6475 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
6476 ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
6478 %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
6479 %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
6480 store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
6481 %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
6482 store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
6483 %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
6484 store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
6485 %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2
6486 store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32
6487 %out.subvec3.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 3
6488 store <8 x i16> %in.subvec, ptr %out.subvec3.ptr, align 16
6492 define void @vec512_v8i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6493 ; SCALAR-LABEL: vec512_v8i32:
6495 ; SCALAR-NEXT: pushq %rbx
6496 ; SCALAR-NEXT: movl 28(%rdi), %ebx
6497 ; SCALAR-NEXT: movl 24(%rdi), %r11d
6498 ; SCALAR-NEXT: movl 20(%rdi), %r10d
6499 ; SCALAR-NEXT: movl 16(%rdi), %r9d
6500 ; SCALAR-NEXT: movl 12(%rdi), %r8d
6501 ; SCALAR-NEXT: movl 8(%rdi), %ecx
6502 ; SCALAR-NEXT: movl (%rdi), %eax
6503 ; SCALAR-NEXT: movl 4(%rdi), %edi
6504 ; SCALAR-NEXT: notl %eax
6505 ; SCALAR-NEXT: notl %edi
6506 ; SCALAR-NEXT: notl %ecx
6507 ; SCALAR-NEXT: notl %r8d
6508 ; SCALAR-NEXT: notl %r9d
6509 ; SCALAR-NEXT: notl %r10d
6510 ; SCALAR-NEXT: notl %r11d
6511 ; SCALAR-NEXT: notl %ebx
6512 ; SCALAR-NEXT: movl %ebx, 28(%rsi)
6513 ; SCALAR-NEXT: movl %r11d, 24(%rsi)
6514 ; SCALAR-NEXT: movl %r10d, 20(%rsi)
6515 ; SCALAR-NEXT: movl %r9d, 16(%rsi)
6516 ; SCALAR-NEXT: movl %r8d, 12(%rsi)
6517 ; SCALAR-NEXT: movl %ecx, 8(%rsi)
6518 ; SCALAR-NEXT: movl %edi, 4(%rsi)
6519 ; SCALAR-NEXT: movl %eax, (%rsi)
6520 ; SCALAR-NEXT: movl %ebx, 28(%rdx)
6521 ; SCALAR-NEXT: movl %r11d, 24(%rdx)
6522 ; SCALAR-NEXT: movl %r10d, 20(%rdx)
6523 ; SCALAR-NEXT: movl %r9d, 16(%rdx)
6524 ; SCALAR-NEXT: movl %r8d, 12(%rdx)
6525 ; SCALAR-NEXT: movl %ecx, 8(%rdx)
6526 ; SCALAR-NEXT: movl %edi, 4(%rdx)
6527 ; SCALAR-NEXT: movl %eax, (%rdx)
6528 ; SCALAR-NEXT: movl %ebx, 60(%rdx)
6529 ; SCALAR-NEXT: movl %r11d, 56(%rdx)
6530 ; SCALAR-NEXT: movl %r10d, 52(%rdx)
6531 ; SCALAR-NEXT: movl %r9d, 48(%rdx)
6532 ; SCALAR-NEXT: movl %r8d, 44(%rdx)
6533 ; SCALAR-NEXT: movl %ecx, 40(%rdx)
6534 ; SCALAR-NEXT: movl %edi, 36(%rdx)
6535 ; SCALAR-NEXT: movl %eax, 32(%rdx)
6536 ; SCALAR-NEXT: popq %rbx
6539 ; SSE2-LABEL: vec512_v8i32:
6541 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6542 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
6543 ; SSE2-NEXT: pxor %xmm0, %xmm1
6544 ; SSE2-NEXT: pxor (%rdi), %xmm0
6545 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6546 ; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
6547 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6548 ; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
6549 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
6550 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6553 ; AVX1-LABEL: vec512_v8i32:
6555 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
6556 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
6557 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
6558 ; AVX1-NEXT: vmovaps %ymm0, (%rsi)
6559 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
6560 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
6561 ; AVX1-NEXT: vzeroupper
6564 ; AVX2-LABEL: vec512_v8i32:
6566 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
6567 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
6568 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
6569 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
6570 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
6571 ; AVX2-NEXT: vzeroupper
6573 %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64
6574 %in.subvec = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
6575 store <8 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
6576 %out.subvec0.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 0
6577 store <8 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
6578 %out.subvec1.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 1
6579 store <8 x i32> %in.subvec, ptr %out.subvec1.ptr, align 32
6583 define void @vec512_v8f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6584 ; SCALAR-LABEL: vec512_v8f32:
6586 ; SCALAR-NEXT: pushq %rbx
6587 ; SCALAR-NEXT: movl 28(%rdi), %ebx
6588 ; SCALAR-NEXT: movl 24(%rdi), %r11d
6589 ; SCALAR-NEXT: movl 20(%rdi), %r10d
6590 ; SCALAR-NEXT: movl 16(%rdi), %r9d
6591 ; SCALAR-NEXT: movl 12(%rdi), %r8d
6592 ; SCALAR-NEXT: movl 8(%rdi), %ecx
6593 ; SCALAR-NEXT: movl (%rdi), %eax
6594 ; SCALAR-NEXT: movl 4(%rdi), %edi
6595 ; SCALAR-NEXT: notl %eax
6596 ; SCALAR-NEXT: notl %edi
6597 ; SCALAR-NEXT: notl %ecx
6598 ; SCALAR-NEXT: notl %r8d
6599 ; SCALAR-NEXT: notl %r9d
6600 ; SCALAR-NEXT: notl %r10d
6601 ; SCALAR-NEXT: notl %r11d
6602 ; SCALAR-NEXT: notl %ebx
6603 ; SCALAR-NEXT: movl %ebx, 28(%rsi)
6604 ; SCALAR-NEXT: movl %r11d, 24(%rsi)
6605 ; SCALAR-NEXT: movl %r10d, 20(%rsi)
6606 ; SCALAR-NEXT: movl %r9d, 16(%rsi)
6607 ; SCALAR-NEXT: movl %r8d, 12(%rsi)
6608 ; SCALAR-NEXT: movl %ecx, 8(%rsi)
6609 ; SCALAR-NEXT: movl %edi, 4(%rsi)
6610 ; SCALAR-NEXT: movl %eax, (%rsi)
6611 ; SCALAR-NEXT: movl %ebx, 28(%rdx)
6612 ; SCALAR-NEXT: movl %r11d, 24(%rdx)
6613 ; SCALAR-NEXT: movl %r10d, 20(%rdx)
6614 ; SCALAR-NEXT: movl %r9d, 16(%rdx)
6615 ; SCALAR-NEXT: movl %r8d, 12(%rdx)
6616 ; SCALAR-NEXT: movl %ecx, 8(%rdx)
6617 ; SCALAR-NEXT: movl %edi, 4(%rdx)
6618 ; SCALAR-NEXT: movl %eax, (%rdx)
6619 ; SCALAR-NEXT: movl %ebx, 60(%rdx)
6620 ; SCALAR-NEXT: movl %r11d, 56(%rdx)
6621 ; SCALAR-NEXT: movl %r10d, 52(%rdx)
6622 ; SCALAR-NEXT: movl %r9d, 48(%rdx)
6623 ; SCALAR-NEXT: movl %r8d, 44(%rdx)
6624 ; SCALAR-NEXT: movl %ecx, 40(%rdx)
6625 ; SCALAR-NEXT: movl %edi, 36(%rdx)
6626 ; SCALAR-NEXT: movl %eax, 32(%rdx)
6627 ; SCALAR-NEXT: popq %rbx
6630 ; SSE2-LABEL: vec512_v8f32:
6632 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6633 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
6634 ; SSE2-NEXT: pxor %xmm0, %xmm1
6635 ; SSE2-NEXT: pxor (%rdi), %xmm0
6636 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6637 ; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
6638 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6639 ; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
6640 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
6641 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6644 ; AVX1-LABEL: vec512_v8f32:
6646 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
6647 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
6648 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
6649 ; AVX1-NEXT: vmovaps %ymm0, (%rsi)
6650 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
6651 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
6652 ; AVX1-NEXT: vzeroupper
6655 ; AVX2-LABEL: vec512_v8f32:
6657 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
6658 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
6659 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
6660 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
6661 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
6662 ; AVX2-NEXT: vzeroupper
6664 %in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64
6665 %in.subvec.int = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
6666 %in.subvec = bitcast <8 x i32> %in.subvec.int to <8 x float>
6667 store <8 x float> %in.subvec, ptr %out.subvec.ptr, align 64
6668 %out.subvec0.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 0
6669 store <8 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
6670 %out.subvec1.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 1
6671 store <8 x float> %in.subvec, ptr %out.subvec1.ptr, align 32
6675 define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6676 ; SCALAR-LABEL: vec512_v16i8:
6678 ; SCALAR-NEXT: pushq %rbp
6679 ; SCALAR-NEXT: pushq %r15
6680 ; SCALAR-NEXT: pushq %r14
6681 ; SCALAR-NEXT: pushq %r13
6682 ; SCALAR-NEXT: pushq %r12
6683 ; SCALAR-NEXT: pushq %rbx
6684 ; SCALAR-NEXT: movzbl 15(%rdi), %eax
6685 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6686 ; SCALAR-NEXT: movzbl 14(%rdi), %eax
6687 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6688 ; SCALAR-NEXT: movzbl 13(%rdi), %eax
6689 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6690 ; SCALAR-NEXT: movzbl 12(%rdi), %r10d
6691 ; SCALAR-NEXT: movzbl 11(%rdi), %r13d
6692 ; SCALAR-NEXT: movzbl 10(%rdi), %r12d
6693 ; SCALAR-NEXT: movzbl 9(%rdi), %r15d
6694 ; SCALAR-NEXT: movzbl 8(%rdi), %r14d
6695 ; SCALAR-NEXT: movzbl 7(%rdi), %ebp
6696 ; SCALAR-NEXT: movzbl 6(%rdi), %r11d
6697 ; SCALAR-NEXT: movzbl 5(%rdi), %ebx
6698 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
6699 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
6700 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
6701 ; SCALAR-NEXT: movzbl (%rdi), %eax
6702 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
6703 ; SCALAR-NEXT: notb %al
6704 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6705 ; SCALAR-NEXT: notb %dil
6706 ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6707 ; SCALAR-NEXT: notb %cl
6708 ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6709 ; SCALAR-NEXT: notb %r8b
6710 ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6711 ; SCALAR-NEXT: notb %r9b
6712 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6713 ; SCALAR-NEXT: movl %ebx, %r9d
6714 ; SCALAR-NEXT: notb %r9b
6715 ; SCALAR-NEXT: notb %r11b
6716 ; SCALAR-NEXT: movl %r11d, %ebx
6717 ; SCALAR-NEXT: notb %bpl
6718 ; SCALAR-NEXT: notb %r14b
6719 ; SCALAR-NEXT: notb %r15b
6720 ; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6721 ; SCALAR-NEXT: notb %r12b
6722 ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6723 ; SCALAR-NEXT: notb %r13b
6724 ; SCALAR-NEXT: notb %r10b
6725 ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6726 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
6727 ; SCALAR-NEXT: notb %dil
6728 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6729 ; SCALAR-NEXT: notb %r8b
6730 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
6731 ; SCALAR-NEXT: notb %r11b
6732 ; SCALAR-NEXT: movb %r11b, 15(%rsi)
6733 ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6734 ; SCALAR-NEXT: movb %r8b, 14(%rsi)
6735 ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6736 ; SCALAR-NEXT: movl %edi, %eax
6737 ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6738 ; SCALAR-NEXT: movb %dil, 13(%rsi)
6739 ; SCALAR-NEXT: movb %r10b, 12(%rsi)
6740 ; SCALAR-NEXT: movb %r13b, 11(%rsi)
6741 ; SCALAR-NEXT: movb %r12b, 10(%rsi)
6742 ; SCALAR-NEXT: movb %r15b, 9(%rsi)
6743 ; SCALAR-NEXT: movb %r14b, 8(%rsi)
6744 ; SCALAR-NEXT: movl %r14d, %r12d
6745 ; SCALAR-NEXT: movb %bpl, 7(%rsi)
6746 ; SCALAR-NEXT: movl %ebp, %r14d
6747 ; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6748 ; SCALAR-NEXT: movb %bl, 6(%rsi)
6749 ; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6750 ; SCALAR-NEXT: movb %r9b, 5(%rsi)
6751 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6752 ; SCALAR-NEXT: movb %cl, 4(%rsi)
6753 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
6754 ; SCALAR-NEXT: movb %bpl, 3(%rsi)
6755 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
6756 ; SCALAR-NEXT: movb %dil, 2(%rsi)
6757 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6758 ; SCALAR-NEXT: movb %cl, 1(%rsi)
6759 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
6760 ; SCALAR-NEXT: movb %r10b, (%rsi)
6761 ; SCALAR-NEXT: movb %r11b, 15(%rdx)
6762 ; SCALAR-NEXT: movb %r8b, 14(%rdx)
6763 ; SCALAR-NEXT: movb %al, 13(%rdx)
6764 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6765 ; SCALAR-NEXT: movb %al, 12(%rdx)
6766 ; SCALAR-NEXT: movb %r13b, 11(%rdx)
6767 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
6768 ; SCALAR-NEXT: movb %r15b, 10(%rdx)
6769 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
6770 ; SCALAR-NEXT: movb %sil, 9(%rdx)
6771 ; SCALAR-NEXT: movb %r12b, 8(%rdx)
6772 ; SCALAR-NEXT: movb %r14b, 7(%rdx)
6773 ; SCALAR-NEXT: movb %bl, 6(%rdx)
6774 ; SCALAR-NEXT: movb %r9b, 5(%rdx)
6775 ; SCALAR-NEXT: movl %r9d, %r11d
6776 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
6777 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6778 ; SCALAR-NEXT: movb %r8b, 4(%rdx)
6779 ; SCALAR-NEXT: movb %bpl, 3(%rdx)
6780 ; SCALAR-NEXT: movb %dil, 2(%rdx)
6781 ; SCALAR-NEXT: movb %cl, 1(%rdx)
6782 ; SCALAR-NEXT: movl %ecx, %r14d
6783 ; SCALAR-NEXT: movl %r10d, %esi
6784 ; SCALAR-NEXT: movb %r10b, (%rdx)
6785 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6786 ; SCALAR-NEXT: movb %cl, 31(%rdx)
6787 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
6788 ; SCALAR-NEXT: movb %r9b, 30(%rdx)
6789 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
6790 ; SCALAR-NEXT: movb %dil, 29(%rdx)
6791 ; SCALAR-NEXT: movb %al, 28(%rdx)
6792 ; SCALAR-NEXT: movl %eax, %r10d
6793 ; SCALAR-NEXT: movb %r13b, 27(%rdx)
6794 ; SCALAR-NEXT: movb %r15b, 26(%rdx)
6795 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
6796 ; SCALAR-NEXT: movb %r15b, 25(%rdx)
6797 ; SCALAR-NEXT: movl %r12d, %ebp
6798 ; SCALAR-NEXT: movb %r12b, 24(%rdx)
6799 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
6800 ; SCALAR-NEXT: movb %bl, 23(%rdx)
6801 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6802 ; SCALAR-NEXT: movb %al, 22(%rdx)
6803 ; SCALAR-NEXT: movb %r11b, 21(%rdx)
6804 ; SCALAR-NEXT: movb %r8b, 20(%rdx)
6805 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6806 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
6807 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
6808 ; SCALAR-NEXT: movb %r8b, 18(%rdx)
6809 ; SCALAR-NEXT: movb %r14b, 17(%rdx)
6810 ; SCALAR-NEXT: movb %sil, 16(%rdx)
6811 ; SCALAR-NEXT: movl %esi, %r11d
6812 ; SCALAR-NEXT: movb %cl, 47(%rdx)
6813 ; SCALAR-NEXT: movb %r9b, 46(%rdx)
6814 ; SCALAR-NEXT: movb %dil, 45(%rdx)
6815 ; SCALAR-NEXT: movb %r10b, 44(%rdx)
6816 ; SCALAR-NEXT: movb %r13b, 43(%rdx)
6817 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
6818 ; SCALAR-NEXT: movb %r12b, 42(%rdx)
6819 ; SCALAR-NEXT: movb %r15b, 41(%rdx)
6820 ; SCALAR-NEXT: movl %ebp, %r14d
6821 ; SCALAR-NEXT: movb %bpl, 40(%rdx)
6822 ; SCALAR-NEXT: movl %ebx, %ebp
6823 ; SCALAR-NEXT: movb %bl, 39(%rdx)
6824 ; SCALAR-NEXT: movl %eax, %ebx
6825 ; SCALAR-NEXT: movb %al, 38(%rdx)
6826 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
6827 ; SCALAR-NEXT: movb %cl, 37(%rdx)
6828 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6829 ; SCALAR-NEXT: movb %al, 36(%rdx)
6830 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
6831 ; SCALAR-NEXT: movb %sil, 35(%rdx)
6832 ; SCALAR-NEXT: movb %r8b, 34(%rdx)
6833 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
6834 ; SCALAR-NEXT: movb %r9b, 33(%rdx)
6835 ; SCALAR-NEXT: movb %r11b, 32(%rdx)
6836 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
6837 ; SCALAR-NEXT: movb %r11b, 63(%rdx)
6838 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
6839 ; SCALAR-NEXT: movb %r11b, 62(%rdx)
6840 ; SCALAR-NEXT: movb %dil, 61(%rdx)
6841 ; SCALAR-NEXT: movb %r10b, 60(%rdx)
6842 ; SCALAR-NEXT: movb %r13b, 59(%rdx)
6843 ; SCALAR-NEXT: movb %r12b, 58(%rdx)
6844 ; SCALAR-NEXT: movb %r15b, 57(%rdx)
6845 ; SCALAR-NEXT: movb %r14b, 56(%rdx)
6846 ; SCALAR-NEXT: movb %bpl, 55(%rdx)
6847 ; SCALAR-NEXT: movb %bl, 54(%rdx)
6848 ; SCALAR-NEXT: movb %cl, 53(%rdx)
6849 ; SCALAR-NEXT: movb %al, 52(%rdx)
6850 ; SCALAR-NEXT: movb %sil, 51(%rdx)
6851 ; SCALAR-NEXT: movb %r8b, 50(%rdx)
6852 ; SCALAR-NEXT: movb %r9b, 49(%rdx)
6853 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
6854 ; SCALAR-NEXT: movb %al, 48(%rdx)
6855 ; SCALAR-NEXT: popq %rbx
6856 ; SCALAR-NEXT: popq %r12
6857 ; SCALAR-NEXT: popq %r13
6858 ; SCALAR-NEXT: popq %r14
6859 ; SCALAR-NEXT: popq %r15
6860 ; SCALAR-NEXT: popq %rbp
6863 ; SSE2-LABEL: vec512_v16i8:
6865 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
6866 ; SSE2-NEXT: pxor (%rdi), %xmm0
6867 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
6868 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
6869 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
6870 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
6871 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
6874 ; AVX-LABEL: vec512_v16i8:
6876 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
6877 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
6878 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
6879 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
6880 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
6881 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
6882 ; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
6884 %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
6885 %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
6886 store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
6887 %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
6888 store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
6889 %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
6890 store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
6891 %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2
6892 store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32
6893 %out.subvec3.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 3
6894 store <16 x i8> %in.subvec, ptr %out.subvec3.ptr, align 16
6898 define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
6899 ; SCALAR-LABEL: vec512_v16i16:
6901 ; SCALAR-NEXT: pushq %rbp
6902 ; SCALAR-NEXT: pushq %r15
6903 ; SCALAR-NEXT: pushq %r14
6904 ; SCALAR-NEXT: pushq %r13
6905 ; SCALAR-NEXT: pushq %r12
6906 ; SCALAR-NEXT: pushq %rbx
6907 ; SCALAR-NEXT: movzwl 30(%rdi), %eax
6908 ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6909 ; SCALAR-NEXT: movl 28(%rdi), %eax
6910 ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6911 ; SCALAR-NEXT: movzwl 26(%rdi), %eax
6912 ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6913 ; SCALAR-NEXT: movl 24(%rdi), %r13d
6914 ; SCALAR-NEXT: movzwl 22(%rdi), %r12d
6915 ; SCALAR-NEXT: movl 20(%rdi), %r15d
6916 ; SCALAR-NEXT: movzwl 18(%rdi), %r14d
6917 ; SCALAR-NEXT: movl 16(%rdi), %ebx
6918 ; SCALAR-NEXT: movzwl 14(%rdi), %r11d
6919 ; SCALAR-NEXT: movl 12(%rdi), %r10d
6920 ; SCALAR-NEXT: movzwl 10(%rdi), %r9d
6921 ; SCALAR-NEXT: movl 8(%rdi), %r8d
6922 ; SCALAR-NEXT: movzwl 6(%rdi), %ecx
6923 ; SCALAR-NEXT: movzwl 2(%rdi), %ebp
6924 ; SCALAR-NEXT: movl (%rdi), %eax
6925 ; SCALAR-NEXT: movl 4(%rdi), %edi
6926 ; SCALAR-NEXT: notl %eax
6927 ; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6928 ; SCALAR-NEXT: notl %ebp
6929 ; SCALAR-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6930 ; SCALAR-NEXT: notl %edi
6931 ; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6932 ; SCALAR-NEXT: notl %ecx
6933 ; SCALAR-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6934 ; SCALAR-NEXT: notl %r8d
6935 ; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6936 ; SCALAR-NEXT: notl %r9d
6937 ; SCALAR-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6938 ; SCALAR-NEXT: movl %r10d, %edi
6939 ; SCALAR-NEXT: notl %edi
6940 ; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6941 ; SCALAR-NEXT: notl %r11d
6942 ; SCALAR-NEXT: movl %r11d, %r9d
6943 ; SCALAR-NEXT: notl %ebx
6944 ; SCALAR-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6945 ; SCALAR-NEXT: notl %r14d
6946 ; SCALAR-NEXT: notl %r15d
6947 ; SCALAR-NEXT: notl %r12d
6948 ; SCALAR-NEXT: notl %r13d
6949 ; SCALAR-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6950 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
6951 ; SCALAR-NEXT: notl %r10d
6952 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
6953 ; SCALAR-NEXT: notl %r11d
6954 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
6955 ; SCALAR-NEXT: notl %r8d
6956 ; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6957 ; SCALAR-NEXT: movw %r8w, 30(%rsi)
6958 ; SCALAR-NEXT: movw %r11w, 28(%rsi)
6959 ; SCALAR-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6960 ; SCALAR-NEXT: movw %r10w, 26(%rsi)
6961 ; SCALAR-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
6962 ; SCALAR-NEXT: movw %r13w, 24(%rsi)
6963 ; SCALAR-NEXT: movw %r12w, 22(%rsi)
6964 ; SCALAR-NEXT: movw %r15w, 20(%rsi)
6965 ; SCALAR-NEXT: movw %r14w, 18(%rsi)
6966 ; SCALAR-NEXT: movw %bx, 16(%rsi)
6967 ; SCALAR-NEXT: movw %r9w, 14(%rsi)
6968 ; SCALAR-NEXT: movw %di, 12(%rsi)
6969 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Reload
6970 ; SCALAR-NEXT: movw %bp, 10(%rsi)
6971 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
6972 ; SCALAR-NEXT: movw %di, 8(%rsi)
6973 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
6974 ; SCALAR-NEXT: movw %cx, 6(%rsi)
6975 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
6976 ; SCALAR-NEXT: movw %r8w, 4(%rsi)
6977 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
6978 ; SCALAR-NEXT: movw %ax, 2(%rsi)
6979 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
6980 ; SCALAR-NEXT: movw %bx, (%rsi)
6981 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
6982 ; SCALAR-NEXT: movw %r13w, 30(%rdx)
6983 ; SCALAR-NEXT: movw %r11w, 28(%rdx)
6984 ; SCALAR-NEXT: movw %r10w, 26(%rdx)
6985 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
6986 ; SCALAR-NEXT: movw %si, 24(%rdx)
6987 ; SCALAR-NEXT: movw %r12w, 22(%rdx)
6988 ; SCALAR-NEXT: movw %r15w, 20(%rdx)
6989 ; SCALAR-NEXT: movw %r14w, 18(%rdx)
6990 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
6991 ; SCALAR-NEXT: movw %r11w, 16(%rdx)
6992 ; SCALAR-NEXT: movw %r9w, 14(%rdx)
6993 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
6994 ; SCALAR-NEXT: movw %r10w, 12(%rdx)
6995 ; SCALAR-NEXT: movw %bp, 10(%rdx)
6996 ; SCALAR-NEXT: movw %di, 8(%rdx)
6997 ; SCALAR-NEXT: movw %cx, 6(%rdx)
6998 ; SCALAR-NEXT: movw %r8w, 4(%rdx)
6999 ; SCALAR-NEXT: movw %ax, 2(%rdx)
7000 ; SCALAR-NEXT: movl %ebx, %esi
7001 ; SCALAR-NEXT: movw %si, (%rdx)
7002 ; SCALAR-NEXT: movw %r13w, 62(%rdx)
7003 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
7004 ; SCALAR-NEXT: movw %bx, 60(%rdx)
7005 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
7006 ; SCALAR-NEXT: movw %bx, 58(%rdx)
7007 ; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
7008 ; SCALAR-NEXT: movw %bx, 56(%rdx)
7009 ; SCALAR-NEXT: movw %r12w, 54(%rdx)
7010 ; SCALAR-NEXT: movw %r15w, 52(%rdx)
7011 ; SCALAR-NEXT: movw %r14w, 50(%rdx)
7012 ; SCALAR-NEXT: movw %r11w, 48(%rdx)
7013 ; SCALAR-NEXT: movw %r9w, 46(%rdx)
7014 ; SCALAR-NEXT: movw %r10w, 44(%rdx)
7015 ; SCALAR-NEXT: movw %bp, 42(%rdx)
7016 ; SCALAR-NEXT: movw %di, 40(%rdx)
7017 ; SCALAR-NEXT: movw %cx, 38(%rdx)
7018 ; SCALAR-NEXT: movw %r8w, 36(%rdx)
7019 ; SCALAR-NEXT: movw %ax, 34(%rdx)
7020 ; SCALAR-NEXT: movw %si, 32(%rdx)
7021 ; SCALAR-NEXT: popq %rbx
7022 ; SCALAR-NEXT: popq %r12
7023 ; SCALAR-NEXT: popq %r13
7024 ; SCALAR-NEXT: popq %r14
7025 ; SCALAR-NEXT: popq %r15
7026 ; SCALAR-NEXT: popq %rbp
7029 ; SSE2-LABEL: vec512_v16i16:
7031 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
7032 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
7033 ; SSE2-NEXT: pxor %xmm0, %xmm1
7034 ; SSE2-NEXT: pxor (%rdi), %xmm0
7035 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
7036 ; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
7037 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
7038 ; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
7039 ; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
7040 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
7043 ; AVX1-LABEL: vec512_v16i16:
7045 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
7046 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
7047 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
7048 ; AVX1-NEXT: vmovaps %ymm0, (%rsi)
7049 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
7050 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
7051 ; AVX1-NEXT: vzeroupper
7054 ; AVX2-LABEL: vec512_v16i16:
7056 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
7057 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
7058 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
7059 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
7060 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
7061 ; AVX2-NEXT: vzeroupper
7063 %in.subvec.not = load <16 x i16>, ptr %in.subvec.ptr, align 64
7064 %in.subvec = xor <16 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
7065 store <16 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
7066 %out.subvec0.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 0
7067 store <16 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
7068 %out.subvec1.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 1
7069 store <16 x i16> %in.subvec, ptr %out.subvec1.ptr, align 32
7073 define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
7074 ; SCALAR-LABEL: vec512_v32i8:
; SCALAR-NEXT: pushq %rbp
; SCALAR-NEXT: pushq %r15
; SCALAR-NEXT: pushq %r14
; SCALAR-NEXT: pushq %r13
; SCALAR-NEXT: pushq %r12
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzbl 16(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 15(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 14(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 13(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 12(%rdi), %r13d
; SCALAR-NEXT: movzbl 11(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 10(%rdi), %r12d
; SCALAR-NEXT: movzbl 9(%rdi), %r15d
; SCALAR-NEXT: movzbl 8(%rdi), %r14d
; SCALAR-NEXT: movzbl 7(%rdi), %ebp
; SCALAR-NEXT: movzbl 6(%rdi), %ebx
; SCALAR-NEXT: movzbl 5(%rdi), %r11d
; SCALAR-NEXT: movzbl 4(%rdi), %r10d
; SCALAR-NEXT: movzbl 3(%rdi), %r9d
; SCALAR-NEXT: movzbl 2(%rdi), %r8d
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: movzbl 17(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 18(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 19(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 20(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 21(%rdi), %ebp
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 22(%rdi), %ebx
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 23(%rdi), %r10d
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 24(%rdi), %r9d
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 25(%rdi), %ecx
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 26(%rdi), %r14d
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 27(%rdi), %r15d
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 28(%rdi), %r12d
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 29(%rdi), %r13d
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 30(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 31(%rdi), %edi
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %dil, 31(%rsi)
; SCALAR-NEXT: movb %al, 30(%rsi)
; SCALAR-NEXT: movb %r13b, 29(%rsi)
; SCALAR-NEXT: movb %r12b, 28(%rsi)
; SCALAR-NEXT: movb %r15b, 27(%rsi)
; SCALAR-NEXT: movb %r14b, 26(%rsi)
; SCALAR-NEXT: movb %cl, 25(%rsi)
; SCALAR-NEXT: movb %r9b, 24(%rsi)
; SCALAR-NEXT: movb %r10b, 23(%rsi)
; SCALAR-NEXT: movb %bl, 22(%rsi)
; SCALAR-NEXT: movb %bpl, 21(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: movb %bpl, 20(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 19(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 18(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 17(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 16(%rsi)
; SCALAR-NEXT: movb %r8b, 15(%rsi)
; SCALAR-NEXT: movl %r8d, %r14d
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 14(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 13(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 12(%rsi)
; SCALAR-NEXT: movb %r11b, 11(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 10(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 9(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 8(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r11b, 7(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r13b, 6(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, 5(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r12b, 4(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 3(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r15b, 2(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r8b, 1(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, (%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 31(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 30(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 29(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 28(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 27(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 26(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 25(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 24(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 23(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 22(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 21(%rdx)
; SCALAR-NEXT: movb %bpl, 20(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 19(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 18(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 17(%rdx)
; SCALAR-NEXT: movb %cl, 16(%rdx)
; SCALAR-NEXT: movb %r14b, 15(%rdx)
; SCALAR-NEXT: movb %bl, 14(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 11(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 10(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r14b, 9(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: movb %bpl, 8(%rdx)
; SCALAR-NEXT: movb %r11b, 7(%rdx)
; SCALAR-NEXT: movb %r13b, 6(%rdx)
; SCALAR-NEXT: movb %r10b, 5(%rdx)
; SCALAR-NEXT: movb %r12b, 4(%rdx)
; SCALAR-NEXT: movb %r9b, 3(%rdx)
; SCALAR-NEXT: movb %r15b, 2(%rdx)
; SCALAR-NEXT: movb %r8b, 1(%rdx)
; SCALAR-NEXT: movb %dil, (%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 63(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 62(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 61(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 60(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 59(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 58(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 57(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 56(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 55(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 54(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 53(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 52(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 51(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 50(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 49(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 48(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 47(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 46(%rdx)
; SCALAR-NEXT: movb %cl, 45(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 44(%rdx)
; SCALAR-NEXT: movb %sil, 43(%rdx)
; SCALAR-NEXT: movb %bl, 42(%rdx)
; SCALAR-NEXT: movb %r14b, 41(%rdx)
; SCALAR-NEXT: movb %bpl, 40(%rdx)
; SCALAR-NEXT: movb %r11b, 39(%rdx)
; SCALAR-NEXT: movb %r13b, 38(%rdx)
; SCALAR-NEXT: movb %r10b, 37(%rdx)
; SCALAR-NEXT: movb %r12b, 36(%rdx)
; SCALAR-NEXT: movb %r9b, 35(%rdx)
; SCALAR-NEXT: movb %r15b, 34(%rdx)
; SCALAR-NEXT: movb %r8b, 33(%rdx)
; SCALAR-NEXT: movb %dil, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
; SCALAR-NEXT: popq %r14
; SCALAR-NEXT: popq %r15
; SCALAR-NEXT: popq %rbp
; SCALAR-NEXT: retq
;
; SSE2-LABEL: vec512_v32i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: retq
;
; AVX1-LABEL: vec512_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec512_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%in.subvec.not = load <32 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <32 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
store <32 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 0
store <32 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 1
store <32 x i8> %in.subvec, ptr %out.subvec1.ptr, align 32
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: