; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-sse2 | FileCheck %s --check-prefixes=ALL,SCALAR
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE2-ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSSE3-ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2,SSSE3,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX2-ONLY
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,SSE,AVX,AVX2,AVX512,AVX512BW
define void @vec32_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec32_v2i8:
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %cl, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %cl, 3(%rdx)
; SCALAR-NEXT: movb %al, 2(%rdx)
; SSE-LABEL: vec32_v2i8:
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: movw %ax, (%rsi)
; SSE-NEXT: movw %ax, (%rdx)
; SSE-NEXT: movw %ax, 2(%rdx)
%in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
define void @vec64_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec64_v2i8:
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %cl, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %cl, 3(%rdx)
; SCALAR-NEXT: movb %al, 2(%rdx)
; SCALAR-NEXT: movb %cl, 5(%rdx)
; SCALAR-NEXT: movb %al, 4(%rdx)
; SCALAR-NEXT: movb %cl, 7(%rdx)
; SCALAR-NEXT: movb %al, 6(%rdx)
; SSE-LABEL: vec64_v2i8:
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: movw %ax, (%rsi)
; SSE-NEXT: movw %ax, (%rdx)
; SSE-NEXT: movw %ax, 2(%rdx)
; SSE-NEXT: movw %ax, 4(%rdx)
; SSE-NEXT: movw %ax, 6(%rdx)
%in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
%out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
%out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
define void @vec64_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec64_v2i16:
; SCALAR-NEXT: movzwl 2(%rdi), %eax
; SCALAR-NEXT: movl (%rdi), %ecx
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movw %ax, 2(%rsi)
; SCALAR-NEXT: movw %cx, (%rsi)
; SCALAR-NEXT: movw %ax, 2(%rdx)
; SCALAR-NEXT: movw %cx, (%rdx)
; SCALAR-NEXT: movw %ax, 6(%rdx)
; SCALAR-NEXT: movw %cx, 4(%rdx)
; SSE-LABEL: vec64_v2i16:
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: notl %eax
; SSE-NEXT: movl %eax, (%rsi)
; SSE-NEXT: movl %eax, (%rdx)
; SSE-NEXT: movl %eax, 4(%rdx)
%in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
define void @vec64_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec64_v4i8:
; SCALAR-NEXT: movzbl 3(%rdi), %eax
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %r8d
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, 3(%rsi)
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %dil, 1(%rsi)
; SCALAR-NEXT: movb %r8b, (%rsi)
; SCALAR-NEXT: movb %al, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %dil, 1(%rdx)
; SCALAR-NEXT: movb %r8b, (%rdx)
; SCALAR-NEXT: movb %al, 7(%rdx)
; SCALAR-NEXT: movb %cl, 6(%rdx)
; SCALAR-NEXT: movb %dil, 5(%rdx)
; SCALAR-NEXT: movb %r8b, 4(%rdx)
; SSE-LABEL: vec64_v4i8:
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: notl %eax
; SSE-NEXT: movl %eax, (%rsi)
; SSE-NEXT: movl %eax, (%rdx)
; SSE-NEXT: movl %eax, 4(%rdx)
%in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2i8:
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %cl, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %cl, 3(%rdx)
; SCALAR-NEXT: movb %al, 2(%rdx)
; SCALAR-NEXT: movb %cl, 5(%rdx)
; SCALAR-NEXT: movb %al, 4(%rdx)
; SCALAR-NEXT: movb %cl, 7(%rdx)
; SCALAR-NEXT: movb %al, 6(%rdx)
; SCALAR-NEXT: movb %cl, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: movb %cl, 11(%rdx)
; SCALAR-NEXT: movb %al, 10(%rdx)
; SCALAR-NEXT: movb %cl, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movb %cl, 15(%rdx)
; SCALAR-NEXT: movb %al, 14(%rdx)
; SSE2-ONLY-LABEL: vec128_v2i8:
; SSE2-ONLY: # %bb.0:
; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
; SSE2-ONLY-NEXT: movd %xmm0, %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT: retq
; SSE3-LABEL: vec128_v2i8:
; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSE3-NEXT: pxor (%rdi), %xmm0
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE3-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-LABEL: vec128_v2i8:
; SSSE3-ONLY: # %bb.0:
; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
; SSSE3-ONLY-NEXT: movd %xmm0, %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT: retq
; SSE41-LABEL: vec128_v2i8:
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor (%rdi), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE41-NEXT: movdqa %xmm0, (%rdx)
; SSE42-LABEL: vec128_v2i8:
; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
; SSE42-NEXT: pxor (%rdi), %xmm0
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; AVX1-LABEL: vec128_v2i8:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-LABEL: vec128_v2i8:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
%in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
%out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
%out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
%out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
%out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
%out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
%out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
define void @vec128_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2i16:
; SCALAR-NEXT: movzwl 2(%rdi), %eax
; SCALAR-NEXT: movl (%rdi), %ecx
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movw %ax, 2(%rsi)
; SCALAR-NEXT: movw %cx, (%rsi)
; SCALAR-NEXT: movw %ax, 2(%rdx)
; SCALAR-NEXT: movw %cx, (%rdx)
; SCALAR-NEXT: movw %ax, 6(%rdx)
; SCALAR-NEXT: movw %cx, 4(%rdx)
; SCALAR-NEXT: movw %ax, 10(%rdx)
; SCALAR-NEXT: movw %cx, 8(%rdx)
; SCALAR-NEXT: movw %ax, 14(%rdx)
; SCALAR-NEXT: movw %cx, 12(%rdx)
; SSE2-LABEL: vec128_v2i16:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; AVX1-LABEL: vec128_v2i16:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-LABEL: vec128_v2i16:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
%in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
%out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
%out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2i32:
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ecx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ecx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SSE2-LABEL: vec128_v2i32:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; AVX1-LABEL: vec128_v2i32:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-LABEL: vec128_v2i32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec128_v2i32:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
%in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v2f32:
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ecx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ecx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SSE2-LABEL: vec128_v2f32:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; AVX1-LABEL: vec128_v2f32:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-LABEL: vec128_v2f32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec128_v2f32:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
%in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
%in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
define void @vec128_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v4i8:
; SCALAR-NEXT: movzbl 3(%rdi), %eax
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %r8d
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, 3(%rsi)
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %dil, 1(%rsi)
; SCALAR-NEXT: movb %r8b, (%rsi)
; SCALAR-NEXT: movb %al, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %dil, 1(%rdx)
; SCALAR-NEXT: movb %r8b, (%rdx)
; SCALAR-NEXT: movb %al, 7(%rdx)
; SCALAR-NEXT: movb %cl, 6(%rdx)
; SCALAR-NEXT: movb %dil, 5(%rdx)
; SCALAR-NEXT: movb %r8b, 4(%rdx)
; SCALAR-NEXT: movb %al, 11(%rdx)
; SCALAR-NEXT: movb %cl, 10(%rdx)
; SCALAR-NEXT: movb %dil, 9(%rdx)
; SCALAR-NEXT: movb %r8b, 8(%rdx)
; SCALAR-NEXT: movb %al, 15(%rdx)
; SCALAR-NEXT: movb %cl, 14(%rdx)
; SCALAR-NEXT: movb %dil, 13(%rdx)
; SCALAR-NEXT: movb %r8b, 12(%rdx)
; SSE2-LABEL: vec128_v4i8:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; AVX1-LABEL: vec128_v4i8:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-LABEL: vec128_v4i8:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
%in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
%out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
%out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v4i16:
; SCALAR-NEXT: movzwl 6(%rdi), %eax
; SCALAR-NEXT: movzwl 2(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %r8d
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movw %ax, 6(%rsi)
; SCALAR-NEXT: movw %di, 4(%rsi)
; SCALAR-NEXT: movw %cx, 2(%rsi)
; SCALAR-NEXT: movw %r8w, (%rsi)
; SCALAR-NEXT: movw %ax, 6(%rdx)
; SCALAR-NEXT: movw %di, 4(%rdx)
; SCALAR-NEXT: movw %cx, 2(%rdx)
; SCALAR-NEXT: movw %r8w, (%rdx)
; SCALAR-NEXT: movw %ax, 14(%rdx)
; SCALAR-NEXT: movw %di, 12(%rdx)
; SCALAR-NEXT: movw %cx, 10(%rdx)
; SCALAR-NEXT: movw %r8w, 8(%rdx)
; SSE2-LABEL: vec128_v4i16:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; AVX1-LABEL: vec128_v4i16:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-LABEL: vec128_v4i16:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec128_v4i16:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
%in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec128_v8i8:
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzbl 7(%rdi), %ebx
; SCALAR-NEXT: movzbl 6(%rdi), %r11d
; SCALAR-NEXT: movzbl 5(%rdi), %r10d
; SCALAR-NEXT: movzbl 4(%rdi), %r9d
; SCALAR-NEXT: movzbl 3(%rdi), %r8d
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: movb %bl, 7(%rsi)
; SCALAR-NEXT: movb %r11b, 6(%rsi)
; SCALAR-NEXT: movb %r10b, 5(%rsi)
; SCALAR-NEXT: movb %r9b, 4(%rsi)
; SCALAR-NEXT: movb %r8b, 3(%rsi)
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %dil, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %bl, 7(%rdx)
; SCALAR-NEXT: movb %r11b, 6(%rdx)
; SCALAR-NEXT: movb %r10b, 5(%rdx)
; SCALAR-NEXT: movb %r9b, 4(%rdx)
; SCALAR-NEXT: movb %r8b, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %dil, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %bl, 15(%rdx)
; SCALAR-NEXT: movb %r11b, 14(%rdx)
; SCALAR-NEXT: movb %r10b, 13(%rdx)
; SCALAR-NEXT: movb %r9b, 12(%rdx)
; SCALAR-NEXT: movb %r8b, 11(%rdx)
; SCALAR-NEXT: movb %cl, 10(%rdx)
; SCALAR-NEXT: movb %dil, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: popq %rbx
; SSE2-LABEL: vec128_v8i8:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; AVX1-LABEL: vec128_v8i8:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-LABEL: vec128_v8i8:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec128_v8i8:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
%in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i8:
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %cl, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %cl, 3(%rdx)
; SCALAR-NEXT: movb %al, 2(%rdx)
; SCALAR-NEXT: movb %cl, 5(%rdx)
; SCALAR-NEXT: movb %al, 4(%rdx)
; SCALAR-NEXT: movb %cl, 7(%rdx)
; SCALAR-NEXT: movb %al, 6(%rdx)
; SCALAR-NEXT: movb %cl, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: movb %cl, 11(%rdx)
; SCALAR-NEXT: movb %al, 10(%rdx)
; SCALAR-NEXT: movb %cl, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movb %cl, 15(%rdx)
; SCALAR-NEXT: movb %al, 14(%rdx)
; SCALAR-NEXT: movb %cl, 17(%rdx)
; SCALAR-NEXT: movb %al, 16(%rdx)
; SCALAR-NEXT: movb %cl, 19(%rdx)
; SCALAR-NEXT: movb %al, 18(%rdx)
; SCALAR-NEXT: movb %cl, 21(%rdx)
; SCALAR-NEXT: movb %al, 20(%rdx)
; SCALAR-NEXT: movb %cl, 23(%rdx)
; SCALAR-NEXT: movb %al, 22(%rdx)
; SCALAR-NEXT: movb %cl, 25(%rdx)
; SCALAR-NEXT: movb %al, 24(%rdx)
; SCALAR-NEXT: movb %cl, 27(%rdx)
; SCALAR-NEXT: movb %al, 26(%rdx)
; SCALAR-NEXT: movb %cl, 29(%rdx)
; SCALAR-NEXT: movb %al, 28(%rdx)
; SCALAR-NEXT: movb %cl, 31(%rdx)
; SCALAR-NEXT: movb %al, 30(%rdx)
; SSE2-ONLY-LABEL: vec256_v2i8:
; SSE2-ONLY: # %bb.0:
; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
; SSE2-ONLY-NEXT: movd %xmm0, %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-ONLY-NEXT: retq
; SSE3-LABEL: vec256_v2i8:
; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSE3-NEXT: pxor (%rdi), %xmm0
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE3-NEXT: movdqa %xmm0, (%rdx)
; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
; SSSE3-ONLY-LABEL: vec256_v2i8:
; SSSE3-ONLY: # %bb.0:
; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
; SSSE3-ONLY-NEXT: movd %xmm0, %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSSE3-ONLY-NEXT: retq
; SSE41-LABEL: vec256_v2i8:
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor (%rdi), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE41-NEXT: movdqa %xmm0, (%rdx)
; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-LABEL: vec256_v2i8:
; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
; SSE42-NEXT: pxor (%rdi), %xmm0
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; AVX1-LABEL: vec256_v2i8:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec256_v2i8:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
%out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
%out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
%out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
%out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
%out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
%out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
%out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
%out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
%out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
%out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
%out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
%out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
%out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
%out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
define void @vec256_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i16:
; SCALAR-NEXT: movzwl 2(%rdi), %eax
; SCALAR-NEXT: movl (%rdi), %ecx
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movw %ax, 2(%rsi)
; SCALAR-NEXT: movw %cx, (%rsi)
; SCALAR-NEXT: movw %ax, 2(%rdx)
; SCALAR-NEXT: movw %cx, (%rdx)
; SCALAR-NEXT: movw %ax, 6(%rdx)
; SCALAR-NEXT: movw %cx, 4(%rdx)
; SCALAR-NEXT: movw %ax, 10(%rdx)
; SCALAR-NEXT: movw %cx, 8(%rdx)
; SCALAR-NEXT: movw %ax, 14(%rdx)
; SCALAR-NEXT: movw %cx, 12(%rdx)
; SCALAR-NEXT: movw %ax, 18(%rdx)
; SCALAR-NEXT: movw %cx, 16(%rdx)
; SCALAR-NEXT: movw %ax, 22(%rdx)
; SCALAR-NEXT: movw %cx, 20(%rdx)
; SCALAR-NEXT: movw %ax, 26(%rdx)
; SCALAR-NEXT: movw %cx, 24(%rdx)
; SCALAR-NEXT: movw %ax, 30(%rdx)
; SCALAR-NEXT: movw %cx, 28(%rdx)
; SSE2-LABEL: vec256_v2i16:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX1-LABEL: vec256_v2i16:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-LABEL: vec256_v2i16:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
%out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
%out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
%out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
%out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
%out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
%out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i32:
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ecx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ecx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SCALAR-NEXT: movl %ecx, 20(%rdx)
; SCALAR-NEXT: movl %eax, 16(%rdx)
; SCALAR-NEXT: movl %ecx, 28(%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
; SSE2-LABEL: vec256_v2i32:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX1-LABEL: vec256_v2i32:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec256_v2i32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec256_v2i32:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2f32:
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ecx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ecx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SCALAR-NEXT: movl %ecx, 20(%rdx)
; SCALAR-NEXT: movl %eax, 16(%rdx)
; SCALAR-NEXT: movl %ecx, 28(%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
; SSE2-LABEL: vec256_v2f32:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX1-LABEL: vec256_v2f32:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec256_v2f32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec256_v2f32:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
%in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
define void @vec256_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2i64:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rdx)
; SCALAR-NEXT: movq %rax, (%rdx)
; SCALAR-NEXT: movq %rcx, 24(%rdx)
; SCALAR-NEXT: movq %rax, 16(%rdx)
; SSE2-LABEL: vec256_v2i64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX-LABEL: vec256_v2i64:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
%in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
define void @vec256_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v2f64:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rdx)
; SCALAR-NEXT: movq %rax, (%rdx)
; SCALAR-NEXT: movq %rcx, 24(%rdx)
; SCALAR-NEXT: movq %rax, 16(%rdx)
; SSE2-LABEL: vec256_v2f64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX-LABEL: vec256_v2f64:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
%in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
%in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
define void @vec256_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v4i8:
; SCALAR-NEXT: movzbl 3(%rdi), %r8d
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movb %r8b, 3(%rsi)
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %dil, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %r8b, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %dil, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %r8b, 7(%rdx)
; SCALAR-NEXT: movb %cl, 6(%rdx)
; SCALAR-NEXT: movb %dil, 5(%rdx)
; SCALAR-NEXT: movb %al, 4(%rdx)
; SCALAR-NEXT: movb %r8b, 11(%rdx)
; SCALAR-NEXT: movb %cl, 10(%rdx)
; SCALAR-NEXT: movb %dil, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: movb %r8b, 15(%rdx)
; SCALAR-NEXT: movb %cl, 14(%rdx)
; SCALAR-NEXT: movb %dil, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movb %r8b, 19(%rdx)
; SCALAR-NEXT: movb %cl, 18(%rdx)
; SCALAR-NEXT: movb %dil, 17(%rdx)
; SCALAR-NEXT: movb %al, 16(%rdx)
; SCALAR-NEXT: movb %r8b, 23(%rdx)
; SCALAR-NEXT: movb %cl, 22(%rdx)
; SCALAR-NEXT: movb %dil, 21(%rdx)
; SCALAR-NEXT: movb %al, 20(%rdx)
; SCALAR-NEXT: movb %r8b, 27(%rdx)
; SCALAR-NEXT: movb %cl, 26(%rdx)
; SCALAR-NEXT: movb %dil, 25(%rdx)
; SCALAR-NEXT: movb %al, 24(%rdx)
; SCALAR-NEXT: movb %r8b, 31(%rdx)
; SCALAR-NEXT: movb %cl, 30(%rdx)
; SCALAR-NEXT: movb %dil, 29(%rdx)
; SCALAR-NEXT: movb %al, 28(%rdx)
; SSE2-LABEL: vec256_v4i8:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX1-LABEL: vec256_v4i8:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-LABEL: vec256_v4i8:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
%out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
%out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
%out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
%out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
%out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
%out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v4i16:
; SCALAR-NEXT: movzwl 6(%rdi), %r8d
; SCALAR-NEXT: movzwl 2(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: movw %r8w, 6(%rsi)
; SCALAR-NEXT: movw %di, 4(%rsi)
; SCALAR-NEXT: movw %cx, 2(%rsi)
; SCALAR-NEXT: movw %ax, (%rsi)
; SCALAR-NEXT: movw %r8w, 6(%rdx)
; SCALAR-NEXT: movw %di, 4(%rdx)
; SCALAR-NEXT: movw %cx, 2(%rdx)
; SCALAR-NEXT: movw %ax, (%rdx)
; SCALAR-NEXT: movw %r8w, 14(%rdx)
; SCALAR-NEXT: movw %di, 12(%rdx)
; SCALAR-NEXT: movw %cx, 10(%rdx)
; SCALAR-NEXT: movw %ax, 8(%rdx)
; SCALAR-NEXT: movw %r8w, 22(%rdx)
; SCALAR-NEXT: movw %di, 20(%rdx)
; SCALAR-NEXT: movw %cx, 18(%rdx)
; SCALAR-NEXT: movw %ax, 16(%rdx)
; SCALAR-NEXT: movw %r8w, 30(%rdx)
; SCALAR-NEXT: movw %di, 28(%rdx)
; SCALAR-NEXT: movw %cx, 26(%rdx)
; SCALAR-NEXT: movw %ax, 24(%rdx)
; SSE2-LABEL: vec256_v4i16:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX1-LABEL: vec256_v4i16:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec256_v4i16:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec256_v4i16:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
define void @vec256_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v4i32:
; SCALAR-NEXT: movaps (%rdi), %xmm0
; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SCALAR-NEXT: movaps %xmm0, (%rsi)
; SCALAR-NEXT: movaps %xmm0, (%rdx)
; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
; SSE2-LABEL: vec256_v4i32:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX-LABEL: vec256_v4i32:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
%in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
define void @vec256_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v4f32:
; SCALAR-NEXT: movaps (%rdi), %xmm0
; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SCALAR-NEXT: movaps %xmm0, (%rsi)
; SCALAR-NEXT: movaps %xmm0, (%rdx)
; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
; SSE2-LABEL: vec256_v4f32:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX-LABEL: vec256_v4f32:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
%in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
%in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v8i8:
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzbl 7(%rdi), %ebx
; SCALAR-NEXT: movzbl 6(%rdi), %r11d
; SCALAR-NEXT: movzbl 5(%rdi), %r10d
; SCALAR-NEXT: movzbl 4(%rdi), %r9d
; SCALAR-NEXT: movzbl 3(%rdi), %r8d
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: movb %bl, 7(%rsi)
; SCALAR-NEXT: movb %r11b, 6(%rsi)
; SCALAR-NEXT: movb %r10b, 5(%rsi)
; SCALAR-NEXT: movb %r9b, 4(%rsi)
; SCALAR-NEXT: movb %r8b, 3(%rsi)
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %dil, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %bl, 7(%rdx)
; SCALAR-NEXT: movb %r11b, 6(%rdx)
; SCALAR-NEXT: movb %r10b, 5(%rdx)
; SCALAR-NEXT: movb %r9b, 4(%rdx)
; SCALAR-NEXT: movb %r8b, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %dil, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %bl, 15(%rdx)
; SCALAR-NEXT: movb %r11b, 14(%rdx)
; SCALAR-NEXT: movb %r10b, 13(%rdx)
; SCALAR-NEXT: movb %r9b, 12(%rdx)
; SCALAR-NEXT: movb %r8b, 11(%rdx)
; SCALAR-NEXT: movb %cl, 10(%rdx)
; SCALAR-NEXT: movb %dil, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: movb %bl, 23(%rdx)
; SCALAR-NEXT: movb %r11b, 22(%rdx)
; SCALAR-NEXT: movb %r10b, 21(%rdx)
; SCALAR-NEXT: movb %r9b, 20(%rdx)
; SCALAR-NEXT: movb %r8b, 19(%rdx)
; SCALAR-NEXT: movb %cl, 18(%rdx)
; SCALAR-NEXT: movb %dil, 17(%rdx)
; SCALAR-NEXT: movb %al, 16(%rdx)
; SCALAR-NEXT: movb %bl, 31(%rdx)
; SCALAR-NEXT: movb %r11b, 30(%rdx)
; SCALAR-NEXT: movb %r10b, 29(%rdx)
; SCALAR-NEXT: movb %r9b, 28(%rdx)
; SCALAR-NEXT: movb %r8b, 27(%rdx)
; SCALAR-NEXT: movb %cl, 26(%rdx)
; SCALAR-NEXT: movb %dil, 25(%rdx)
; SCALAR-NEXT: movb %al, 24(%rdx)
; SCALAR-NEXT: popq %rbx
; SSE2-LABEL: vec256_v8i8:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX1-LABEL: vec256_v8i8:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec256_v8i8:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec256_v8i8:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
define void @vec256_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v8i16:
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzwl 14(%rdi), %ebx
; SCALAR-NEXT: movl 12(%rdi), %r11d
; SCALAR-NEXT: movzwl 10(%rdi), %r10d
; SCALAR-NEXT: movl 8(%rdi), %r9d
; SCALAR-NEXT: movzwl 6(%rdi), %r8d
; SCALAR-NEXT: movzwl 2(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: notl %r9d
; SCALAR-NEXT: notl %r10d
; SCALAR-NEXT: notl %r11d
; SCALAR-NEXT: notl %ebx
; SCALAR-NEXT: movw %bx, 14(%rsi)
; SCALAR-NEXT: movw %r11w, 12(%rsi)
; SCALAR-NEXT: movw %r10w, 10(%rsi)
; SCALAR-NEXT: movw %r9w, 8(%rsi)
; SCALAR-NEXT: movw %r8w, 6(%rsi)
; SCALAR-NEXT: movw %di, 4(%rsi)
; SCALAR-NEXT: movw %cx, 2(%rsi)
; SCALAR-NEXT: movw %ax, (%rsi)
; SCALAR-NEXT: movw %bx, 14(%rdx)
; SCALAR-NEXT: movw %r11w, 12(%rdx)
; SCALAR-NEXT: movw %r10w, 10(%rdx)
; SCALAR-NEXT: movw %r9w, 8(%rdx)
; SCALAR-NEXT: movw %r8w, 6(%rdx)
; SCALAR-NEXT: movw %di, 4(%rdx)
; SCALAR-NEXT: movw %cx, 2(%rdx)
; SCALAR-NEXT: movw %ax, (%rdx)
; SCALAR-NEXT: movw %bx, 30(%rdx)
; SCALAR-NEXT: movw %r11w, 28(%rdx)
; SCALAR-NEXT: movw %r10w, 26(%rdx)
; SCALAR-NEXT: movw %r9w, 24(%rdx)
; SCALAR-NEXT: movw %r8w, 22(%rdx)
; SCALAR-NEXT: movw %di, 20(%rdx)
; SCALAR-NEXT: movw %cx, 18(%rdx)
; SCALAR-NEXT: movw %ax, 16(%rdx)
; SCALAR-NEXT: popq %rbx
; SSE2-LABEL: vec256_v8i16:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX-LABEL: vec256_v8i16:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
%in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec256_v16i8:
; SCALAR-NEXT: pushq %rbp
; SCALAR-NEXT: pushq %r15
; SCALAR-NEXT: pushq %r14
; SCALAR-NEXT: pushq %r13
; SCALAR-NEXT: pushq %r12
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzbl 15(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 14(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 13(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 12(%rdi), %r15d
; SCALAR-NEXT: movzbl 11(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 10(%rdi), %ebp
; SCALAR-NEXT: movzbl 9(%rdi), %r14d
; SCALAR-NEXT: movzbl 8(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 7(%rdi), %r12d
; SCALAR-NEXT: movzbl 6(%rdi), %r10d
; SCALAR-NEXT: movzbl 5(%rdi), %r9d
; SCALAR-NEXT: movzbl 4(%rdi), %ebx
; SCALAR-NEXT: movzbl 3(%rdi), %r8d
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %r13d
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: movl %r14d, %r10d
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movl %ebp, %r14d
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movl %r15d, %edi
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, 15(%rsi)
; SCALAR-NEXT: movb %bpl, 14(%rsi)
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movl %r9d, %eax
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %r9b, 13(%rsi)
; SCALAR-NEXT: movb %dil, 12(%rsi)
; SCALAR-NEXT: movb %r8b, 11(%rsi)
; SCALAR-NEXT: movb %r14b, 10(%rsi)
; SCALAR-NEXT: movb %r10b, 9(%rsi)
; SCALAR-NEXT: movl %r10d, %r8d
; SCALAR-NEXT: movb %r11b, 8(%rsi)
; SCALAR-NEXT: movl %r11d, %r9d
; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %r12b, 7(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 6(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 5(%rsi)
; SCALAR-NEXT: movb %bl, 4(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 3(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %r13b, 1(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, (%rsi)
; SCALAR-NEXT: movb %r15b, 15(%rdx)
; SCALAR-NEXT: movl %r15d, %r11d
; SCALAR-NEXT: movb %bpl, 14(%rdx)
; SCALAR-NEXT: movb %al, 13(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r12b, 12(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r15b, 11(%rdx)
; SCALAR-NEXT: movb %r14b, 10(%rdx)
; SCALAR-NEXT: movb %r8b, 9(%rdx)
; SCALAR-NEXT: movb %r9b, 8(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 7(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 6(%rdx)
; SCALAR-NEXT: movb %dil, 5(%rdx)
; SCALAR-NEXT: movb %bl, 4(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %r13b, 1(%rdx)
; SCALAR-NEXT: movl %r10d, %edi
; SCALAR-NEXT: movb %r10b, (%rdx)
; SCALAR-NEXT: movb %r11b, 31(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, 30(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, 29(%rdx)
; SCALAR-NEXT: movb %r12b, 28(%rdx)
; SCALAR-NEXT: movb %r15b, 27(%rdx)
; SCALAR-NEXT: movb %r14b, 26(%rdx)
; SCALAR-NEXT: movb %r8b, 25(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, 24(%rdx)
; SCALAR-NEXT: movb %r9b, 23(%rdx)
; SCALAR-NEXT: movb %al, 22(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 21(%rdx)
; SCALAR-NEXT: movb %bl, 20(%rdx)
; SCALAR-NEXT: movb %sil, 19(%rdx)
; SCALAR-NEXT: movb %cl, 18(%rdx)
; SCALAR-NEXT: movb %r13b, 17(%rdx)
; SCALAR-NEXT: movb %dil, 16(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
; SCALAR-NEXT: popq %r14
; SCALAR-NEXT: popq %r15
; SCALAR-NEXT: popq %rbp
; SSE2-LABEL: vec256_v16i8:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; AVX-LABEL: vec256_v16i8:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
%in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v2i8:
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %cl, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %cl, 3(%rdx)
; SCALAR-NEXT: movb %al, 2(%rdx)
; SCALAR-NEXT: movb %cl, 5(%rdx)
; SCALAR-NEXT: movb %al, 4(%rdx)
; SCALAR-NEXT: movb %cl, 7(%rdx)
; SCALAR-NEXT: movb %al, 6(%rdx)
; SCALAR-NEXT: movb %cl, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: movb %cl, 11(%rdx)
; SCALAR-NEXT: movb %al, 10(%rdx)
; SCALAR-NEXT: movb %cl, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movb %cl, 15(%rdx)
; SCALAR-NEXT: movb %al, 14(%rdx)
; SCALAR-NEXT: movb %cl, 17(%rdx)
; SCALAR-NEXT: movb %al, 16(%rdx)
; SCALAR-NEXT: movb %cl, 19(%rdx)
; SCALAR-NEXT: movb %al, 18(%rdx)
; SCALAR-NEXT: movb %cl, 21(%rdx)
; SCALAR-NEXT: movb %al, 20(%rdx)
; SCALAR-NEXT: movb %cl, 23(%rdx)
; SCALAR-NEXT: movb %al, 22(%rdx)
; SCALAR-NEXT: movb %cl, 25(%rdx)
; SCALAR-NEXT: movb %al, 24(%rdx)
; SCALAR-NEXT: movb %cl, 27(%rdx)
; SCALAR-NEXT: movb %al, 26(%rdx)
; SCALAR-NEXT: movb %cl, 29(%rdx)
; SCALAR-NEXT: movb %al, 28(%rdx)
; SCALAR-NEXT: movb %cl, 31(%rdx)
; SCALAR-NEXT: movb %al, 30(%rdx)
; SCALAR-NEXT: movb %cl, 33(%rdx)
; SCALAR-NEXT: movb %al, 32(%rdx)
; SCALAR-NEXT: movb %cl, 35(%rdx)
; SCALAR-NEXT: movb %al, 34(%rdx)
; SCALAR-NEXT: movb %cl, 37(%rdx)
; SCALAR-NEXT: movb %al, 36(%rdx)
; SCALAR-NEXT: movb %cl, 39(%rdx)
; SCALAR-NEXT: movb %al, 38(%rdx)
; SCALAR-NEXT: movb %cl, 41(%rdx)
; SCALAR-NEXT: movb %al, 40(%rdx)
; SCALAR-NEXT: movb %cl, 43(%rdx)
; SCALAR-NEXT: movb %al, 42(%rdx)
; SCALAR-NEXT: movb %cl, 45(%rdx)
; SCALAR-NEXT: movb %al, 44(%rdx)
; SCALAR-NEXT: movb %cl, 47(%rdx)
; SCALAR-NEXT: movb %al, 46(%rdx)
; SSE2-ONLY-LABEL: vec384_v2i8:
; SSE2-ONLY: # %bb.0:
; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
; SSE2-ONLY-NEXT: movd %xmm0, %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-ONLY-NEXT: retq
; SSE3-LABEL: vec384_v2i8:
; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
; SSE3-NEXT: pxor (%rdi), %xmm0
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE3-NEXT: movdqa %xmm0, (%rdx)
; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
; SSE3-NEXT: movdqa %xmm0, 32(%rdx)
; SSSE3-ONLY-LABEL: vec384_v2i8:
; SSSE3-ONLY: # %bb.0:
; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
; SSSE3-ONLY-NEXT: movd %xmm0, %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
; SSSE3-ONLY-NEXT: retq
; SSE41-LABEL: vec384_v2i8:
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor (%rdi), %xmm0
; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE41-NEXT: movdqa %xmm0, (%rdx)
; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
; SSE41-NEXT: movdqa %xmm0, 32(%rdx)
; SSE42-LABEL: vec384_v2i8:
; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
; SSE42-NEXT: pxor (%rdi), %xmm0
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE42-NEXT: movdqa %xmm0, (%rdx)
; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec384_v2i8:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm1, (%rdx)
; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec384_v2i8:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
%out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
%out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
%out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
%out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
%out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
%out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
%out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
%out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
%out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
%out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
%out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
%out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
%out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
%out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
%out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16
store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32
%out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17
store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2
%out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18
store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4
%out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19
store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2
%out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20
store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8
%out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21
store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2
%out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22
store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4
%out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23
store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2
define void @vec384_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v2i16:
; SCALAR-NEXT: movzwl 2(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movw %cx, 2(%rsi)
; SCALAR-NEXT: movw %ax, (%rsi)
; SCALAR-NEXT: movw %cx, 2(%rdx)
; SCALAR-NEXT: movw %ax, (%rdx)
; SCALAR-NEXT: movw %cx, 6(%rdx)
; SCALAR-NEXT: movw %ax, 4(%rdx)
; SCALAR-NEXT: movw %cx, 10(%rdx)
; SCALAR-NEXT: movw %ax, 8(%rdx)
; SCALAR-NEXT: movw %cx, 14(%rdx)
; SCALAR-NEXT: movw %ax, 12(%rdx)
; SCALAR-NEXT: movw %cx, 18(%rdx)
; SCALAR-NEXT: movw %ax, 16(%rdx)
; SCALAR-NEXT: movw %cx, 22(%rdx)
; SCALAR-NEXT: movw %ax, 20(%rdx)
; SCALAR-NEXT: movw %cx, 26(%rdx)
; SCALAR-NEXT: movw %ax, 24(%rdx)
; SCALAR-NEXT: movw %cx, 30(%rdx)
; SCALAR-NEXT: movw %ax, 28(%rdx)
; SCALAR-NEXT: movw %cx, 34(%rdx)
; SCALAR-NEXT: movw %ax, 32(%rdx)
; SCALAR-NEXT: movw %cx, 38(%rdx)
; SCALAR-NEXT: movw %ax, 36(%rdx)
; SCALAR-NEXT: movw %cx, 42(%rdx)
; SCALAR-NEXT: movw %ax, 40(%rdx)
; SCALAR-NEXT: movw %cx, 46(%rdx)
; SCALAR-NEXT: movw %ax, 44(%rdx)
; SSE2-LABEL: vec384_v2i16:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec384_v2i16:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX2-LABEL: vec384_v2i16:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
%out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
%out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
%out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
%out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
%out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
%out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
%out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8
store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32
%out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9
store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4
%out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10
store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8
%out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11
store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4
define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v2i32:
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ecx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ecx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SCALAR-NEXT: movl %ecx, 20(%rdx)
; SCALAR-NEXT: movl %eax, 16(%rdx)
; SCALAR-NEXT: movl %ecx, 28(%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
; SCALAR-NEXT: movl %ecx, 36(%rdx)
; SCALAR-NEXT: movl %eax, 32(%rdx)
; SCALAR-NEXT: movl %ecx, 44(%rdx)
; SCALAR-NEXT: movl %eax, 40(%rdx)
; SSE2-LABEL: vec384_v2i32:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec384_v2i32:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm1, (%rdx)
; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec384_v2i32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec384_v2i32:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
%out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4
store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32
%out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5
store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8
define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v2f32:
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ecx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ecx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SCALAR-NEXT: movl %ecx, 20(%rdx)
; SCALAR-NEXT: movl %eax, 16(%rdx)
; SCALAR-NEXT: movl %ecx, 28(%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
; SCALAR-NEXT: movl %ecx, 36(%rdx)
; SCALAR-NEXT: movl %eax, 32(%rdx)
; SCALAR-NEXT: movl %ecx, 44(%rdx)
; SCALAR-NEXT: movl %eax, 40(%rdx)
; SSE2-LABEL: vec384_v2f32:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec384_v2f32:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm1, (%rdx)
; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec384_v2f32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec384_v2f32:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
%in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
%out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4
store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32
%out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5
store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8
define void @vec384_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v2i64:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rdx)
; SCALAR-NEXT: movq %rax, (%rdx)
; SCALAR-NEXT: movq %rcx, 24(%rdx)
; SCALAR-NEXT: movq %rax, 16(%rdx)
; SCALAR-NEXT: movq %rcx, 40(%rdx)
; SCALAR-NEXT: movq %rax, 32(%rdx)
; SSE2-LABEL: vec384_v2i64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX-LABEL: vec384_v2i64:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
%in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2
store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32
define void @vec384_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v2f64:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rdx)
; SCALAR-NEXT: movq %rax, (%rdx)
; SCALAR-NEXT: movq %rcx, 24(%rdx)
; SCALAR-NEXT: movq %rax, 16(%rdx)
; SCALAR-NEXT: movq %rcx, 40(%rdx)
; SCALAR-NEXT: movq %rax, 32(%rdx)
; SSE2-LABEL: vec384_v2f64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX-LABEL: vec384_v2f64:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
%in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
%in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2
store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32
define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v3i8:
; SCALAR-NEXT: movl (%rdi), %ecx
; SCALAR-NEXT: movl %ecx, %eax
; SCALAR-NEXT: shrl $16, %eax
; SCALAR-NEXT: movl %ecx, %edi
; SCALAR-NEXT: shrl $8, %edi
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movzbl %cl, %r8d
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: movzbl %dil, %ecx
; SCALAR-NEXT: shll $8, %ecx
; SCALAR-NEXT: orl %r8d, %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, 2(%rsi)
; SCALAR-NEXT: movw %cx, (%rsi)
; SCALAR-NEXT: movb %al, 2(%rdx)
; SCALAR-NEXT: movw %cx, (%rdx)
; SCALAR-NEXT: movb %al, 6(%rdx)
; SCALAR-NEXT: movw %cx, 4(%rdx)
; SCALAR-NEXT: movb %al, 10(%rdx)
; SCALAR-NEXT: movw %cx, 8(%rdx)
; SCALAR-NEXT: movb %al, 14(%rdx)
; SCALAR-NEXT: movw %cx, 12(%rdx)
; SCALAR-NEXT: movb %al, 18(%rdx)
; SCALAR-NEXT: movw %cx, 16(%rdx)
; SCALAR-NEXT: movb %al, 22(%rdx)
; SCALAR-NEXT: movw %cx, 20(%rdx)
; SCALAR-NEXT: movb %al, 26(%rdx)
; SCALAR-NEXT: movw %cx, 24(%rdx)
; SCALAR-NEXT: movb %al, 30(%rdx)
; SCALAR-NEXT: movw %cx, 28(%rdx)
; SCALAR-NEXT: movb %al, 34(%rdx)
; SCALAR-NEXT: movw %cx, 32(%rdx)
; SCALAR-NEXT: movb %al, 38(%rdx)
; SCALAR-NEXT: movw %cx, 36(%rdx)
; SCALAR-NEXT: movb %al, 42(%rdx)
; SCALAR-NEXT: movw %cx, 40(%rdx)
; SCALAR-NEXT: movb %al, 46(%rdx)
; SCALAR-NEXT: movw %cx, 44(%rdx)
; SCALAR-NEXT: movb %al, 50(%rdx)
; SCALAR-NEXT: movw %cx, 48(%rdx)
; SCALAR-NEXT: movb %al, 54(%rdx)
; SCALAR-NEXT: movw %cx, 52(%rdx)
; SCALAR-NEXT: movb %al, 58(%rdx)
; SCALAR-NEXT: movw %cx, 56(%rdx)
; SCALAR-NEXT: movb %al, 62(%rdx)
; SCALAR-NEXT: movw %cx, 60(%rdx)
; SSE2-ONLY-LABEL: vec384_v3i8:
; SSE2-ONLY: # %bb.0:
; SSE2-ONLY-NEXT: movl (%rdi), %eax
; SSE2-ONLY-NEXT: notl %eax
; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: movl %eax, %ecx
; SSE2-ONLY-NEXT: shrl $16, %ecx
; SSE2-ONLY-NEXT: movb %cl, 2(%rsi)
; SSE2-ONLY-NEXT: movb %cl, 2(%rdx)
; SSE2-ONLY-NEXT: movw %ax, (%rdx)
; SSE2-ONLY-NEXT: movb %cl, 6(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 10(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 8(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 14(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 18(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 16(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 22(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 26(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 24(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 30(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 34(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 32(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 38(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 42(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 40(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 46(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 50(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 48(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 54(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 58(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 56(%rdx)
; SSE2-ONLY-NEXT: movb %cl, 62(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
; SSE2-ONLY-NEXT: retq
; SSE3-LABEL: vec384_v3i8:
; SSE3-NEXT: movl (%rdi), %eax
; SSE3-NEXT: notl %eax
; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: movl %eax, %ecx
; SSE3-NEXT: shrl $16, %ecx
; SSE3-NEXT: movb %cl, 2(%rsi)
; SSE3-NEXT: movb %cl, 2(%rdx)
; SSE3-NEXT: movw %ax, (%rdx)
; SSE3-NEXT: movb %cl, 6(%rdx)
; SSE3-NEXT: movw %ax, 4(%rdx)
; SSE3-NEXT: movb %cl, 10(%rdx)
; SSE3-NEXT: movw %ax, 8(%rdx)
; SSE3-NEXT: movb %cl, 14(%rdx)
; SSE3-NEXT: movw %ax, 12(%rdx)
; SSE3-NEXT: movb %cl, 18(%rdx)
; SSE3-NEXT: movw %ax, 16(%rdx)
; SSE3-NEXT: movb %cl, 22(%rdx)
; SSE3-NEXT: movw %ax, 20(%rdx)
; SSE3-NEXT: movb %cl, 26(%rdx)
; SSE3-NEXT: movw %ax, 24(%rdx)
; SSE3-NEXT: movb %cl, 30(%rdx)
; SSE3-NEXT: movw %ax, 28(%rdx)
; SSE3-NEXT: movb %cl, 34(%rdx)
; SSE3-NEXT: movw %ax, 32(%rdx)
; SSE3-NEXT: movb %cl, 38(%rdx)
; SSE3-NEXT: movw %ax, 36(%rdx)
; SSE3-NEXT: movb %cl, 42(%rdx)
; SSE3-NEXT: movw %ax, 40(%rdx)
; SSE3-NEXT: movb %cl, 46(%rdx)
; SSE3-NEXT: movw %ax, 44(%rdx)
; SSE3-NEXT: movb %cl, 50(%rdx)
; SSE3-NEXT: movw %ax, 48(%rdx)
; SSE3-NEXT: movb %cl, 54(%rdx)
; SSE3-NEXT: movw %ax, 52(%rdx)
; SSE3-NEXT: movb %cl, 58(%rdx)
; SSE3-NEXT: movw %ax, 56(%rdx)
; SSE3-NEXT: movb %cl, 62(%rdx)
; SSE3-NEXT: movw %ax, 60(%rdx)
; SSSE3-ONLY-LABEL: vec384_v3i8:
; SSSE3-ONLY: # %bb.0:
; SSSE3-ONLY-NEXT: movl (%rdi), %eax
; SSSE3-ONLY-NEXT: notl %eax
; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: movl %eax, %ecx
; SSSE3-ONLY-NEXT: shrl $16, %ecx
; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi)
; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, (%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx)
; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
; SSSE3-ONLY-NEXT: retq
; SSE41-LABEL: vec384_v3i8:
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pextrb $2, %xmm0, 2(%rsi)
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: movw %ax, (%rsi)
; SSE41-NEXT: pextrb $2, %xmm0, 2(%rdx)
; SSE41-NEXT: movw %ax, (%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 6(%rdx)
; SSE41-NEXT: movw %ax, 4(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 10(%rdx)
; SSE41-NEXT: movw %ax, 8(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 14(%rdx)
; SSE41-NEXT: movw %ax, 12(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 18(%rdx)
; SSE41-NEXT: movw %ax, 16(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 22(%rdx)
; SSE41-NEXT: movw %ax, 20(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 26(%rdx)
; SSE41-NEXT: movw %ax, 24(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 30(%rdx)
; SSE41-NEXT: movw %ax, 28(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 34(%rdx)
; SSE41-NEXT: movw %ax, 32(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 38(%rdx)
; SSE41-NEXT: movw %ax, 36(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 42(%rdx)
; SSE41-NEXT: movw %ax, 40(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 46(%rdx)
; SSE41-NEXT: movw %ax, 44(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 50(%rdx)
; SSE41-NEXT: movw %ax, 48(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 54(%rdx)
; SSE41-NEXT: movw %ax, 52(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 58(%rdx)
; SSE41-NEXT: movw %ax, 56(%rdx)
; SSE41-NEXT: pextrb $2, %xmm0, 62(%rdx)
; SSE41-NEXT: movw %ax, 60(%rdx)
; SSE42-LABEL: vec384_v3i8:
; SSE42-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm0
; SSE42-NEXT: pextrb $2, %xmm0, 2(%rsi)
; SSE42-NEXT: movd %xmm0, %eax
; SSE42-NEXT: movw %ax, (%rsi)
; SSE42-NEXT: pextrb $2, %xmm0, 2(%rdx)
; SSE42-NEXT: movw %ax, (%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 6(%rdx)
; SSE42-NEXT: movw %ax, 4(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 10(%rdx)
; SSE42-NEXT: movw %ax, 8(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 14(%rdx)
; SSE42-NEXT: movw %ax, 12(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 18(%rdx)
; SSE42-NEXT: movw %ax, 16(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 22(%rdx)
; SSE42-NEXT: movw %ax, 20(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 26(%rdx)
; SSE42-NEXT: movw %ax, 24(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 30(%rdx)
; SSE42-NEXT: movw %ax, 28(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 34(%rdx)
; SSE42-NEXT: movw %ax, 32(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 38(%rdx)
; SSE42-NEXT: movw %ax, 36(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 42(%rdx)
; SSE42-NEXT: movw %ax, 40(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 46(%rdx)
; SSE42-NEXT: movw %ax, 44(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 50(%rdx)
; SSE42-NEXT: movw %ax, 48(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 54(%rdx)
; SSE42-NEXT: movw %ax, 52(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 58(%rdx)
; SSE42-NEXT: movw %ax, 56(%rdx)
; SSE42-NEXT: pextrb $2, %xmm0, 62(%rdx)
; SSE42-NEXT: movw %ax, 60(%rdx)
; AVX1-LABEL: vec384_v3i8:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rsi)
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: movw %ax, (%rsi)
; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdx)
; AVX1-NEXT: movw %ax, (%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 6(%rdx)
; AVX1-NEXT: movw %ax, 4(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 10(%rdx)
; AVX1-NEXT: movw %ax, 8(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 14(%rdx)
; AVX1-NEXT: movw %ax, 12(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 18(%rdx)
; AVX1-NEXT: movw %ax, 16(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 22(%rdx)
; AVX1-NEXT: movw %ax, 20(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 26(%rdx)
; AVX1-NEXT: movw %ax, 24(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 30(%rdx)
; AVX1-NEXT: movw %ax, 28(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 34(%rdx)
; AVX1-NEXT: movw %ax, 32(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 38(%rdx)
; AVX1-NEXT: movw %ax, 36(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 42(%rdx)
; AVX1-NEXT: movw %ax, 40(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 46(%rdx)
; AVX1-NEXT: movw %ax, 44(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 50(%rdx)
; AVX1-NEXT: movw %ax, 48(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 54(%rdx)
; AVX1-NEXT: movw %ax, 52(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 58(%rdx)
; AVX1-NEXT: movw %ax, 56(%rdx)
; AVX1-NEXT: vpextrb $2, %xmm0, 62(%rdx)
; AVX1-NEXT: movw %ax, 60(%rdx)
; AVX2-ONLY-LABEL: vec384_v3i8:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rsi)
; AVX2-ONLY-NEXT: vmovd %xmm0, %eax
; AVX2-ONLY-NEXT: movw %ax, (%rsi)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 2(%rdx)
; AVX2-ONLY-NEXT: movw %ax, (%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 6(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 4(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 10(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 8(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 14(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 12(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 18(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 16(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 22(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 20(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 26(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 24(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 30(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 28(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 34(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 32(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 38(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 36(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 42(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 40(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 46(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 44(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 50(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 48(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 54(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 52(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 58(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 56(%rdx)
; AVX2-ONLY-NEXT: vpextrb $2, %xmm0, 62(%rdx)
; AVX2-ONLY-NEXT: movw %ax, 60(%rdx)
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec384_v3i8:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi)
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: movw %ax, (%rsi)
; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rdx)
; AVX512-NEXT: movw %ax, (%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 6(%rdx)
; AVX512-NEXT: movw %ax, 4(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 10(%rdx)
; AVX512-NEXT: movw %ax, 8(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 14(%rdx)
; AVX512-NEXT: movw %ax, 12(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 18(%rdx)
; AVX512-NEXT: movw %ax, 16(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 22(%rdx)
; AVX512-NEXT: movw %ax, 20(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 26(%rdx)
; AVX512-NEXT: movw %ax, 24(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 30(%rdx)
; AVX512-NEXT: movw %ax, 28(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 34(%rdx)
; AVX512-NEXT: movw %ax, 32(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 38(%rdx)
; AVX512-NEXT: movw %ax, 36(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 42(%rdx)
; AVX512-NEXT: movw %ax, 40(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 46(%rdx)
; AVX512-NEXT: movw %ax, 44(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 50(%rdx)
; AVX512-NEXT: movw %ax, 48(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 54(%rdx)
; AVX512-NEXT: movw %ax, 52(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 58(%rdx)
; AVX512-NEXT: movw %ax, 56(%rdx)
; AVX512-NEXT: vpextrb $2, %xmm0, 62(%rdx)
; AVX512-NEXT: movw %ax, 60(%rdx)
%in.subvec.not = load <3 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <3 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1>
store <3 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 0
store <3 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 1
store <3 x i8> %in.subvec, ptr %out.subvec1.ptr, align 1
%out.subvec2.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 2
store <3 x i8> %in.subvec, ptr %out.subvec2.ptr, align 2
%out.subvec3.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 3
store <3 x i8> %in.subvec, ptr %out.subvec3.ptr, align 1
%out.subvec4.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 4
store <3 x i8> %in.subvec, ptr %out.subvec4.ptr, align 4
%out.subvec5.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 5
store <3 x i8> %in.subvec, ptr %out.subvec5.ptr, align 1
%out.subvec6.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 6
store <3 x i8> %in.subvec, ptr %out.subvec6.ptr, align 2
%out.subvec7.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 7
store <3 x i8> %in.subvec, ptr %out.subvec7.ptr, align 1
%out.subvec8.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 8
store <3 x i8> %in.subvec, ptr %out.subvec8.ptr, align 8
%out.subvec9.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 9
store <3 x i8> %in.subvec, ptr %out.subvec9.ptr, align 1
%out.subvec10.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 10
store <3 x i8> %in.subvec, ptr %out.subvec10.ptr, align 2
%out.subvec11.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 11
store <3 x i8> %in.subvec, ptr %out.subvec11.ptr, align 1
%out.subvec12.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 12
store <3 x i8> %in.subvec, ptr %out.subvec12.ptr, align 4
%out.subvec13.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 13
store <3 x i8> %in.subvec, ptr %out.subvec13.ptr, align 1
%out.subvec14.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 14
store <3 x i8> %in.subvec, ptr %out.subvec14.ptr, align 2
%out.subvec15.ptr = getelementptr <3 x i8>, ptr %out.vec.ptr, i64 15
store <3 x i8> %in.subvec, ptr %out.subvec15.ptr, align 1
define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec384_v3i16:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq %rax, %rcx
; SCALAR-NEXT: shrq $32, %rcx
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movw %cx, 4(%rsi)
; SCALAR-NEXT: movw %cx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movw %cx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SCALAR-NEXT: movw %cx, 20(%rdx)
; SCALAR-NEXT: movl %eax, 16(%rdx)
; SCALAR-NEXT: movw %cx, 28(%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
; SCALAR-NEXT: movw %cx, 36(%rdx)
; SCALAR-NEXT: movl %eax, 32(%rdx)
; SCALAR-NEXT: movw %cx, 44(%rdx)
; SCALAR-NEXT: movl %eax, 40(%rdx)
; SCALAR-NEXT: movw %cx, 52(%rdx)
; SCALAR-NEXT: movl %eax, 48(%rdx)
; SCALAR-NEXT: movw %cx, 60(%rdx)
; SCALAR-NEXT: movl %eax, 56(%rdx)
; SSE2-ONLY-LABEL: vec384_v3i16:
; SSE2-ONLY: # %bb.0:
; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1
; SSE2-ONLY-NEXT: movd %xmm1, (%rsi)
; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax
; SSE2-ONLY-NEXT: movw %ax, 4(%rsi)
; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, (%rdx)
; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx)
; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
; SSE2-ONLY-NEXT: retq
; SSE3-LABEL: vec384_v3i16:
; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm0, %xmm1
; SSE3-NEXT: movd %xmm1, (%rsi)
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: movw %ax, 4(%rsi)
; SSE3-NEXT: movw %ax, 4(%rdx)
; SSE3-NEXT: movd %xmm1, (%rdx)
; SSE3-NEXT: movw %ax, 12(%rdx)
; SSE3-NEXT: movd %xmm1, 8(%rdx)
; SSE3-NEXT: movw %ax, 20(%rdx)
; SSE3-NEXT: movd %xmm1, 16(%rdx)
; SSE3-NEXT: movw %ax, 28(%rdx)
; SSE3-NEXT: movd %xmm1, 24(%rdx)
; SSE3-NEXT: movw %ax, 36(%rdx)
; SSE3-NEXT: movd %xmm1, 32(%rdx)
; SSE3-NEXT: movw %ax, 44(%rdx)
; SSE3-NEXT: movd %xmm1, 40(%rdx)
; SSE3-NEXT: movw %ax, 52(%rdx)
; SSE3-NEXT: movd %xmm1, 48(%rdx)
; SSE3-NEXT: movw %ax, 60(%rdx)
; SSE3-NEXT: movd %xmm1, 56(%rdx)
; SSSE3-ONLY-LABEL: vec384_v3i16:
; SSSE3-ONLY: # %bb.0:
; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1
; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi)
; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax
; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi)
; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx)
; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
; SSSE3-ONLY-NEXT: retq
; SSE41-LABEL: vec384_v3i16:
; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi)
; SSE41-NEXT: movd %xmm1, (%rsi)
; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx)
; SSE41-NEXT: movd %xmm1, (%rdx)
; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx)
; SSE41-NEXT: movd %xmm1, 8(%rdx)
; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx)
; SSE41-NEXT: movd %xmm1, 16(%rdx)
; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx)
; SSE41-NEXT: movd %xmm1, 24(%rdx)
; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx)
; SSE41-NEXT: movd %xmm1, 32(%rdx)
; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx)
; SSE41-NEXT: movd %xmm1, 40(%rdx)
; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx)
; SSE41-NEXT: movd %xmm1, 48(%rdx)
; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx)
; SSE41-NEXT: movd %xmm1, 56(%rdx)
; SSE42-LABEL: vec384_v3i16:
; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; SSE42-NEXT: pxor %xmm0, %xmm1
; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi)
; SSE42-NEXT: movd %xmm1, (%rsi)
; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx)
; SSE42-NEXT: movd %xmm1, (%rdx)
; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx)
; SSE42-NEXT: movd %xmm1, 8(%rdx)
; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx)
; SSE42-NEXT: movd %xmm1, 16(%rdx)
; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx)
; SSE42-NEXT: movd %xmm1, 24(%rdx)
; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx)
; SSE42-NEXT: movd %xmm1, 32(%rdx)
; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx)
; SSE42-NEXT: movd %xmm1, 40(%rdx)
; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx)
; SSE42-NEXT: movd %xmm1, 48(%rdx)
; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx)
; SSE42-NEXT: movd %xmm1, 56(%rdx)
; AVX1-LABEL: vec384_v3i16:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi)
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx)
; AVX1-NEXT: vmovd %xmm0, (%rdx)
; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx)
; AVX1-NEXT: vmovd %xmm0, 8(%rdx)
; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx)
; AVX1-NEXT: vmovd %xmm0, 16(%rdx)
; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx)
; AVX1-NEXT: vmovd %xmm0, 24(%rdx)
; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx)
; AVX1-NEXT: vmovd %xmm0, 32(%rdx)
; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx)
; AVX1-NEXT: vmovd %xmm0, 40(%rdx)
; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx)
; AVX1-NEXT: vmovd %xmm0, 48(%rdx)
; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx)
; AVX1-NEXT: vmovd %xmm0, 56(%rdx)
; AVX2-ONLY-LABEL: vec384_v3i16:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi)
; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
2995 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx)
2996 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx)
2997 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx)
2998 ; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx)
2999 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx)
3000 ; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx)
3001 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx)
3002 ; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx)
3003 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx)
3004 ; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx)
3005 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx)
3006 ; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx)
3007 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx)
3008 ; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx)
3009 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx)
3010 ; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx)
3011 ; AVX2-ONLY-NEXT: retq
3013 ; AVX512-LABEL: vec384_v3i16:
3015 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3016 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
3017 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi)
3018 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
3019 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx)
3020 ; AVX512-NEXT: vmovd %xmm0, (%rdx)
3021 ; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx)
3022 ; AVX512-NEXT: vmovd %xmm0, 8(%rdx)
3023 ; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx)
3024 ; AVX512-NEXT: vmovd %xmm0, 16(%rdx)
3025 ; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx)
3026 ; AVX512-NEXT: vmovd %xmm0, 24(%rdx)
3027 ; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx)
3028 ; AVX512-NEXT: vmovd %xmm0, 32(%rdx)
3029 ; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx)
3030 ; AVX512-NEXT: vmovd %xmm0, 40(%rdx)
3031 ; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx)
3032 ; AVX512-NEXT: vmovd %xmm0, 48(%rdx)
3033 ; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx)
3034 ; AVX512-NEXT: vmovd %xmm0, 56(%rdx)
3036 %in.subvec.not = load <3 x i16>, ptr %in.subvec.ptr, align 64
3037 %in.subvec = xor <3 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1>
3038 store <3 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
3039 %out.subvec0.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 0
3040 store <3 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
3041 %out.subvec1.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 1
3042 store <3 x i16> %in.subvec, ptr %out.subvec1.ptr, align 2
3043 %out.subvec2.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 2
3044 store <3 x i16> %in.subvec, ptr %out.subvec2.ptr, align 4
3045 %out.subvec3.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 3
3046 store <3 x i16> %in.subvec, ptr %out.subvec3.ptr, align 2
3047 %out.subvec4.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 4
3048 store <3 x i16> %in.subvec, ptr %out.subvec4.ptr, align 8
3049 %out.subvec5.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 5
3050 store <3 x i16> %in.subvec, ptr %out.subvec5.ptr, align 2
3051 %out.subvec6.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 6
3052 store <3 x i16> %in.subvec, ptr %out.subvec6.ptr, align 4
3053 %out.subvec7.ptr = getelementptr <3 x i16>, ptr %out.vec.ptr, i64 7
3054 store <3 x i16> %in.subvec, ptr %out.subvec7.ptr, align 2
  ret void
}
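; vec384_v3i32: same NOT-and-replicate pattern with a <3 x i32> subvector, stored 4 times into %out.vec.ptr (4 x 96 bits = 384 bits).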
3058 define void @vec384_v3i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3059 ; SCALAR-LABEL: vec384_v3i32:
3061 ; SCALAR-NEXT: movl 8(%rdi), %eax
3062 ; SCALAR-NEXT: movq (%rdi), %rcx
3063 ; SCALAR-NEXT: movq %rcx, %rdi
3064 ; SCALAR-NEXT: shrq $32, %rdi
3065 ; SCALAR-NEXT: notl %edi
3066 ; SCALAR-NEXT: shlq $32, %rdi
3067 ; SCALAR-NEXT: notl %ecx
3068 ; SCALAR-NEXT: orq %rdi, %rcx
3069 ; SCALAR-NEXT: notl %eax
3070 ; SCALAR-NEXT: movl %eax, 8(%rsi)
3071 ; SCALAR-NEXT: movq %rcx, (%rsi)
3072 ; SCALAR-NEXT: movl %eax, 8(%rdx)
3073 ; SCALAR-NEXT: movq %rcx, (%rdx)
3074 ; SCALAR-NEXT: movl %eax, 24(%rdx)
3075 ; SCALAR-NEXT: movq %rcx, 16(%rdx)
3076 ; SCALAR-NEXT: movl %eax, 40(%rdx)
3077 ; SCALAR-NEXT: movq %rcx, 32(%rdx)
3078 ; SCALAR-NEXT: movl %eax, 56(%rdx)
3079 ; SCALAR-NEXT: movq %rcx, 48(%rdx)
3082 ; SSE2-ONLY-LABEL: vec384_v3i32:
3083 ; SSE2-ONLY: # %bb.0:
3084 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3085 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
3086 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
3087 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3088 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
3089 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
3090 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
3091 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
3092 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
3093 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
3094 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
3095 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
3096 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
3097 ; SSE2-ONLY-NEXT: retq
3099 ; SSE3-LABEL: vec384_v3i32:
3101 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
3102 ; SSE3-NEXT: pxor (%rdi), %xmm0
3103 ; SSE3-NEXT: movq %xmm0, (%rsi)
3104 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3105 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
3106 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
3107 ; SSE3-NEXT: movq %xmm0, (%rdx)
3108 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
3109 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
3110 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
3111 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
3112 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
3113 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
3116 ; SSSE3-ONLY-LABEL: vec384_v3i32:
3117 ; SSSE3-ONLY: # %bb.0:
3118 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3119 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
3120 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
3121 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3122 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
3123 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
3124 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
3125 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
3126 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
3127 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
3128 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
3129 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
3130 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
3131 ; SSSE3-ONLY-NEXT: retq
3133 ; SSE41-LABEL: vec384_v3i32:
3135 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
3136 ; SSE41-NEXT: pxor (%rdi), %xmm0
3137 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
3138 ; SSE41-NEXT: movq %xmm0, (%rsi)
3139 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
3140 ; SSE41-NEXT: movq %xmm0, (%rdx)
3141 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
3142 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
3143 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
3144 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
3145 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
3146 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
3149 ; SSE42-LABEL: vec384_v3i32:
3151 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
3152 ; SSE42-NEXT: pxor (%rdi), %xmm0
3153 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
3154 ; SSE42-NEXT: movq %xmm0, (%rsi)
3155 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
3156 ; SSE42-NEXT: movq %xmm0, (%rdx)
3157 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
3158 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
3159 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
3160 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
3161 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
3162 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
3165 ; AVX-LABEL: vec384_v3i32:
3167 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3168 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3169 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
3170 ; AVX-NEXT: vmovq %xmm0, (%rsi)
3171 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
3172 ; AVX-NEXT: vmovq %xmm0, (%rdx)
3173 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
3174 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
3175 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
3176 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
3177 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
3178 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
3180 %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64
3181 %in.subvec = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1>
3182 store <3 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
3183 %out.subvec0.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 0
3184 store <3 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
3185 %out.subvec1.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 1
3186 store <3 x i32> %in.subvec, ptr %out.subvec1.ptr, align 4
3187 %out.subvec2.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 2
3188 store <3 x i32> %in.subvec, ptr %out.subvec2.ptr, align 8
3189 %out.subvec3.ptr = getelementptr <3 x i32>, ptr %out.vec.ptr, i64 3
3190 store <3 x i32> %in.subvec, ptr %out.subvec3.ptr, align 4
  ret void
}
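; vec384_v3f32: as vec384_v3i32, but the inverted <3 x i32> value is bitcast to <3 x float> before the stores.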
3194 define void @vec384_v3f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3195 ; SCALAR-LABEL: vec384_v3f32:
3197 ; SCALAR-NEXT: movl 8(%rdi), %eax
3198 ; SCALAR-NEXT: movq (%rdi), %rcx
3199 ; SCALAR-NEXT: movq %rcx, %rdi
3200 ; SCALAR-NEXT: shrq $32, %rdi
3201 ; SCALAR-NEXT: notl %edi
3202 ; SCALAR-NEXT: shlq $32, %rdi
3203 ; SCALAR-NEXT: notl %ecx
3204 ; SCALAR-NEXT: orq %rdi, %rcx
3205 ; SCALAR-NEXT: notl %eax
3206 ; SCALAR-NEXT: movl %eax, 8(%rsi)
3207 ; SCALAR-NEXT: movq %rcx, (%rsi)
3208 ; SCALAR-NEXT: movl %eax, 8(%rdx)
3209 ; SCALAR-NEXT: movq %rcx, (%rdx)
3210 ; SCALAR-NEXT: movl %eax, 24(%rdx)
3211 ; SCALAR-NEXT: movq %rcx, 16(%rdx)
3212 ; SCALAR-NEXT: movl %eax, 40(%rdx)
3213 ; SCALAR-NEXT: movq %rcx, 32(%rdx)
3214 ; SCALAR-NEXT: movl %eax, 56(%rdx)
3215 ; SCALAR-NEXT: movq %rcx, 48(%rdx)
3218 ; SSE2-ONLY-LABEL: vec384_v3f32:
3219 ; SSE2-ONLY: # %bb.0:
3220 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3221 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
3222 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
3223 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3224 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
3225 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
3226 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
3227 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
3228 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
3229 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
3230 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
3231 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
3232 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
3233 ; SSE2-ONLY-NEXT: retq
3235 ; SSE3-LABEL: vec384_v3f32:
3237 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
3238 ; SSE3-NEXT: pxor (%rdi), %xmm0
3239 ; SSE3-NEXT: movq %xmm0, (%rsi)
3240 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3241 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
3242 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
3243 ; SSE3-NEXT: movq %xmm0, (%rdx)
3244 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
3245 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
3246 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
3247 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
3248 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
3249 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
3252 ; SSSE3-ONLY-LABEL: vec384_v3f32:
3253 ; SSSE3-ONLY: # %bb.0:
3254 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
3255 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
3256 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
3257 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
3258 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
3259 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
3260 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
3261 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
3262 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
3263 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
3264 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
3265 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
3266 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
3267 ; SSSE3-ONLY-NEXT: retq
3269 ; SSE41-LABEL: vec384_v3f32:
3271 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
3272 ; SSE41-NEXT: pxor (%rdi), %xmm0
3273 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
3274 ; SSE41-NEXT: movq %xmm0, (%rsi)
3275 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
3276 ; SSE41-NEXT: movq %xmm0, (%rdx)
3277 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
3278 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
3279 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
3280 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
3281 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
3282 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
3285 ; SSE42-LABEL: vec384_v3f32:
3287 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
3288 ; SSE42-NEXT: pxor (%rdi), %xmm0
3289 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
3290 ; SSE42-NEXT: movq %xmm0, (%rsi)
3291 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
3292 ; SSE42-NEXT: movq %xmm0, (%rdx)
3293 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
3294 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
3295 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
3296 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
3297 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
3298 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
3301 ; AVX-LABEL: vec384_v3f32:
3303 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3304 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3305 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
3306 ; AVX-NEXT: vmovq %xmm0, (%rsi)
3307 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
3308 ; AVX-NEXT: vmovq %xmm0, (%rdx)
3309 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
3310 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
3311 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
3312 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
3313 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
3314 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
3316 %in.subvec.not = load <3 x i32>, ptr %in.subvec.ptr, align 64
3317 %in.subvec.int = xor <3 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1>
3318 %in.subvec = bitcast <3 x i32> %in.subvec.int to <3 x float>
3319 store <3 x float> %in.subvec, ptr %out.subvec.ptr, align 64
3320 %out.subvec0.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 0
3321 store <3 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
3322 %out.subvec1.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 1
3323 store <3 x float> %in.subvec, ptr %out.subvec1.ptr, align 4
3324 %out.subvec2.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 2
3325 store <3 x float> %in.subvec, ptr %out.subvec2.ptr, align 8
3326 %out.subvec3.ptr = getelementptr <3 x float>, ptr %out.vec.ptr, i64 3
3327 store <3 x float> %in.subvec, ptr %out.subvec3.ptr, align 4
  ret void
}
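; vec384_v3i64: NOT a <3 x i64> subvector and store it twice into %out.vec.ptr (2 x 192 bits = 384 bits).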
3331 define void @vec384_v3i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3332 ; SCALAR-LABEL: vec384_v3i64:
3334 ; SCALAR-NEXT: movq (%rdi), %rax
3335 ; SCALAR-NEXT: movq 8(%rdi), %rcx
3336 ; SCALAR-NEXT: movq 16(%rdi), %rdi
3337 ; SCALAR-NEXT: notq %rdi
3338 ; SCALAR-NEXT: notq %rcx
3339 ; SCALAR-NEXT: notq %rax
3340 ; SCALAR-NEXT: movq %rax, (%rsi)
3341 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
3342 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
3343 ; SCALAR-NEXT: movq %rax, (%rdx)
3344 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
3345 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
3346 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
3347 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
3348 ; SCALAR-NEXT: movq %rax, 32(%rdx)
3351 ; SSE2-LABEL: vec384_v3i64:
3353 ; SSE2-NEXT: movq 16(%rdi), %rax
3354 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3355 ; SSE2-NEXT: pxor (%rdi), %xmm0
3356 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3357 ; SSE2-NEXT: notq %rax
3358 ; SSE2-NEXT: movq %rax, 16(%rsi)
3359 ; SSE2-NEXT: movq %rax, 16(%rdx)
3360 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3361 ; SSE2-NEXT: movq %rax, 48(%rdx)
3362 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
3365 ; AVX1-LABEL: vec384_v3i64:
3367 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3368 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
3369 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
3370 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3371 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
3372 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
3373 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
3374 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
3375 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
3376 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
3377 ; AVX1-NEXT: vzeroupper
3380 ; AVX2-LABEL: vec384_v3i64:
3382 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
3383 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
3384 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3385 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
3386 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
3387 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
3388 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
3389 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
3390 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
3391 ; AVX2-NEXT: vzeroupper
3393 %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64
3394 %in.subvec = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1>
3395 store <3 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
3396 %out.subvec0.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 0
3397 store <3 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
3398 %out.subvec1.ptr = getelementptr <3 x i64>, ptr %out.vec.ptr, i64 1
3399 store <3 x i64> %in.subvec, ptr %out.subvec1.ptr, align 8
  ret void
}
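; vec384_v3f64: as vec384_v3i64, but the inverted value is bitcast to <3 x double> before the stores.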
3403 define void @vec384_v3f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3404 ; SCALAR-LABEL: vec384_v3f64:
3406 ; SCALAR-NEXT: movq (%rdi), %rax
3407 ; SCALAR-NEXT: movq 8(%rdi), %rcx
3408 ; SCALAR-NEXT: movq 16(%rdi), %rdi
3409 ; SCALAR-NEXT: notq %rdi
3410 ; SCALAR-NEXT: notq %rcx
3411 ; SCALAR-NEXT: notq %rax
3412 ; SCALAR-NEXT: movq %rax, (%rsi)
3413 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
3414 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
3415 ; SCALAR-NEXT: movq %rax, (%rdx)
3416 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
3417 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
3418 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
3419 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
3420 ; SCALAR-NEXT: movq %rax, 32(%rdx)
3423 ; SSE2-LABEL: vec384_v3f64:
3425 ; SSE2-NEXT: movq 16(%rdi), %rax
3426 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3427 ; SSE2-NEXT: pxor (%rdi), %xmm0
3428 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3429 ; SSE2-NEXT: notq %rax
3430 ; SSE2-NEXT: movq %rax, 16(%rsi)
3431 ; SSE2-NEXT: movq %rax, 16(%rdx)
3432 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3433 ; SSE2-NEXT: movq %rax, 48(%rdx)
3434 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
3437 ; AVX1-LABEL: vec384_v3f64:
3439 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3440 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
3441 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
3442 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
3443 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
3444 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
3445 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
3446 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
3447 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
3448 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
3449 ; AVX1-NEXT: vzeroupper
3452 ; AVX2-LABEL: vec384_v3f64:
3454 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
3455 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
3456 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
3457 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
3458 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
3459 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
3460 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
3461 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
3462 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
3463 ; AVX2-NEXT: vzeroupper
3465 %in.subvec.not = load <3 x i64>, ptr %in.subvec.ptr, align 64
3466 %in.subvec.int = xor <3 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1>
3467 %in.subvec = bitcast <3 x i64> %in.subvec.int to <3 x double>
3468 store <3 x double> %in.subvec, ptr %out.subvec.ptr, align 64
3469 %out.subvec0.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 0
3470 store <3 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
3471 %out.subvec1.ptr = getelementptr <3 x double>, ptr %out.vec.ptr, i64 1
3472 store <3 x double> %in.subvec, ptr %out.subvec1.ptr, align 8
  ret void
}
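; vec384_v4i8: NOT a <4 x i8> subvector and store it 12 times into %out.vec.ptr (12 x 32 bits = 384 bits).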
3476 define void @vec384_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3477 ; SCALAR-LABEL: vec384_v4i8:
3479 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
3480 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
3481 ; SCALAR-NEXT: movzbl (%rdi), %eax
3482 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
3483 ; SCALAR-NEXT: notb %al
3484 ; SCALAR-NEXT: notb %dil
3485 ; SCALAR-NEXT: notb %cl
3486 ; SCALAR-NEXT: notb %r8b
3487 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
3488 ; SCALAR-NEXT: movb %cl, 2(%rsi)
3489 ; SCALAR-NEXT: movb %dil, 1(%rsi)
3490 ; SCALAR-NEXT: movb %al, (%rsi)
3491 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
3492 ; SCALAR-NEXT: movb %cl, 2(%rdx)
3493 ; SCALAR-NEXT: movb %dil, 1(%rdx)
3494 ; SCALAR-NEXT: movb %al, (%rdx)
3495 ; SCALAR-NEXT: movb %r8b, 7(%rdx)
3496 ; SCALAR-NEXT: movb %cl, 6(%rdx)
3497 ; SCALAR-NEXT: movb %dil, 5(%rdx)
3498 ; SCALAR-NEXT: movb %al, 4(%rdx)
3499 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
3500 ; SCALAR-NEXT: movb %cl, 10(%rdx)
3501 ; SCALAR-NEXT: movb %dil, 9(%rdx)
3502 ; SCALAR-NEXT: movb %al, 8(%rdx)
3503 ; SCALAR-NEXT: movb %r8b, 15(%rdx)
3504 ; SCALAR-NEXT: movb %cl, 14(%rdx)
3505 ; SCALAR-NEXT: movb %dil, 13(%rdx)
3506 ; SCALAR-NEXT: movb %al, 12(%rdx)
3507 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
3508 ; SCALAR-NEXT: movb %cl, 18(%rdx)
3509 ; SCALAR-NEXT: movb %dil, 17(%rdx)
3510 ; SCALAR-NEXT: movb %al, 16(%rdx)
3511 ; SCALAR-NEXT: movb %r8b, 23(%rdx)
3512 ; SCALAR-NEXT: movb %cl, 22(%rdx)
3513 ; SCALAR-NEXT: movb %dil, 21(%rdx)
3514 ; SCALAR-NEXT: movb %al, 20(%rdx)
3515 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
3516 ; SCALAR-NEXT: movb %cl, 26(%rdx)
3517 ; SCALAR-NEXT: movb %dil, 25(%rdx)
3518 ; SCALAR-NEXT: movb %al, 24(%rdx)
3519 ; SCALAR-NEXT: movb %r8b, 31(%rdx)
3520 ; SCALAR-NEXT: movb %cl, 30(%rdx)
3521 ; SCALAR-NEXT: movb %dil, 29(%rdx)
3522 ; SCALAR-NEXT: movb %al, 28(%rdx)
3523 ; SCALAR-NEXT: movb %r8b, 35(%rdx)
3524 ; SCALAR-NEXT: movb %cl, 34(%rdx)
3525 ; SCALAR-NEXT: movb %dil, 33(%rdx)
3526 ; SCALAR-NEXT: movb %al, 32(%rdx)
3527 ; SCALAR-NEXT: movb %r8b, 39(%rdx)
3528 ; SCALAR-NEXT: movb %cl, 38(%rdx)
3529 ; SCALAR-NEXT: movb %dil, 37(%rdx)
3530 ; SCALAR-NEXT: movb %al, 36(%rdx)
3531 ; SCALAR-NEXT: movb %r8b, 43(%rdx)
3532 ; SCALAR-NEXT: movb %cl, 42(%rdx)
3533 ; SCALAR-NEXT: movb %dil, 41(%rdx)
3534 ; SCALAR-NEXT: movb %al, 40(%rdx)
3535 ; SCALAR-NEXT: movb %r8b, 47(%rdx)
3536 ; SCALAR-NEXT: movb %cl, 46(%rdx)
3537 ; SCALAR-NEXT: movb %dil, 45(%rdx)
3538 ; SCALAR-NEXT: movb %al, 44(%rdx)
3541 ; SSE2-LABEL: vec384_v4i8:
3543 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3544 ; SSE2-NEXT: pxor (%rdi), %xmm0
3545 ; SSE2-NEXT: movd %xmm0, (%rsi)
3546 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3547 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3548 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3549 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3552 ; AVX1-LABEL: vec384_v4i8:
3554 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3555 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
3556 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
3557 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3558 ; AVX1-NEXT: vmovdqa %xmm0, 16(%rdx)
3559 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
3560 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
3563 ; AVX2-LABEL: vec384_v4i8:
3565 ; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3566 ; AVX2-NEXT: vpxor (%rdi), %xmm0, %xmm0
3567 ; AVX2-NEXT: vmovd %xmm0, (%rsi)
3568 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
3569 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
3570 ; AVX2-NEXT: vmovdqa %xmm0, 32(%rdx)
3571 ; AVX2-NEXT: vzeroupper
3573 %in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
3574 %in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
3575 store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
3576 %out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
3577 store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
3578 %out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
3579 store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
3580 %out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
3581 store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
3582 %out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
3583 store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
3584 %out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
3585 store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
3586 %out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
3587 store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
3588 %out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
3589 store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
3590 %out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
3591 store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
3592 %out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8
3593 store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32
3594 %out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9
3595 store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4
3596 %out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10
3597 store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8
3598 %out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11
3599 store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4
  ret void
}
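; vec384_v4i16: NOT a <4 x i16> subvector and store it 6 times into %out.vec.ptr (6 x 64 bits = 384 bits).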
3603 define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3604 ; SCALAR-LABEL: vec384_v4i16:
3606 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
3607 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
3608 ; SCALAR-NEXT: movl (%rdi), %eax
3609 ; SCALAR-NEXT: movl 4(%rdi), %edi
3610 ; SCALAR-NEXT: notl %eax
3611 ; SCALAR-NEXT: notl %ecx
3612 ; SCALAR-NEXT: notl %edi
3613 ; SCALAR-NEXT: notl %r8d
3614 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
3615 ; SCALAR-NEXT: movw %di, 4(%rsi)
3616 ; SCALAR-NEXT: movw %cx, 2(%rsi)
3617 ; SCALAR-NEXT: movw %ax, (%rsi)
3618 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
3619 ; SCALAR-NEXT: movw %di, 4(%rdx)
3620 ; SCALAR-NEXT: movw %cx, 2(%rdx)
3621 ; SCALAR-NEXT: movw %ax, (%rdx)
3622 ; SCALAR-NEXT: movw %r8w, 14(%rdx)
3623 ; SCALAR-NEXT: movw %di, 12(%rdx)
3624 ; SCALAR-NEXT: movw %cx, 10(%rdx)
3625 ; SCALAR-NEXT: movw %ax, 8(%rdx)
3626 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
3627 ; SCALAR-NEXT: movw %di, 20(%rdx)
3628 ; SCALAR-NEXT: movw %cx, 18(%rdx)
3629 ; SCALAR-NEXT: movw %ax, 16(%rdx)
3630 ; SCALAR-NEXT: movw %r8w, 30(%rdx)
3631 ; SCALAR-NEXT: movw %di, 28(%rdx)
3632 ; SCALAR-NEXT: movw %cx, 26(%rdx)
3633 ; SCALAR-NEXT: movw %ax, 24(%rdx)
3634 ; SCALAR-NEXT: movw %r8w, 38(%rdx)
3635 ; SCALAR-NEXT: movw %di, 36(%rdx)
3636 ; SCALAR-NEXT: movw %cx, 34(%rdx)
3637 ; SCALAR-NEXT: movw %ax, 32(%rdx)
3638 ; SCALAR-NEXT: movw %r8w, 46(%rdx)
3639 ; SCALAR-NEXT: movw %di, 44(%rdx)
3640 ; SCALAR-NEXT: movw %cx, 42(%rdx)
3641 ; SCALAR-NEXT: movw %ax, 40(%rdx)
3644 ; SSE2-LABEL: vec384_v4i16:
3646 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3647 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
3648 ; SSE2-NEXT: pxor %xmm0, %xmm1
3649 ; SSE2-NEXT: movq %xmm1, (%rsi)
3650 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
3651 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3652 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3653 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3656 ; AVX1-LABEL: vec384_v4i16:
3658 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3659 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
3660 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
3661 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
3662 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
3663 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
3664 ; AVX1-NEXT: vmovaps %ymm1, (%rdx)
3665 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
3666 ; AVX1-NEXT: vzeroupper
3669 ; AVX2-ONLY-LABEL: vec384_v4i16:
3670 ; AVX2-ONLY: # %bb.0:
3671 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3672 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
3673 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
3674 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
3675 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
3676 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
3677 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
3678 ; AVX2-ONLY-NEXT: vzeroupper
3679 ; AVX2-ONLY-NEXT: retq
3681 ; AVX512-LABEL: vec384_v4i16:
3683 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3684 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
3685 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
3686 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
3687 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
3688 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
3689 ; AVX512-NEXT: vzeroupper
3691 %in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
3692 %in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
3693 store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
3694 %out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
3695 store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
3696 %out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
3697 store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
3698 %out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
3699 store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
3700 %out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
3701 store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
3702 %out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4
3703 store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32
3704 %out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5
3705 store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8
  ret void
}
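; vec384_v4i32: NOT a <4 x i32> subvector and store it 3 times into %out.vec.ptr (3 x 128 bits = 384 bits).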
3709 define void @vec384_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3710 ; SCALAR-LABEL: vec384_v4i32:
3712 ; SCALAR-NEXT: movaps (%rdi), %xmm0
3713 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3714 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
3715 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
3716 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
3717 ; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
3720 ; SSE2-LABEL: vec384_v4i32:
3722 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3723 ; SSE2-NEXT: pxor (%rdi), %xmm0
3724 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3725 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3726 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3727 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3730 ; AVX-LABEL: vec384_v4i32:
3732 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3733 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3734 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
3735 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3736 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3737 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
3739 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
3740 %in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
3741 store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
3742 %out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
3743 store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
3744 %out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
3745 store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
3746 %out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2
3747 store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32
  ret void
}
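; vec384_v4f32: as vec384_v4i32, but the inverted value is bitcast to <4 x float> before the stores.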
3751 define void @vec384_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3752 ; SCALAR-LABEL: vec384_v4f32:
3754 ; SCALAR-NEXT: movaps (%rdi), %xmm0
3755 ; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
3756 ; SCALAR-NEXT: movaps %xmm0, (%rsi)
3757 ; SCALAR-NEXT: movaps %xmm0, (%rdx)
3758 ; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
3759 ; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
3762 ; SSE2-LABEL: vec384_v4f32:
3764 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
3765 ; SSE2-NEXT: pxor (%rdi), %xmm0
3766 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
3767 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
3768 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
3769 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
3772 ; AVX-LABEL: vec384_v4f32:
3774 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
3775 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
3776 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
3777 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
3778 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
3779 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
3781 %in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
3782 %in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
3783 %in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
3784 store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
3785 %out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
3786 store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
3787 %out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
3788 store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
3789 %out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2
3790 store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32
  ret void
}
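; vec384_v6i8: NOT a <6 x i8> subvector and store it 8 times into %out.vec.ptr (8 x 48 bits = 384 bits).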
3794 define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
3795 ; SCALAR-LABEL: vec384_v6i8:
3797 ; SCALAR-NEXT: movq (%rdi), %rdi
3798 ; SCALAR-NEXT: movq %rdi, %rax
3799 ; SCALAR-NEXT: shrq $40, %rax
3800 ; SCALAR-NEXT: movq %rdi, %rcx
3801 ; SCALAR-NEXT: shrq $32, %rcx
3802 ; SCALAR-NEXT: movl %edi, %r8d
3803 ; SCALAR-NEXT: shrl $24, %r8d
3804 ; SCALAR-NEXT: movl %edi, %r9d
3805 ; SCALAR-NEXT: shrl $16, %r9d
3806 ; SCALAR-NEXT: movl %edi, %r10d
3807 ; SCALAR-NEXT: shrl $8, %r10d
3808 ; SCALAR-NEXT: notb %dil
3809 ; SCALAR-NEXT: movzbl %dil, %edi
3810 ; SCALAR-NEXT: notb %r10b
3811 ; SCALAR-NEXT: movzbl %r10b, %r10d
3812 ; SCALAR-NEXT: shll $8, %r10d
3813 ; SCALAR-NEXT: orl %edi, %r10d
3814 ; SCALAR-NEXT: notb %r9b
3815 ; SCALAR-NEXT: movzbl %r9b, %edi
3816 ; SCALAR-NEXT: notb %r8b
3817 ; SCALAR-NEXT: movzbl %r8b, %r8d
3818 ; SCALAR-NEXT: shll $8, %r8d
3819 ; SCALAR-NEXT: orl %edi, %r8d
3820 ; SCALAR-NEXT: notb %cl
3821 ; SCALAR-NEXT: movzbl %cl, %ecx
3822 ; SCALAR-NEXT: notb %al
3823 ; SCALAR-NEXT: movzbl %al, %eax
3824 ; SCALAR-NEXT: shll $8, %eax
3825 ; SCALAR-NEXT: orl %ecx, %eax
3826 ; SCALAR-NEXT: movw %ax, 4(%rsi)
3827 ; SCALAR-NEXT: shll $16, %r8d
3828 ; SCALAR-NEXT: movzwl %r10w, %ecx
3829 ; SCALAR-NEXT: orl %r8d, %ecx
3830 ; SCALAR-NEXT: movl %ecx, (%rsi)
3831 ; SCALAR-NEXT: movw %ax, 4(%rdx)
3832 ; SCALAR-NEXT: movl %ecx, (%rdx)
3833 ; SCALAR-NEXT: movw %ax, 12(%rdx)
3834 ; SCALAR-NEXT: movl %ecx, 8(%rdx)
3835 ; SCALAR-NEXT: movw %ax, 20(%rdx)
3836 ; SCALAR-NEXT: movl %ecx, 16(%rdx)
3837 ; SCALAR-NEXT: movw %ax, 28(%rdx)
3838 ; SCALAR-NEXT: movl %ecx, 24(%rdx)
3839 ; SCALAR-NEXT: movw %ax, 36(%rdx)
3840 ; SCALAR-NEXT: movl %ecx, 32(%rdx)
3841 ; SCALAR-NEXT: movw %ax, 44(%rdx)
3842 ; SCALAR-NEXT: movl %ecx, 40(%rdx)
3843 ; SCALAR-NEXT: movw %ax, 52(%rdx)
3844 ; SCALAR-NEXT: movl %ecx, 48(%rdx)
3845 ; SCALAR-NEXT: movw %ax, 60(%rdx)
3846 ; SCALAR-NEXT: movl %ecx, 56(%rdx)
3849 ; SSE2-ONLY-LABEL: vec384_v6i8:
3850 ; SSE2-ONLY: # %bb.0:
3851 ; SSE2-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3852 ; SSE2-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
3853 ; SSE2-ONLY-NEXT: pxor %xmm0, %xmm1
3854 ; SSE2-ONLY-NEXT: movd %xmm1, (%rsi)
3855 ; SSE2-ONLY-NEXT: pextrw $2, %xmm1, %eax
3856 ; SSE2-ONLY-NEXT: movw %ax, 4(%rsi)
3857 ; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
3858 ; SSE2-ONLY-NEXT: movd %xmm1, (%rdx)
3859 ; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
3860 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
3861 ; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
3862 ; SSE2-ONLY-NEXT: movd %xmm1, 16(%rdx)
3863 ; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
3864 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
3865 ; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
3866 ; SSE2-ONLY-NEXT: movd %xmm1, 32(%rdx)
3867 ; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
3868 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
3869 ; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
3870 ; SSE2-ONLY-NEXT: movd %xmm1, 48(%rdx)
3871 ; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
3872 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
3873 ; SSE2-ONLY-NEXT: retq
3875 ; SSE3-LABEL: vec384_v6i8:
3877 ; SSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3878 ; SSE3-NEXT: pcmpeqd %xmm1, %xmm1
3879 ; SSE3-NEXT: pxor %xmm0, %xmm1
3880 ; SSE3-NEXT: movd %xmm1, (%rsi)
3881 ; SSE3-NEXT: pextrw $2, %xmm1, %eax
3882 ; SSE3-NEXT: movw %ax, 4(%rsi)
3883 ; SSE3-NEXT: movw %ax, 4(%rdx)
3884 ; SSE3-NEXT: movd %xmm1, (%rdx)
3885 ; SSE3-NEXT: movw %ax, 12(%rdx)
3886 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
3887 ; SSE3-NEXT: movw %ax, 20(%rdx)
3888 ; SSE3-NEXT: movd %xmm1, 16(%rdx)
3889 ; SSE3-NEXT: movw %ax, 28(%rdx)
3890 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
3891 ; SSE3-NEXT: movw %ax, 36(%rdx)
3892 ; SSE3-NEXT: movd %xmm1, 32(%rdx)
3893 ; SSE3-NEXT: movw %ax, 44(%rdx)
3894 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
3895 ; SSE3-NEXT: movw %ax, 52(%rdx)
3896 ; SSE3-NEXT: movd %xmm1, 48(%rdx)
3897 ; SSE3-NEXT: movw %ax, 60(%rdx)
3898 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
3901 ; SSSE3-ONLY-LABEL: vec384_v6i8:
3902 ; SSSE3-ONLY: # %bb.0:
3903 ; SSSE3-ONLY-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3904 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm1, %xmm1
3905 ; SSSE3-ONLY-NEXT: pxor %xmm0, %xmm1
3906 ; SSSE3-ONLY-NEXT: movd %xmm1, (%rsi)
3907 ; SSSE3-ONLY-NEXT: pextrw $2, %xmm1, %eax
3908 ; SSSE3-ONLY-NEXT: movw %ax, 4(%rsi)
3909 ; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
3910 ; SSSE3-ONLY-NEXT: movd %xmm1, (%rdx)
3911 ; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
3912 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
3913 ; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
3914 ; SSSE3-ONLY-NEXT: movd %xmm1, 16(%rdx)
3915 ; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
3916 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
3917 ; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
3918 ; SSSE3-ONLY-NEXT: movd %xmm1, 32(%rdx)
3919 ; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
3920 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
3921 ; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
3922 ; SSSE3-ONLY-NEXT: movd %xmm1, 48(%rdx)
3923 ; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
3924 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
3925 ; SSSE3-ONLY-NEXT: retq
3927 ; SSE41-LABEL: vec384_v6i8:
3929 ; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3930 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
3931 ; SSE41-NEXT: pxor %xmm0, %xmm1
3932 ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rsi)
3933 ; SSE41-NEXT: movd %xmm1, (%rsi)
3934 ; SSE41-NEXT: pextrw $2, %xmm1, 4(%rdx)
3935 ; SSE41-NEXT: movd %xmm1, (%rdx)
3936 ; SSE41-NEXT: pextrw $2, %xmm1, 12(%rdx)
3937 ; SSE41-NEXT: movd %xmm1, 8(%rdx)
3938 ; SSE41-NEXT: pextrw $2, %xmm1, 20(%rdx)
3939 ; SSE41-NEXT: movd %xmm1, 16(%rdx)
3940 ; SSE41-NEXT: pextrw $2, %xmm1, 28(%rdx)
3941 ; SSE41-NEXT: movd %xmm1, 24(%rdx)
3942 ; SSE41-NEXT: pextrw $2, %xmm1, 36(%rdx)
3943 ; SSE41-NEXT: movd %xmm1, 32(%rdx)
3944 ; SSE41-NEXT: pextrw $2, %xmm1, 44(%rdx)
3945 ; SSE41-NEXT: movd %xmm1, 40(%rdx)
3946 ; SSE41-NEXT: pextrw $2, %xmm1, 52(%rdx)
3947 ; SSE41-NEXT: movd %xmm1, 48(%rdx)
3948 ; SSE41-NEXT: pextrw $2, %xmm1, 60(%rdx)
3949 ; SSE41-NEXT: movd %xmm1, 56(%rdx)
3952 ; SSE42-LABEL: vec384_v6i8:
3954 ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
3955 ; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
3956 ; SSE42-NEXT: pxor %xmm0, %xmm1
3957 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rsi)
3958 ; SSE42-NEXT: movd %xmm1, (%rsi)
3959 ; SSE42-NEXT: pextrw $2, %xmm1, 4(%rdx)
3960 ; SSE42-NEXT: movd %xmm1, (%rdx)
3961 ; SSE42-NEXT: pextrw $2, %xmm1, 12(%rdx)
3962 ; SSE42-NEXT: movd %xmm1, 8(%rdx)
3963 ; SSE42-NEXT: pextrw $2, %xmm1, 20(%rdx)
3964 ; SSE42-NEXT: movd %xmm1, 16(%rdx)
3965 ; SSE42-NEXT: pextrw $2, %xmm1, 28(%rdx)
3966 ; SSE42-NEXT: movd %xmm1, 24(%rdx)
3967 ; SSE42-NEXT: pextrw $2, %xmm1, 36(%rdx)
3968 ; SSE42-NEXT: movd %xmm1, 32(%rdx)
3969 ; SSE42-NEXT: pextrw $2, %xmm1, 44(%rdx)
3970 ; SSE42-NEXT: movd %xmm1, 40(%rdx)
3971 ; SSE42-NEXT: pextrw $2, %xmm1, 52(%rdx)
3972 ; SSE42-NEXT: movd %xmm1, 48(%rdx)
3973 ; SSE42-NEXT: pextrw $2, %xmm1, 60(%rdx)
3974 ; SSE42-NEXT: movd %xmm1, 56(%rdx)
3977 ; AVX1-LABEL: vec384_v6i8:
3979 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
3980 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
3981 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
3982 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rsi)
3983 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
3984 ; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdx)
3985 ; AVX1-NEXT: vmovd %xmm0, (%rdx)
3986 ; AVX1-NEXT: vpextrw $2, %xmm0, 12(%rdx)
3987 ; AVX1-NEXT: vmovd %xmm0, 8(%rdx)
3988 ; AVX1-NEXT: vpextrw $2, %xmm0, 20(%rdx)
3989 ; AVX1-NEXT: vmovd %xmm0, 16(%rdx)
3990 ; AVX1-NEXT: vpextrw $2, %xmm0, 28(%rdx)
3991 ; AVX1-NEXT: vmovd %xmm0, 24(%rdx)
3992 ; AVX1-NEXT: vpextrw $2, %xmm0, 36(%rdx)
3993 ; AVX1-NEXT: vmovd %xmm0, 32(%rdx)
3994 ; AVX1-NEXT: vpextrw $2, %xmm0, 44(%rdx)
3995 ; AVX1-NEXT: vmovd %xmm0, 40(%rdx)
3996 ; AVX1-NEXT: vpextrw $2, %xmm0, 52(%rdx)
3997 ; AVX1-NEXT: vmovd %xmm0, 48(%rdx)
3998 ; AVX1-NEXT: vpextrw $2, %xmm0, 60(%rdx)
3999 ; AVX1-NEXT: vmovd %xmm0, 56(%rdx)
4002 ; AVX2-ONLY-LABEL: vec384_v6i8:
4003 ; AVX2-ONLY: # %bb.0:
4004 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4005 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
4006 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
4007 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rsi)
4008 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
4009 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 4(%rdx)
4010 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rdx)
4011 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 12(%rdx)
4012 ; AVX2-ONLY-NEXT: vmovd %xmm0, 8(%rdx)
4013 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 20(%rdx)
4014 ; AVX2-ONLY-NEXT: vmovd %xmm0, 16(%rdx)
4015 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 28(%rdx)
4016 ; AVX2-ONLY-NEXT: vmovd %xmm0, 24(%rdx)
4017 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 36(%rdx)
4018 ; AVX2-ONLY-NEXT: vmovd %xmm0, 32(%rdx)
4019 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 44(%rdx)
4020 ; AVX2-ONLY-NEXT: vmovd %xmm0, 40(%rdx)
4021 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 52(%rdx)
4022 ; AVX2-ONLY-NEXT: vmovd %xmm0, 48(%rdx)
4023 ; AVX2-ONLY-NEXT: vpextrw $2, %xmm0, 60(%rdx)
4024 ; AVX2-ONLY-NEXT: vmovd %xmm0, 56(%rdx)
4025 ; AVX2-ONLY-NEXT: retq
4027 ; AVX512-LABEL: vec384_v6i8:
4029 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4030 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
4031 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi)
4032 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
4033 ; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx)
4034 ; AVX512-NEXT: vmovd %xmm0, (%rdx)
4035 ; AVX512-NEXT: vpextrw $2, %xmm0, 12(%rdx)
4036 ; AVX512-NEXT: vmovd %xmm0, 8(%rdx)
4037 ; AVX512-NEXT: vpextrw $2, %xmm0, 20(%rdx)
4038 ; AVX512-NEXT: vmovd %xmm0, 16(%rdx)
4039 ; AVX512-NEXT: vpextrw $2, %xmm0, 28(%rdx)
4040 ; AVX512-NEXT: vmovd %xmm0, 24(%rdx)
4041 ; AVX512-NEXT: vpextrw $2, %xmm0, 36(%rdx)
4042 ; AVX512-NEXT: vmovd %xmm0, 32(%rdx)
4043 ; AVX512-NEXT: vpextrw $2, %xmm0, 44(%rdx)
4044 ; AVX512-NEXT: vmovd %xmm0, 40(%rdx)
4045 ; AVX512-NEXT: vpextrw $2, %xmm0, 52(%rdx)
4046 ; AVX512-NEXT: vmovd %xmm0, 48(%rdx)
4047 ; AVX512-NEXT: vpextrw $2, %xmm0, 60(%rdx)
4048 ; AVX512-NEXT: vmovd %xmm0, 56(%rdx)
4050 %in.subvec.not = load <6 x i8>, ptr %in.subvec.ptr, align 64
4051 %in.subvec = xor <6 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4052 store <6 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4053 %out.subvec0.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 0
4054 store <6 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4055 %out.subvec1.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 1
4056 store <6 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
4057 %out.subvec2.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 2
4058 store <6 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
4059 %out.subvec3.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 3
4060 store <6 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
4061 %out.subvec4.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 4
4062 store <6 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
4063 %out.subvec5.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 5
4064 store <6 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
4065 %out.subvec6.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 6
4066 store <6 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
4067 %out.subvec7.ptr = getelementptr <6 x i8>, ptr %out.vec.ptr, i64 7
4068 store <6 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
  ret void
}
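; vec384_v6i16: NOT a <6 x i16> subvector and store it 4 times into %out.vec.ptr (4 x 96 bits = 384 bits).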
4072 define void @vec384_v6i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4073 ; SCALAR-LABEL: vec384_v6i16:
4075 ; SCALAR-NEXT: movl 8(%rdi), %eax
4076 ; SCALAR-NEXT: movq (%rdi), %rcx
4077 ; SCALAR-NEXT: movq %rcx, %rdi
4078 ; SCALAR-NEXT: shrq $32, %rdi
4079 ; SCALAR-NEXT: movq %rcx, %r8
4080 ; SCALAR-NEXT: shrq $48, %r8
4081 ; SCALAR-NEXT: notl %r8d
4082 ; SCALAR-NEXT: shll $16, %r8d
4083 ; SCALAR-NEXT: notl %edi
4084 ; SCALAR-NEXT: movzwl %di, %edi
4085 ; SCALAR-NEXT: orl %r8d, %edi
4086 ; SCALAR-NEXT: notl %ecx
4087 ; SCALAR-NEXT: notl %eax
4088 ; SCALAR-NEXT: movl %eax, 8(%rsi)
4089 ; SCALAR-NEXT: shlq $32, %rdi
4090 ; SCALAR-NEXT: orq %rdi, %rcx
4091 ; SCALAR-NEXT: movq %rcx, (%rsi)
4092 ; SCALAR-NEXT: movl %eax, 8(%rdx)
4093 ; SCALAR-NEXT: movq %rcx, (%rdx)
4094 ; SCALAR-NEXT: movl %eax, 24(%rdx)
4095 ; SCALAR-NEXT: movq %rcx, 16(%rdx)
4096 ; SCALAR-NEXT: movl %eax, 40(%rdx)
4097 ; SCALAR-NEXT: movq %rcx, 32(%rdx)
4098 ; SCALAR-NEXT: movl %eax, 56(%rdx)
4099 ; SCALAR-NEXT: movq %rcx, 48(%rdx)
4102 ; SSE2-ONLY-LABEL: vec384_v6i16:
4103 ; SSE2-ONLY: # %bb.0:
4104 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4105 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
4106 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
4107 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4108 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
4109 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
4110 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
4111 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
4112 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
4113 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
4114 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
4115 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
4116 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
4117 ; SSE2-ONLY-NEXT: retq
4119 ; SSE3-LABEL: vec384_v6i16:
4121 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
4122 ; SSE3-NEXT: pxor (%rdi), %xmm0
4123 ; SSE3-NEXT: movq %xmm0, (%rsi)
4124 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4125 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
4126 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
4127 ; SSE3-NEXT: movq %xmm0, (%rdx)
4128 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
4129 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
4130 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
4131 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
4132 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
4133 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
4136 ; SSSE3-ONLY-LABEL: vec384_v6i16:
4137 ; SSSE3-ONLY: # %bb.0:
4138 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4139 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
4140 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
4141 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4142 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
4143 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
4144 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
4145 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
4146 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
4147 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
4148 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
4149 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
4150 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
4151 ; SSSE3-ONLY-NEXT: retq
4153 ; SSE41-LABEL: vec384_v6i16:
4155 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
4156 ; SSE41-NEXT: pxor (%rdi), %xmm0
4157 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
4158 ; SSE41-NEXT: movq %xmm0, (%rsi)
4159 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
4160 ; SSE41-NEXT: movq %xmm0, (%rdx)
4161 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
4162 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
4163 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
4164 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
4165 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
4166 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
4169 ; SSE42-LABEL: vec384_v6i16:
4171 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
4172 ; SSE42-NEXT: pxor (%rdi), %xmm0
4173 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
4174 ; SSE42-NEXT: movq %xmm0, (%rsi)
4175 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
4176 ; SSE42-NEXT: movq %xmm0, (%rdx)
4177 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
4178 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
4179 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
4180 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
4181 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
4182 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
4185 ; AVX-LABEL: vec384_v6i16:
4187 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
4188 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
4189 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
4190 ; AVX-NEXT: vmovq %xmm0, (%rsi)
4191 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
4192 ; AVX-NEXT: vmovq %xmm0, (%rdx)
4193 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
4194 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
4195 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
4196 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
4197 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
4198 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
4200 %in.subvec.not = load <6 x i16>, ptr %in.subvec.ptr, align 64
4201 %in.subvec = xor <6 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4202 store <6 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4203 %out.subvec0.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 0
4204 store <6 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4205 %out.subvec1.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 1
4206 store <6 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
4207 %out.subvec2.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 2
4208 store <6 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
4209 %out.subvec3.ptr = getelementptr <6 x i16>, ptr %out.vec.ptr, i64 3
4210 store <6 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
  ret void
}
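; vec384_v6i32: NOT a <6 x i32> subvector and store it twice into %out.vec.ptr (2 x 192 bits = 384 bits).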
4214 define void @vec384_v6i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4215 ; SCALAR-LABEL: vec384_v6i32:
4217 ; SCALAR-NEXT: movq (%rdi), %rax
4218 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4219 ; SCALAR-NEXT: movq %rax, %r8
4220 ; SCALAR-NEXT: shrq $32, %r8
4221 ; SCALAR-NEXT: movq %rcx, %r9
4222 ; SCALAR-NEXT: shrq $32, %r9
4223 ; SCALAR-NEXT: movq 16(%rdi), %rdi
4224 ; SCALAR-NEXT: movq %rdi, %r10
4225 ; SCALAR-NEXT: shrq $32, %r10
4226 ; SCALAR-NEXT: notl %r10d
4227 ; SCALAR-NEXT: shlq $32, %r10
4228 ; SCALAR-NEXT: notl %edi
4229 ; SCALAR-NEXT: orq %r10, %rdi
4230 ; SCALAR-NEXT: notl %r9d
4231 ; SCALAR-NEXT: shlq $32, %r9
4232 ; SCALAR-NEXT: notl %ecx
4233 ; SCALAR-NEXT: orq %r9, %rcx
4234 ; SCALAR-NEXT: notl %r8d
4235 ; SCALAR-NEXT: shlq $32, %r8
4236 ; SCALAR-NEXT: notl %eax
4237 ; SCALAR-NEXT: orq %r8, %rax
4238 ; SCALAR-NEXT: movq %rax, (%rsi)
4239 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
4240 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
4241 ; SCALAR-NEXT: movq %rax, (%rdx)
4242 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
4243 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
4244 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
4245 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
4246 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4249 ; SSE2-LABEL: vec384_v6i32:
4251 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4252 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4253 ; SSE2-NEXT: pxor %xmm0, %xmm1
4254 ; SSE2-NEXT: pxor (%rdi), %xmm0
4255 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4256 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
4257 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
4258 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4259 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
4260 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
4263 ; AVX1-LABEL: vec384_v6i32:
4265 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
4266 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
4267 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
4268 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4269 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
4270 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
4271 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
4272 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
4273 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
4274 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
4275 ; AVX1-NEXT: vzeroupper
4278 ; AVX2-LABEL: vec384_v6i32:
4280 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
4281 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
4282 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4283 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
4284 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
4285 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
4286 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
4287 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
4288 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
4289 ; AVX2-NEXT: vzeroupper
4291 %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64
4292 %in.subvec = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
4293 store <6 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
4294 %out.subvec0.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 0
4295 store <6 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
4296 %out.subvec1.ptr = getelementptr <6 x i32>, ptr %out.vec.ptr, i64 1
4297 store <6 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
  ret void
}
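; vec384_v6f32: as vec384_v6i32, but the inverted value is bitcast to <6 x float> before the stores.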
4301 define void @vec384_v6f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4302 ; SCALAR-LABEL: vec384_v6f32:
4304 ; SCALAR-NEXT: movq (%rdi), %rax
4305 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4306 ; SCALAR-NEXT: movq %rax, %r8
4307 ; SCALAR-NEXT: shrq $32, %r8
4308 ; SCALAR-NEXT: movq %rcx, %r9
4309 ; SCALAR-NEXT: shrq $32, %r9
4310 ; SCALAR-NEXT: movq 16(%rdi), %rdi
4311 ; SCALAR-NEXT: movq %rdi, %r10
4312 ; SCALAR-NEXT: shrq $32, %r10
4313 ; SCALAR-NEXT: notl %r10d
4314 ; SCALAR-NEXT: shlq $32, %r10
4315 ; SCALAR-NEXT: notl %edi
4316 ; SCALAR-NEXT: orq %r10, %rdi
4317 ; SCALAR-NEXT: notl %r9d
4318 ; SCALAR-NEXT: shlq $32, %r9
4319 ; SCALAR-NEXT: notl %ecx
4320 ; SCALAR-NEXT: orq %r9, %rcx
4321 ; SCALAR-NEXT: notl %r8d
4322 ; SCALAR-NEXT: shlq $32, %r8
4323 ; SCALAR-NEXT: notl %eax
4324 ; SCALAR-NEXT: orq %r8, %rax
4325 ; SCALAR-NEXT: movq %rax, (%rsi)
4326 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
4327 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
4328 ; SCALAR-NEXT: movq %rax, (%rdx)
4329 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
4330 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
4331 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
4332 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
4333 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4336 ; SSE2-LABEL: vec384_v6f32:
4338 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4339 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4340 ; SSE2-NEXT: pxor %xmm0, %xmm1
4341 ; SSE2-NEXT: pxor (%rdi), %xmm0
4342 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4343 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
4344 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
4345 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4346 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
4347 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
4350 ; AVX1-LABEL: vec384_v6f32:
4352 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
4353 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
4354 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
4355 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4356 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
4357 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
4358 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
4359 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
4360 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
4361 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
4362 ; AVX1-NEXT: vzeroupper
4365 ; AVX2-LABEL: vec384_v6f32:
4367 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
4368 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
4369 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4370 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
4371 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
4372 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
4373 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
4374 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
4375 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
4376 ; AVX2-NEXT: vzeroupper
4378 %in.subvec.not = load <6 x i32>, ptr %in.subvec.ptr, align 64
4379 %in.subvec.int = xor <6 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
4380 %in.subvec = bitcast <6 x i32> %in.subvec.int to <6 x float>
4381 store <6 x float> %in.subvec, ptr %out.subvec.ptr, align 64
4382 %out.subvec0.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 0
4383 store <6 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
4384 %out.subvec1.ptr = getelementptr <6 x float>, ptr %out.vec.ptr, i64 1
4385 store <6 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
4386 ret void
4387 }
4389 define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4390 ; SCALAR-LABEL: vec384_v8i8:
4392 ; SCALAR-NEXT: pushq %rbx
4393 ; SCALAR-NEXT: movzbl 7(%rdi), %ebx
4394 ; SCALAR-NEXT: movzbl 6(%rdi), %r11d
4395 ; SCALAR-NEXT: movzbl 5(%rdi), %r10d
4396 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
4397 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
4398 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
4399 ; SCALAR-NEXT: movzbl (%rdi), %eax
4400 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
4401 ; SCALAR-NEXT: notb %al
4402 ; SCALAR-NEXT: notb %dil
4403 ; SCALAR-NEXT: notb %cl
4404 ; SCALAR-NEXT: notb %r8b
4405 ; SCALAR-NEXT: notb %r9b
4406 ; SCALAR-NEXT: notb %r10b
4407 ; SCALAR-NEXT: notb %r11b
4408 ; SCALAR-NEXT: notb %bl
4409 ; SCALAR-NEXT: movb %bl, 7(%rsi)
4410 ; SCALAR-NEXT: movb %r11b, 6(%rsi)
4411 ; SCALAR-NEXT: movb %r10b, 5(%rsi)
4412 ; SCALAR-NEXT: movb %r9b, 4(%rsi)
4413 ; SCALAR-NEXT: movb %r8b, 3(%rsi)
4414 ; SCALAR-NEXT: movb %cl, 2(%rsi)
4415 ; SCALAR-NEXT: movb %dil, 1(%rsi)
4416 ; SCALAR-NEXT: movb %al, (%rsi)
4417 ; SCALAR-NEXT: movb %bl, 7(%rdx)
4418 ; SCALAR-NEXT: movb %r11b, 6(%rdx)
4419 ; SCALAR-NEXT: movb %r10b, 5(%rdx)
4420 ; SCALAR-NEXT: movb %r9b, 4(%rdx)
4421 ; SCALAR-NEXT: movb %r8b, 3(%rdx)
4422 ; SCALAR-NEXT: movb %cl, 2(%rdx)
4423 ; SCALAR-NEXT: movb %dil, 1(%rdx)
4424 ; SCALAR-NEXT: movb %al, (%rdx)
4425 ; SCALAR-NEXT: movb %bl, 15(%rdx)
4426 ; SCALAR-NEXT: movb %r11b, 14(%rdx)
4427 ; SCALAR-NEXT: movb %r10b, 13(%rdx)
4428 ; SCALAR-NEXT: movb %r9b, 12(%rdx)
4429 ; SCALAR-NEXT: movb %r8b, 11(%rdx)
4430 ; SCALAR-NEXT: movb %cl, 10(%rdx)
4431 ; SCALAR-NEXT: movb %dil, 9(%rdx)
4432 ; SCALAR-NEXT: movb %al, 8(%rdx)
4433 ; SCALAR-NEXT: movb %bl, 23(%rdx)
4434 ; SCALAR-NEXT: movb %r11b, 22(%rdx)
4435 ; SCALAR-NEXT: movb %r10b, 21(%rdx)
4436 ; SCALAR-NEXT: movb %r9b, 20(%rdx)
4437 ; SCALAR-NEXT: movb %r8b, 19(%rdx)
4438 ; SCALAR-NEXT: movb %cl, 18(%rdx)
4439 ; SCALAR-NEXT: movb %dil, 17(%rdx)
4440 ; SCALAR-NEXT: movb %al, 16(%rdx)
4441 ; SCALAR-NEXT: movb %bl, 31(%rdx)
4442 ; SCALAR-NEXT: movb %r11b, 30(%rdx)
4443 ; SCALAR-NEXT: movb %r10b, 29(%rdx)
4444 ; SCALAR-NEXT: movb %r9b, 28(%rdx)
4445 ; SCALAR-NEXT: movb %r8b, 27(%rdx)
4446 ; SCALAR-NEXT: movb %cl, 26(%rdx)
4447 ; SCALAR-NEXT: movb %dil, 25(%rdx)
4448 ; SCALAR-NEXT: movb %al, 24(%rdx)
4449 ; SCALAR-NEXT: movb %bl, 39(%rdx)
4450 ; SCALAR-NEXT: movb %r11b, 38(%rdx)
4451 ; SCALAR-NEXT: movb %r10b, 37(%rdx)
4452 ; SCALAR-NEXT: movb %r9b, 36(%rdx)
4453 ; SCALAR-NEXT: movb %r8b, 35(%rdx)
4454 ; SCALAR-NEXT: movb %cl, 34(%rdx)
4455 ; SCALAR-NEXT: movb %dil, 33(%rdx)
4456 ; SCALAR-NEXT: movb %al, 32(%rdx)
4457 ; SCALAR-NEXT: movb %bl, 47(%rdx)
4458 ; SCALAR-NEXT: movb %r11b, 46(%rdx)
4459 ; SCALAR-NEXT: movb %r10b, 45(%rdx)
4460 ; SCALAR-NEXT: movb %r9b, 44(%rdx)
4461 ; SCALAR-NEXT: movb %r8b, 43(%rdx)
4462 ; SCALAR-NEXT: movb %cl, 42(%rdx)
4463 ; SCALAR-NEXT: movb %dil, 41(%rdx)
4464 ; SCALAR-NEXT: movb %al, 40(%rdx)
4465 ; SCALAR-NEXT: popq %rbx
4468 ; SSE2-LABEL: vec384_v8i8:
4470 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
4471 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
4472 ; SSE2-NEXT: pxor %xmm0, %xmm1
4473 ; SSE2-NEXT: movq %xmm1, (%rsi)
4474 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
4475 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4476 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
4477 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
4480 ; AVX1-LABEL: vec384_v8i8:
4482 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4483 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
4484 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
4485 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
4486 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4487 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
4488 ; AVX1-NEXT: vmovaps %ymm1, (%rdx)
4489 ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx)
4490 ; AVX1-NEXT: vzeroupper
4493 ; AVX2-ONLY-LABEL: vec384_v8i8:
4494 ; AVX2-ONLY: # %bb.0:
4495 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4496 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
4497 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
4498 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
4499 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
4500 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
4501 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 32(%rdx)
4502 ; AVX2-ONLY-NEXT: vzeroupper
4503 ; AVX2-ONLY-NEXT: retq
4505 ; AVX512-LABEL: vec384_v8i8:
4507 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
4508 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
4509 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
4510 ; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
4511 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
4512 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
4513 ; AVX512-NEXT: vzeroupper
4514 ; AVX512-NEXT: retq
4515 %in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
4516 %in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4517 store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4518 %out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
4519 store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4520 %out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
4521 store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
4522 %out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
4523 store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
4524 %out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
4525 store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
4526 %out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4
4527 store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32
4528 %out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5
4529 store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8
4530 ret void
4531 }
4533 define void @vec384_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4534 ; SCALAR-LABEL: vec384_v8i16:
4536 ; SCALAR-NEXT: pushq %rbx
4537 ; SCALAR-NEXT: movzwl 14(%rdi), %ebx
4538 ; SCALAR-NEXT: movl 12(%rdi), %r11d
4539 ; SCALAR-NEXT: movzwl 10(%rdi), %r10d
4540 ; SCALAR-NEXT: movl 8(%rdi), %r9d
4541 ; SCALAR-NEXT: movzwl 6(%rdi), %r8d
4542 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
4543 ; SCALAR-NEXT: movl (%rdi), %eax
4544 ; SCALAR-NEXT: movl 4(%rdi), %edi
4545 ; SCALAR-NEXT: notl %eax
4546 ; SCALAR-NEXT: notl %ecx
4547 ; SCALAR-NEXT: notl %edi
4548 ; SCALAR-NEXT: notl %r8d
4549 ; SCALAR-NEXT: notl %r9d
4550 ; SCALAR-NEXT: notl %r10d
4551 ; SCALAR-NEXT: notl %r11d
4552 ; SCALAR-NEXT: notl %ebx
4553 ; SCALAR-NEXT: movw %bx, 14(%rsi)
4554 ; SCALAR-NEXT: movw %r11w, 12(%rsi)
4555 ; SCALAR-NEXT: movw %r10w, 10(%rsi)
4556 ; SCALAR-NEXT: movw %r9w, 8(%rsi)
4557 ; SCALAR-NEXT: movw %r8w, 6(%rsi)
4558 ; SCALAR-NEXT: movw %di, 4(%rsi)
4559 ; SCALAR-NEXT: movw %cx, 2(%rsi)
4560 ; SCALAR-NEXT: movw %ax, (%rsi)
4561 ; SCALAR-NEXT: movw %bx, 14(%rdx)
4562 ; SCALAR-NEXT: movw %r11w, 12(%rdx)
4563 ; SCALAR-NEXT: movw %r10w, 10(%rdx)
4564 ; SCALAR-NEXT: movw %r9w, 8(%rdx)
4565 ; SCALAR-NEXT: movw %r8w, 6(%rdx)
4566 ; SCALAR-NEXT: movw %di, 4(%rdx)
4567 ; SCALAR-NEXT: movw %cx, 2(%rdx)
4568 ; SCALAR-NEXT: movw %ax, (%rdx)
4569 ; SCALAR-NEXT: movw %bx, 30(%rdx)
4570 ; SCALAR-NEXT: movw %r11w, 28(%rdx)
4571 ; SCALAR-NEXT: movw %r10w, 26(%rdx)
4572 ; SCALAR-NEXT: movw %r9w, 24(%rdx)
4573 ; SCALAR-NEXT: movw %r8w, 22(%rdx)
4574 ; SCALAR-NEXT: movw %di, 20(%rdx)
4575 ; SCALAR-NEXT: movw %cx, 18(%rdx)
4576 ; SCALAR-NEXT: movw %ax, 16(%rdx)
4577 ; SCALAR-NEXT: movw %bx, 46(%rdx)
4578 ; SCALAR-NEXT: movw %r11w, 44(%rdx)
4579 ; SCALAR-NEXT: movw %r10w, 42(%rdx)
4580 ; SCALAR-NEXT: movw %r9w, 40(%rdx)
4581 ; SCALAR-NEXT: movw %r8w, 38(%rdx)
4582 ; SCALAR-NEXT: movw %di, 36(%rdx)
4583 ; SCALAR-NEXT: movw %cx, 34(%rdx)
4584 ; SCALAR-NEXT: movw %ax, 32(%rdx)
4585 ; SCALAR-NEXT: popq %rbx
4588 ; SSE2-LABEL: vec384_v8i16:
4590 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4591 ; SSE2-NEXT: pxor (%rdi), %xmm0
4592 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4593 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4594 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
4595 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
4598 ; AVX-LABEL: vec384_v8i16:
4600 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
4601 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
4602 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
4603 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
4604 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
4605 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
4606 ; AVX-NEXT: retq
4607 %in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
4608 %in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4609 store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4610 %out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
4611 store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4612 %out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
4613 store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
4614 %out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2
4615 store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32
4616 ret void
4617 }
4619 define void @vec384_v12i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4620 ; SCALAR-LABEL: vec384_v12i8:
4622 ; SCALAR-NEXT: pushq %rbp
4623 ; SCALAR-NEXT: pushq %r15
4624 ; SCALAR-NEXT: pushq %r14
4625 ; SCALAR-NEXT: pushq %r12
4626 ; SCALAR-NEXT: pushq %rbx
4627 ; SCALAR-NEXT: movq (%rdi), %r9
4628 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4629 ; SCALAR-NEXT: movl %ecx, %eax
4630 ; SCALAR-NEXT: shrl $8, %eax
4631 ; SCALAR-NEXT: movl %ecx, %edi
4632 ; SCALAR-NEXT: shrl $24, %edi
4633 ; SCALAR-NEXT: movl %ecx, %r8d
4634 ; SCALAR-NEXT: shrl $16, %r8d
4635 ; SCALAR-NEXT: movq %r9, %r10
4636 ; SCALAR-NEXT: shrq $40, %r10
4637 ; SCALAR-NEXT: movq %r9, %r11
4638 ; SCALAR-NEXT: shrq $32, %r11
4639 ; SCALAR-NEXT: movq %r9, %rbx
4640 ; SCALAR-NEXT: shrq $56, %rbx
4641 ; SCALAR-NEXT: movq %r9, %r14
4642 ; SCALAR-NEXT: shrq $48, %r14
4643 ; SCALAR-NEXT: movl %r9d, %ebp
4644 ; SCALAR-NEXT: shrl $8, %ebp
4645 ; SCALAR-NEXT: movl %r9d, %r15d
4646 ; SCALAR-NEXT: shrl $24, %r15d
4647 ; SCALAR-NEXT: movl %r9d, %r12d
4648 ; SCALAR-NEXT: shrl $16, %r12d
4649 ; SCALAR-NEXT: notb %r12b
4650 ; SCALAR-NEXT: movzbl %r12b, %r12d
4651 ; SCALAR-NEXT: notb %r15b
4652 ; SCALAR-NEXT: movzbl %r15b, %r15d
4653 ; SCALAR-NEXT: shll $8, %r15d
4654 ; SCALAR-NEXT: orl %r12d, %r15d
4655 ; SCALAR-NEXT: shll $16, %r15d
4656 ; SCALAR-NEXT: notb %r9b
4657 ; SCALAR-NEXT: movzbl %r9b, %r9d
4658 ; SCALAR-NEXT: notb %bpl
4659 ; SCALAR-NEXT: movzbl %bpl, %ebp
4660 ; SCALAR-NEXT: shll $8, %ebp
4661 ; SCALAR-NEXT: orl %r9d, %ebp
4662 ; SCALAR-NEXT: movzwl %bp, %r9d
4663 ; SCALAR-NEXT: orl %r15d, %r9d
4664 ; SCALAR-NEXT: notb %r14b
4665 ; SCALAR-NEXT: movzbl %r14b, %ebp
4666 ; SCALAR-NEXT: notb %bl
4667 ; SCALAR-NEXT: movzbl %bl, %ebx
4668 ; SCALAR-NEXT: shll $8, %ebx
4669 ; SCALAR-NEXT: orl %ebp, %ebx
4670 ; SCALAR-NEXT: shll $16, %ebx
4671 ; SCALAR-NEXT: notb %r11b
4672 ; SCALAR-NEXT: movzbl %r11b, %r11d
4673 ; SCALAR-NEXT: notb %r10b
4674 ; SCALAR-NEXT: movzbl %r10b, %r10d
4675 ; SCALAR-NEXT: shll $8, %r10d
4676 ; SCALAR-NEXT: orl %r11d, %r10d
4677 ; SCALAR-NEXT: movzwl %r10w, %r10d
4678 ; SCALAR-NEXT: orl %ebx, %r10d
4679 ; SCALAR-NEXT: notb %r8b
4680 ; SCALAR-NEXT: movzbl %r8b, %r8d
4681 ; SCALAR-NEXT: notb %dil
4682 ; SCALAR-NEXT: movzbl %dil, %edi
4683 ; SCALAR-NEXT: shll $8, %edi
4684 ; SCALAR-NEXT: orl %r8d, %edi
4685 ; SCALAR-NEXT: shll $16, %edi
4686 ; SCALAR-NEXT: notb %cl
4687 ; SCALAR-NEXT: movzbl %cl, %ecx
4688 ; SCALAR-NEXT: notb %al
4689 ; SCALAR-NEXT: movzbl %al, %eax
4690 ; SCALAR-NEXT: shll $8, %eax
4691 ; SCALAR-NEXT: orl %ecx, %eax
4692 ; SCALAR-NEXT: movzwl %ax, %eax
4693 ; SCALAR-NEXT: orl %edi, %eax
4694 ; SCALAR-NEXT: movl %eax, 8(%rsi)
4695 ; SCALAR-NEXT: shlq $32, %r10
4696 ; SCALAR-NEXT: orq %r10, %r9
4697 ; SCALAR-NEXT: movq %r9, (%rsi)
4698 ; SCALAR-NEXT: movl %eax, 8(%rdx)
4699 ; SCALAR-NEXT: movq %r9, (%rdx)
4700 ; SCALAR-NEXT: movl %eax, 24(%rdx)
4701 ; SCALAR-NEXT: movq %r9, 16(%rdx)
4702 ; SCALAR-NEXT: movl %eax, 40(%rdx)
4703 ; SCALAR-NEXT: movq %r9, 32(%rdx)
4704 ; SCALAR-NEXT: movl %eax, 56(%rdx)
4705 ; SCALAR-NEXT: movq %r9, 48(%rdx)
4706 ; SCALAR-NEXT: popq %rbx
4707 ; SCALAR-NEXT: popq %r12
4708 ; SCALAR-NEXT: popq %r14
4709 ; SCALAR-NEXT: popq %r15
4710 ; SCALAR-NEXT: popq %rbp
4713 ; SSE2-ONLY-LABEL: vec384_v12i8:
4714 ; SSE2-ONLY: # %bb.0:
4715 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4716 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
4717 ; SSE2-ONLY-NEXT: movq %xmm0, (%rsi)
4718 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4719 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rsi)
4720 ; SSE2-ONLY-NEXT: movd %xmm1, 8(%rdx)
4721 ; SSE2-ONLY-NEXT: movq %xmm0, (%rdx)
4722 ; SSE2-ONLY-NEXT: movd %xmm1, 24(%rdx)
4723 ; SSE2-ONLY-NEXT: movq %xmm0, 16(%rdx)
4724 ; SSE2-ONLY-NEXT: movd %xmm1, 40(%rdx)
4725 ; SSE2-ONLY-NEXT: movq %xmm0, 32(%rdx)
4726 ; SSE2-ONLY-NEXT: movd %xmm1, 56(%rdx)
4727 ; SSE2-ONLY-NEXT: movq %xmm0, 48(%rdx)
4728 ; SSE2-ONLY-NEXT: retq
4730 ; SSE3-LABEL: vec384_v12i8:
4732 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
4733 ; SSE3-NEXT: pxor (%rdi), %xmm0
4734 ; SSE3-NEXT: movq %xmm0, (%rsi)
4735 ; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4736 ; SSE3-NEXT: movd %xmm1, 8(%rsi)
4737 ; SSE3-NEXT: movd %xmm1, 8(%rdx)
4738 ; SSE3-NEXT: movq %xmm0, (%rdx)
4739 ; SSE3-NEXT: movd %xmm1, 24(%rdx)
4740 ; SSE3-NEXT: movq %xmm0, 16(%rdx)
4741 ; SSE3-NEXT: movd %xmm1, 40(%rdx)
4742 ; SSE3-NEXT: movq %xmm0, 32(%rdx)
4743 ; SSE3-NEXT: movd %xmm1, 56(%rdx)
4744 ; SSE3-NEXT: movq %xmm0, 48(%rdx)
4747 ; SSSE3-ONLY-LABEL: vec384_v12i8:
4748 ; SSSE3-ONLY: # %bb.0:
4749 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
4750 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
4751 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rsi)
4752 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
4753 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rsi)
4754 ; SSSE3-ONLY-NEXT: movd %xmm1, 8(%rdx)
4755 ; SSSE3-ONLY-NEXT: movq %xmm0, (%rdx)
4756 ; SSSE3-ONLY-NEXT: movd %xmm1, 24(%rdx)
4757 ; SSSE3-ONLY-NEXT: movq %xmm0, 16(%rdx)
4758 ; SSSE3-ONLY-NEXT: movd %xmm1, 40(%rdx)
4759 ; SSSE3-ONLY-NEXT: movq %xmm0, 32(%rdx)
4760 ; SSSE3-ONLY-NEXT: movd %xmm1, 56(%rdx)
4761 ; SSSE3-ONLY-NEXT: movq %xmm0, 48(%rdx)
4762 ; SSSE3-ONLY-NEXT: retq
4764 ; SSE41-LABEL: vec384_v12i8:
4766 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
4767 ; SSE41-NEXT: pxor (%rdi), %xmm0
4768 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rsi)
4769 ; SSE41-NEXT: movq %xmm0, (%rsi)
4770 ; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdx)
4771 ; SSE41-NEXT: movq %xmm0, (%rdx)
4772 ; SSE41-NEXT: pextrd $2, %xmm0, 24(%rdx)
4773 ; SSE41-NEXT: movq %xmm0, 16(%rdx)
4774 ; SSE41-NEXT: pextrd $2, %xmm0, 40(%rdx)
4775 ; SSE41-NEXT: movq %xmm0, 32(%rdx)
4776 ; SSE41-NEXT: pextrd $2, %xmm0, 56(%rdx)
4777 ; SSE41-NEXT: movq %xmm0, 48(%rdx)
4780 ; SSE42-LABEL: vec384_v12i8:
4782 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
4783 ; SSE42-NEXT: pxor (%rdi), %xmm0
4784 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rsi)
4785 ; SSE42-NEXT: movq %xmm0, (%rsi)
4786 ; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdx)
4787 ; SSE42-NEXT: movq %xmm0, (%rdx)
4788 ; SSE42-NEXT: pextrd $2, %xmm0, 24(%rdx)
4789 ; SSE42-NEXT: movq %xmm0, 16(%rdx)
4790 ; SSE42-NEXT: pextrd $2, %xmm0, 40(%rdx)
4791 ; SSE42-NEXT: movq %xmm0, 32(%rdx)
4792 ; SSE42-NEXT: pextrd $2, %xmm0, 56(%rdx)
4793 ; SSE42-NEXT: movq %xmm0, 48(%rdx)
4796 ; AVX-LABEL: vec384_v12i8:
4798 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
4799 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
4800 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rsi)
4801 ; AVX-NEXT: vmovq %xmm0, (%rsi)
4802 ; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdx)
4803 ; AVX-NEXT: vmovq %xmm0, (%rdx)
4804 ; AVX-NEXT: vpextrd $2, %xmm0, 24(%rdx)
4805 ; AVX-NEXT: vmovq %xmm0, 16(%rdx)
4806 ; AVX-NEXT: vpextrd $2, %xmm0, 40(%rdx)
4807 ; AVX-NEXT: vmovq %xmm0, 32(%rdx)
4808 ; AVX-NEXT: vpextrd $2, %xmm0, 56(%rdx)
4809 ; AVX-NEXT: vmovq %xmm0, 48(%rdx)
4810 ; AVX-NEXT: retq
4811 %in.subvec.not = load <12 x i8>, ptr %in.subvec.ptr, align 64
4812 %in.subvec = xor <12 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
4813 store <12 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
4814 %out.subvec0.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 0
4815 store <12 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
4816 %out.subvec1.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 1
4817 store <12 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
4818 %out.subvec2.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 2
4819 store <12 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
4820 %out.subvec3.ptr = getelementptr <12 x i8>, ptr %out.vec.ptr, i64 3
4821 store <12 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
4822 ret void
4823 }
4825 define void @vec384_v12i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4826 ; SCALAR-LABEL: vec384_v12i16:
4828 ; SCALAR-NEXT: pushq %r14
4829 ; SCALAR-NEXT: pushq %rbx
4830 ; SCALAR-NEXT: movq (%rdi), %rax
4831 ; SCALAR-NEXT: movq 8(%rdi), %rcx
4832 ; SCALAR-NEXT: movq %rax, %r8
4833 ; SCALAR-NEXT: shrq $32, %r8
4834 ; SCALAR-NEXT: movq %rax, %r9
4835 ; SCALAR-NEXT: shrq $48, %r9
4836 ; SCALAR-NEXT: movq %rcx, %r10
4837 ; SCALAR-NEXT: shrq $32, %r10
4838 ; SCALAR-NEXT: movq %rcx, %r11
4839 ; SCALAR-NEXT: shrq $48, %r11
4840 ; SCALAR-NEXT: movq 16(%rdi), %rdi
4841 ; SCALAR-NEXT: movq %rdi, %rbx
4842 ; SCALAR-NEXT: shrq $32, %rbx
4843 ; SCALAR-NEXT: movq %rdi, %r14
4844 ; SCALAR-NEXT: shrq $48, %r14
4845 ; SCALAR-NEXT: notl %r14d
4846 ; SCALAR-NEXT: shll $16, %r14d
4847 ; SCALAR-NEXT: notl %ebx
4848 ; SCALAR-NEXT: movzwl %bx, %ebx
4849 ; SCALAR-NEXT: orl %r14d, %ebx
4850 ; SCALAR-NEXT: shlq $32, %rbx
4851 ; SCALAR-NEXT: notl %edi
4852 ; SCALAR-NEXT: orq %rbx, %rdi
4853 ; SCALAR-NEXT: notl %r11d
4854 ; SCALAR-NEXT: shll $16, %r11d
4855 ; SCALAR-NEXT: notl %r10d
4856 ; SCALAR-NEXT: movzwl %r10w, %r10d
4857 ; SCALAR-NEXT: orl %r11d, %r10d
4858 ; SCALAR-NEXT: shlq $32, %r10
4859 ; SCALAR-NEXT: notl %ecx
4860 ; SCALAR-NEXT: orq %r10, %rcx
4861 ; SCALAR-NEXT: notl %r9d
4862 ; SCALAR-NEXT: shll $16, %r9d
4863 ; SCALAR-NEXT: notl %r8d
4864 ; SCALAR-NEXT: movzwl %r8w, %r8d
4865 ; SCALAR-NEXT: orl %r9d, %r8d
4866 ; SCALAR-NEXT: shlq $32, %r8
4867 ; SCALAR-NEXT: notl %eax
4868 ; SCALAR-NEXT: orq %r8, %rax
4869 ; SCALAR-NEXT: movq %rax, (%rsi)
4870 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
4871 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
4872 ; SCALAR-NEXT: movq %rax, (%rdx)
4873 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
4874 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
4875 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
4876 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
4877 ; SCALAR-NEXT: movq %rax, 32(%rdx)
4878 ; SCALAR-NEXT: popq %rbx
4879 ; SCALAR-NEXT: popq %r14
4882 ; SSE2-LABEL: vec384_v12i16:
4884 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
4885 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
4886 ; SSE2-NEXT: pxor %xmm0, %xmm1
4887 ; SSE2-NEXT: pxor (%rdi), %xmm0
4888 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
4889 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
4890 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
4891 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
4892 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
4893 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
4896 ; AVX1-LABEL: vec384_v12i16:
4898 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
4899 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
4900 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
4901 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
4902 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
4903 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
4904 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
4905 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
4906 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
4907 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
4908 ; AVX1-NEXT: vzeroupper
4911 ; AVX2-LABEL: vec384_v12i16:
4913 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
4914 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
4915 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
4916 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
4917 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
4918 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
4919 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
4920 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
4921 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
4922 ; AVX2-NEXT: vzeroupper
4923 ; AVX2-NEXT: retq
4924 %in.subvec.not = load <12 x i16>, ptr %in.subvec.ptr, align 64
4925 %in.subvec = xor <12 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
4926 store <12 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
4927 %out.subvec0.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 0
4928 store <12 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
4929 %out.subvec1.ptr = getelementptr <12 x i16>, ptr %out.vec.ptr, i64 1
4930 store <12 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
4931 ret void
4932 }
4934 define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
4935 ; SCALAR-LABEL: vec384_v16i8:
4937 ; SCALAR-NEXT: pushq %rbp
4938 ; SCALAR-NEXT: pushq %r15
4939 ; SCALAR-NEXT: pushq %r14
4940 ; SCALAR-NEXT: pushq %r13
4941 ; SCALAR-NEXT: pushq %r12
4942 ; SCALAR-NEXT: pushq %rbx
4943 ; SCALAR-NEXT: movzbl 15(%rdi), %eax
4944 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4945 ; SCALAR-NEXT: movzbl 14(%rdi), %eax
4946 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4947 ; SCALAR-NEXT: movzbl 13(%rdi), %eax
4948 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4949 ; SCALAR-NEXT: movzbl 12(%rdi), %r11d
4950 ; SCALAR-NEXT: movzbl 11(%rdi), %r13d
4951 ; SCALAR-NEXT: movzbl 10(%rdi), %r12d
4952 ; SCALAR-NEXT: movzbl 9(%rdi), %ebp
4953 ; SCALAR-NEXT: movzbl 8(%rdi), %r14d
4954 ; SCALAR-NEXT: movzbl 7(%rdi), %ebx
4955 ; SCALAR-NEXT: movzbl 6(%rdi), %r10d
4956 ; SCALAR-NEXT: movzbl 5(%rdi), %r15d
4957 ; SCALAR-NEXT: movzbl 4(%rdi), %r9d
4958 ; SCALAR-NEXT: movzbl 3(%rdi), %r8d
4959 ; SCALAR-NEXT: movzbl 2(%rdi), %ecx
4960 ; SCALAR-NEXT: movzbl (%rdi), %eax
4961 ; SCALAR-NEXT: movzbl 1(%rdi), %edi
4962 ; SCALAR-NEXT: notb %al
4963 ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4964 ; SCALAR-NEXT: notb %dil
4965 ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4966 ; SCALAR-NEXT: notb %cl
4967 ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4968 ; SCALAR-NEXT: notb %r8b
4969 ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4970 ; SCALAR-NEXT: notb %r9b
4971 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4972 ; SCALAR-NEXT: movl %r15d, %r9d
4973 ; SCALAR-NEXT: notb %r9b
4974 ; SCALAR-NEXT: notb %r10b
4975 ; SCALAR-NEXT: notb %bl
4976 ; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4977 ; SCALAR-NEXT: notb %r14b
4978 ; SCALAR-NEXT: notb %bpl
4979 ; SCALAR-NEXT: movl %ebp, %r15d
4980 ; SCALAR-NEXT: notb %r12b
4981 ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4982 ; SCALAR-NEXT: notb %r13b
4983 ; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4984 ; SCALAR-NEXT: notb %r11b
4985 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
4986 ; SCALAR-NEXT: notb %dil
4987 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
4988 ; SCALAR-NEXT: notb %cl
4989 ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4990 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
4991 ; SCALAR-NEXT: notb %r8b
4992 ; SCALAR-NEXT: movb %r8b, 15(%rsi)
4993 ; SCALAR-NEXT: movb %cl, 14(%rsi)
4994 ; SCALAR-NEXT: movl %edi, %eax
4995 ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
4996 ; SCALAR-NEXT: movb %dil, 13(%rsi)
4997 ; SCALAR-NEXT: movb %r11b, 12(%rsi)
4998 ; SCALAR-NEXT: movl %r11d, %ebp
4999 ; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
5000 ; SCALAR-NEXT: movb %r13b, 11(%rsi)
5001 ; SCALAR-NEXT: movb %r12b, 10(%rsi)
5002 ; SCALAR-NEXT: movb %r15b, 9(%rsi)
5003 ; SCALAR-NEXT: movb %r14b, 8(%rsi)
5004 ; SCALAR-NEXT: movb %bl, 7(%rsi)
5005 ; SCALAR-NEXT: movb %r10b, 6(%rsi)
5006 ; SCALAR-NEXT: movl %r10d, %ebx
5007 ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
5008 ; SCALAR-NEXT: movb %r9b, 5(%rsi)
5009 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
5010 ; SCALAR-NEXT: movb %r11b, 4(%rsi)
5011 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
5012 ; SCALAR-NEXT: movb %r12b, 3(%rsi)
5013 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
5014 ; SCALAR-NEXT: movb %cl, 2(%rsi)
5015 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
5016 ; SCALAR-NEXT: movb %r13b, 1(%rsi)
5017 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
5018 ; SCALAR-NEXT: movb %r10b, (%rsi)
5019 ; SCALAR-NEXT: movb %r8b, 15(%rdx)
5020 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
5021 ; SCALAR-NEXT: movb %dil, 14(%rdx)
5022 ; SCALAR-NEXT: movb %al, 13(%rdx)
5023 ; SCALAR-NEXT: movb %bpl, 12(%rdx)
5024 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
5025 ; SCALAR-NEXT: movb %al, 11(%rdx)
5026 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
5027 ; SCALAR-NEXT: movb %al, 10(%rdx)
5028 ; SCALAR-NEXT: movb %r15b, 9(%rdx)
5029 ; SCALAR-NEXT: movb %r14b, 8(%rdx)
5030 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
5031 ; SCALAR-NEXT: movb %bpl, 7(%rdx)
5032 ; SCALAR-NEXT: movb %bl, 6(%rdx)
5033 ; SCALAR-NEXT: movb %r9b, 5(%rdx)
5034 ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
5035 ; SCALAR-NEXT: movb %r11b, 4(%rdx)
5036 ; SCALAR-NEXT: movb %r12b, 3(%rdx)
5037 ; SCALAR-NEXT: movb %cl, 2(%rdx)
5038 ; SCALAR-NEXT: movl %r13d, %ebx
5039 ; SCALAR-NEXT: movb %r13b, 1(%rdx)
5040 ; SCALAR-NEXT: movl %r10d, %esi
5041 ; SCALAR-NEXT: movb %r10b, (%rdx)
5042 ; SCALAR-NEXT: movb %r8b, 31(%rdx)
5043 ; SCALAR-NEXT: movb %dil, 30(%rdx)
5044 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
5045 ; SCALAR-NEXT: movb %al, 29(%rdx)
5046 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
5047 ; SCALAR-NEXT: movb %r11b, 28(%rdx)
5048 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
5049 ; SCALAR-NEXT: movb %r13b, 27(%rdx)
5050 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
5051 ; SCALAR-NEXT: movb %r12b, 26(%rdx)
5052 ; SCALAR-NEXT: movb %r15b, 25(%rdx)
5053 ; SCALAR-NEXT: movb %r14b, 24(%rdx)
5054 ; SCALAR-NEXT: movb %bpl, 23(%rdx)
5055 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
5056 ; SCALAR-NEXT: movb %r10b, 22(%rdx)
5057 ; SCALAR-NEXT: movb %r9b, 21(%rdx)
5058 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
5059 ; SCALAR-NEXT: movb %r9b, 20(%rdx)
5060 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
5061 ; SCALAR-NEXT: movb %dil, 19(%rdx)
5062 ; SCALAR-NEXT: movb %cl, 18(%rdx)
5063 ; SCALAR-NEXT: movb %bl, 17(%rdx)
5064 ; SCALAR-NEXT: movb %sil, 16(%rdx)
5065 ; SCALAR-NEXT: movb %r8b, 47(%rdx)
5066 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
5067 ; SCALAR-NEXT: movb %r8b, 46(%rdx)
5068 ; SCALAR-NEXT: movb %al, 45(%rdx)
5069 ; SCALAR-NEXT: movb %r11b, 44(%rdx)
5070 ; SCALAR-NEXT: movb %r13b, 43(%rdx)
5071 ; SCALAR-NEXT: movb %r12b, 42(%rdx)
5072 ; SCALAR-NEXT: movb %r15b, 41(%rdx)
5073 ; SCALAR-NEXT: movb %r14b, 40(%rdx)
5074 ; SCALAR-NEXT: movb %bpl, 39(%rdx)
5075 ; SCALAR-NEXT: movb %r10b, 38(%rdx)
5076 ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
5077 ; SCALAR-NEXT: movb %al, 37(%rdx)
5078 ; SCALAR-NEXT: movb %r9b, 36(%rdx)
5079 ; SCALAR-NEXT: movb %dil, 35(%rdx)
5080 ; SCALAR-NEXT: movb %cl, 34(%rdx)
5081 ; SCALAR-NEXT: movb %bl, 33(%rdx)
5082 ; SCALAR-NEXT: movb %sil, 32(%rdx)
5083 ; SCALAR-NEXT: popq %rbx
5084 ; SCALAR-NEXT: popq %r12
5085 ; SCALAR-NEXT: popq %r13
5086 ; SCALAR-NEXT: popq %r14
5087 ; SCALAR-NEXT: popq %r15
5088 ; SCALAR-NEXT: popq %rbp
5091 ; SSE2-LABEL: vec384_v16i8:
5093 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
5094 ; SSE2-NEXT: pxor (%rdi), %xmm0
5095 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
5096 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5097 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5098 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5101 ; AVX-LABEL: vec384_v16i8:
5103 ; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5104 ; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
5105 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
5106 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
5107 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
5108 ; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
5109 ; AVX-NEXT: retq
5110 %in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
5111 %in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
5112 store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
5113 %out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
5114 store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
5115 %out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
5116 store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
5117 %out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2
5118 store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32
5119 ret void
5120 }
5122 define void @vec384_v24i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5123 ; SCALAR-LABEL: vec384_v24i8:
5125 ; SCALAR-NEXT: movq (%rdi), %rax
5126 ; SCALAR-NEXT: movq 8(%rdi), %rcx
5127 ; SCALAR-NEXT: movq 16(%rdi), %rdi
5128 ; SCALAR-NEXT: movq %rdi, %r8
5129 ; SCALAR-NEXT: shrq $40, %r8
5130 ; SCALAR-NEXT: movq %rdi, %r9
5131 ; SCALAR-NEXT: shrq $56, %r9
5132 ; SCALAR-NEXT: movq %rdi, %r10
5133 ; SCALAR-NEXT: shrq $48, %r10
5134 ; SCALAR-NEXT: notb %r10b
5135 ; SCALAR-NEXT: movzbl %r10b, %r10d
5136 ; SCALAR-NEXT: notb %r9b
5137 ; SCALAR-NEXT: movzbl %r9b, %r9d
5138 ; SCALAR-NEXT: shll $8, %r9d
5139 ; SCALAR-NEXT: orl %r10d, %r9d
5140 ; SCALAR-NEXT: movq %rdi, %r10
5141 ; SCALAR-NEXT: shrq $32, %r10
5142 ; SCALAR-NEXT: notb %r10b
5143 ; SCALAR-NEXT: movzbl %r10b, %r10d
5144 ; SCALAR-NEXT: notb %r8b
5145 ; SCALAR-NEXT: movzbl %r8b, %r8d
5146 ; SCALAR-NEXT: shll $8, %r8d
5147 ; SCALAR-NEXT: orl %r10d, %r8d
5148 ; SCALAR-NEXT: movl %edi, %r10d
5149 ; SCALAR-NEXT: shrl $24, %r10d
5150 ; SCALAR-NEXT: shll $16, %r9d
5151 ; SCALAR-NEXT: movzwl %r8w, %r8d
5152 ; SCALAR-NEXT: orl %r9d, %r8d
5153 ; SCALAR-NEXT: movl %edi, %r9d
5154 ; SCALAR-NEXT: shrl $16, %r9d
5155 ; SCALAR-NEXT: notb %r9b
5156 ; SCALAR-NEXT: movzbl %r9b, %r9d
5157 ; SCALAR-NEXT: notb %r10b
5158 ; SCALAR-NEXT: movzbl %r10b, %r10d
5159 ; SCALAR-NEXT: shll $8, %r10d
5160 ; SCALAR-NEXT: orl %r9d, %r10d
5161 ; SCALAR-NEXT: movl %edi, %r9d
5162 ; SCALAR-NEXT: shrl $8, %r9d
5163 ; SCALAR-NEXT: notb %dil
5164 ; SCALAR-NEXT: movzbl %dil, %edi
5165 ; SCALAR-NEXT: notb %r9b
5166 ; SCALAR-NEXT: movzbl %r9b, %r11d
5167 ; SCALAR-NEXT: shll $8, %r11d
5168 ; SCALAR-NEXT: orl %edi, %r11d
5169 ; SCALAR-NEXT: movq %rcx, %r9
5170 ; SCALAR-NEXT: shrq $40, %r9
5171 ; SCALAR-NEXT: shll $16, %r10d
5172 ; SCALAR-NEXT: movzwl %r11w, %edi
5173 ; SCALAR-NEXT: orl %r10d, %edi
5174 ; SCALAR-NEXT: movq %rcx, %r10
5175 ; SCALAR-NEXT: shrq $56, %r10
5176 ; SCALAR-NEXT: shlq $32, %r8
5177 ; SCALAR-NEXT: orq %r8, %rdi
5178 ; SCALAR-NEXT: movq %rcx, %r8
5179 ; SCALAR-NEXT: shrq $48, %r8
5180 ; SCALAR-NEXT: notb %r8b
5181 ; SCALAR-NEXT: movzbl %r8b, %r8d
5182 ; SCALAR-NEXT: notb %r10b
5183 ; SCALAR-NEXT: movzbl %r10b, %r10d
5184 ; SCALAR-NEXT: shll $8, %r10d
5185 ; SCALAR-NEXT: orl %r8d, %r10d
5186 ; SCALAR-NEXT: movq %rcx, %r8
5187 ; SCALAR-NEXT: shrq $32, %r8
5188 ; SCALAR-NEXT: notb %r8b
5189 ; SCALAR-NEXT: movzbl %r8b, %r8d
5190 ; SCALAR-NEXT: notb %r9b
5191 ; SCALAR-NEXT: movzbl %r9b, %r9d
5192 ; SCALAR-NEXT: shll $8, %r9d
5193 ; SCALAR-NEXT: orl %r8d, %r9d
5194 ; SCALAR-NEXT: movl %ecx, %r11d
5195 ; SCALAR-NEXT: shrl $24, %r11d
5196 ; SCALAR-NEXT: shll $16, %r10d
5197 ; SCALAR-NEXT: movzwl %r9w, %r8d
5198 ; SCALAR-NEXT: orl %r10d, %r8d
5199 ; SCALAR-NEXT: movl %ecx, %r9d
5200 ; SCALAR-NEXT: shrl $16, %r9d
5201 ; SCALAR-NEXT: notb %r9b
5202 ; SCALAR-NEXT: movzbl %r9b, %r9d
5203 ; SCALAR-NEXT: notb %r11b
5204 ; SCALAR-NEXT: movzbl %r11b, %r10d
5205 ; SCALAR-NEXT: shll $8, %r10d
5206 ; SCALAR-NEXT: orl %r9d, %r10d
5207 ; SCALAR-NEXT: movl %ecx, %r9d
5208 ; SCALAR-NEXT: shrl $8, %r9d
5209 ; SCALAR-NEXT: notb %cl
5210 ; SCALAR-NEXT: movzbl %cl, %ecx
5211 ; SCALAR-NEXT: notb %r9b
5212 ; SCALAR-NEXT: movzbl %r9b, %r11d
5213 ; SCALAR-NEXT: shll $8, %r11d
5214 ; SCALAR-NEXT: orl %ecx, %r11d
5215 ; SCALAR-NEXT: movq %rax, %r9
5216 ; SCALAR-NEXT: shrq $40, %r9
5217 ; SCALAR-NEXT: shll $16, %r10d
5218 ; SCALAR-NEXT: movzwl %r11w, %ecx
5219 ; SCALAR-NEXT: orl %r10d, %ecx
5220 ; SCALAR-NEXT: movq %rax, %r10
5221 ; SCALAR-NEXT: shrq $56, %r10
5222 ; SCALAR-NEXT: shlq $32, %r8
5223 ; SCALAR-NEXT: orq %r8, %rcx
5224 ; SCALAR-NEXT: movq %rax, %r8
5225 ; SCALAR-NEXT: shrq $48, %r8
5226 ; SCALAR-NEXT: notb %r8b
5227 ; SCALAR-NEXT: movzbl %r8b, %r8d
5228 ; SCALAR-NEXT: notb %r10b
5229 ; SCALAR-NEXT: movzbl %r10b, %r10d
5230 ; SCALAR-NEXT: shll $8, %r10d
5231 ; SCALAR-NEXT: orl %r8d, %r10d
5232 ; SCALAR-NEXT: movq %rax, %r8
5233 ; SCALAR-NEXT: shrq $32, %r8
5234 ; SCALAR-NEXT: notb %r8b
5235 ; SCALAR-NEXT: movzbl %r8b, %r8d
5236 ; SCALAR-NEXT: notb %r9b
5237 ; SCALAR-NEXT: movzbl %r9b, %r9d
5238 ; SCALAR-NEXT: shll $8, %r9d
5239 ; SCALAR-NEXT: orl %r8d, %r9d
5240 ; SCALAR-NEXT: movl %eax, %r11d
5241 ; SCALAR-NEXT: shrl $24, %r11d
5242 ; SCALAR-NEXT: shll $16, %r10d
5243 ; SCALAR-NEXT: movzwl %r9w, %r8d
5244 ; SCALAR-NEXT: orl %r10d, %r8d
5245 ; SCALAR-NEXT: movl %eax, %r9d
5246 ; SCALAR-NEXT: shrl $16, %r9d
5247 ; SCALAR-NEXT: notb %r9b
5248 ; SCALAR-NEXT: movzbl %r9b, %r9d
5249 ; SCALAR-NEXT: notb %r11b
5250 ; SCALAR-NEXT: movzbl %r11b, %r10d
5251 ; SCALAR-NEXT: shll $8, %r10d
5252 ; SCALAR-NEXT: orl %r9d, %r10d
5253 ; SCALAR-NEXT: movl %eax, %r9d
5254 ; SCALAR-NEXT: shrl $8, %r9d
5255 ; SCALAR-NEXT: notb %al
5256 ; SCALAR-NEXT: movzbl %al, %eax
5257 ; SCALAR-NEXT: notb %r9b
5258 ; SCALAR-NEXT: movzbl %r9b, %r9d
5259 ; SCALAR-NEXT: shll $8, %r9d
5260 ; SCALAR-NEXT: orl %eax, %r9d
5261 ; SCALAR-NEXT: shll $16, %r10d
5262 ; SCALAR-NEXT: movzwl %r9w, %eax
5263 ; SCALAR-NEXT: orl %r10d, %eax
5264 ; SCALAR-NEXT: shlq $32, %r8
5265 ; SCALAR-NEXT: orq %r8, %rax
5266 ; SCALAR-NEXT: movq %rax, (%rsi)
5267 ; SCALAR-NEXT: movq %rcx, 8(%rsi)
5268 ; SCALAR-NEXT: movq %rdi, 16(%rsi)
5269 ; SCALAR-NEXT: movq %rax, (%rdx)
5270 ; SCALAR-NEXT: movq %rcx, 8(%rdx)
5271 ; SCALAR-NEXT: movq %rdi, 16(%rdx)
5272 ; SCALAR-NEXT: movq %rdi, 48(%rdx)
5273 ; SCALAR-NEXT: movq %rcx, 40(%rdx)
5274 ; SCALAR-NEXT: movq %rax, 32(%rdx)
5277 ; SSE2-LABEL: vec384_v24i8:
5279 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
5280 ; SSE2-NEXT: movdqa 16(%rdi), %xmm1
5281 ; SSE2-NEXT: pxor %xmm0, %xmm1
5282 ; SSE2-NEXT: pxor (%rdi), %xmm0
5283 ; SSE2-NEXT: movdqa %xmm0, (%rsi)
5284 ; SSE2-NEXT: movq %xmm1, 16(%rsi)
5285 ; SSE2-NEXT: movq %xmm1, 16(%rdx)
5286 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5287 ; SSE2-NEXT: movq %xmm1, 48(%rdx)
5288 ; SSE2-NEXT: movdqu %xmm0, 32(%rdx)
5291 ; AVX1-LABEL: vec384_v24i8:
5293 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
5294 ; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
5295 ; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
5296 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
5297 ; AVX1-NEXT: vmovlps %xmm1, 16(%rsi)
5298 ; AVX1-NEXT: vmovaps %xmm0, (%rsi)
5299 ; AVX1-NEXT: vmovlps %xmm1, 16(%rdx)
5300 ; AVX1-NEXT: vmovaps %xmm0, (%rdx)
5301 ; AVX1-NEXT: vmovlps %xmm1, 48(%rdx)
5302 ; AVX1-NEXT: vmovups %xmm0, 32(%rdx)
5303 ; AVX1-NEXT: vzeroupper
5306 ; AVX2-LABEL: vec384_v24i8:
5308 ; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
5309 ; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
5310 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5311 ; AVX2-NEXT: vmovq %xmm1, 16(%rsi)
5312 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
5313 ; AVX2-NEXT: vmovq %xmm1, 16(%rdx)
5314 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
5315 ; AVX2-NEXT: vmovq %xmm1, 48(%rdx)
5316 ; AVX2-NEXT: vmovdqu %xmm0, 32(%rdx)
5317 ; AVX2-NEXT: vzeroupper
5318 ; AVX2-NEXT: retq
5319 %in.subvec.not = load <24 x i8>, ptr %in.subvec.ptr, align 64
5320 %in.subvec = xor <24 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
5321 store <24 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
5322 %out.subvec0.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 0
5323 store <24 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
5324 %out.subvec1.ptr = getelementptr <24 x i8>, ptr %out.vec.ptr, i64 1
5325 store <24 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
5326 ret void
5327 }
5329 define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5330 ; SCALAR-LABEL: vec512_v2i8:
5332 ; SCALAR-NEXT: movzbl (%rdi), %eax
5333 ; SCALAR-NEXT: movzbl 1(%rdi), %ecx
5334 ; SCALAR-NEXT: notb %al
5335 ; SCALAR-NEXT: notb %cl
5336 ; SCALAR-NEXT: movb %cl, 1(%rsi)
5337 ; SCALAR-NEXT: movb %al, (%rsi)
5338 ; SCALAR-NEXT: movb %cl, 1(%rdx)
5339 ; SCALAR-NEXT: movb %al, (%rdx)
5340 ; SCALAR-NEXT: movb %cl, 3(%rdx)
5341 ; SCALAR-NEXT: movb %al, 2(%rdx)
5342 ; SCALAR-NEXT: movb %cl, 5(%rdx)
5343 ; SCALAR-NEXT: movb %al, 4(%rdx)
5344 ; SCALAR-NEXT: movb %cl, 7(%rdx)
5345 ; SCALAR-NEXT: movb %al, 6(%rdx)
5346 ; SCALAR-NEXT: movb %cl, 9(%rdx)
5347 ; SCALAR-NEXT: movb %al, 8(%rdx)
5348 ; SCALAR-NEXT: movb %cl, 11(%rdx)
5349 ; SCALAR-NEXT: movb %al, 10(%rdx)
5350 ; SCALAR-NEXT: movb %cl, 13(%rdx)
5351 ; SCALAR-NEXT: movb %al, 12(%rdx)
5352 ; SCALAR-NEXT: movb %cl, 15(%rdx)
5353 ; SCALAR-NEXT: movb %al, 14(%rdx)
5354 ; SCALAR-NEXT: movb %cl, 17(%rdx)
5355 ; SCALAR-NEXT: movb %al, 16(%rdx)
5356 ; SCALAR-NEXT: movb %cl, 19(%rdx)
5357 ; SCALAR-NEXT: movb %al, 18(%rdx)
5358 ; SCALAR-NEXT: movb %cl, 21(%rdx)
5359 ; SCALAR-NEXT: movb %al, 20(%rdx)
5360 ; SCALAR-NEXT: movb %cl, 23(%rdx)
5361 ; SCALAR-NEXT: movb %al, 22(%rdx)
5362 ; SCALAR-NEXT: movb %cl, 25(%rdx)
5363 ; SCALAR-NEXT: movb %al, 24(%rdx)
5364 ; SCALAR-NEXT: movb %cl, 27(%rdx)
5365 ; SCALAR-NEXT: movb %al, 26(%rdx)
5366 ; SCALAR-NEXT: movb %cl, 29(%rdx)
5367 ; SCALAR-NEXT: movb %al, 28(%rdx)
5368 ; SCALAR-NEXT: movb %cl, 31(%rdx)
5369 ; SCALAR-NEXT: movb %al, 30(%rdx)
5370 ; SCALAR-NEXT: movb %cl, 33(%rdx)
5371 ; SCALAR-NEXT: movb %al, 32(%rdx)
5372 ; SCALAR-NEXT: movb %cl, 35(%rdx)
5373 ; SCALAR-NEXT: movb %al, 34(%rdx)
5374 ; SCALAR-NEXT: movb %cl, 37(%rdx)
5375 ; SCALAR-NEXT: movb %al, 36(%rdx)
5376 ; SCALAR-NEXT: movb %cl, 39(%rdx)
5377 ; SCALAR-NEXT: movb %al, 38(%rdx)
5378 ; SCALAR-NEXT: movb %cl, 41(%rdx)
5379 ; SCALAR-NEXT: movb %al, 40(%rdx)
5380 ; SCALAR-NEXT: movb %cl, 43(%rdx)
5381 ; SCALAR-NEXT: movb %al, 42(%rdx)
5382 ; SCALAR-NEXT: movb %cl, 45(%rdx)
5383 ; SCALAR-NEXT: movb %al, 44(%rdx)
5384 ; SCALAR-NEXT: movb %cl, 47(%rdx)
5385 ; SCALAR-NEXT: movb %al, 46(%rdx)
5386 ; SCALAR-NEXT: movb %cl, 49(%rdx)
5387 ; SCALAR-NEXT: movb %al, 48(%rdx)
5388 ; SCALAR-NEXT: movb %cl, 51(%rdx)
5389 ; SCALAR-NEXT: movb %al, 50(%rdx)
5390 ; SCALAR-NEXT: movb %cl, 53(%rdx)
5391 ; SCALAR-NEXT: movb %al, 52(%rdx)
5392 ; SCALAR-NEXT: movb %cl, 55(%rdx)
5393 ; SCALAR-NEXT: movb %al, 54(%rdx)
5394 ; SCALAR-NEXT: movb %cl, 57(%rdx)
5395 ; SCALAR-NEXT: movb %al, 56(%rdx)
5396 ; SCALAR-NEXT: movb %cl, 59(%rdx)
5397 ; SCALAR-NEXT: movb %al, 58(%rdx)
5398 ; SCALAR-NEXT: movb %cl, 61(%rdx)
5399 ; SCALAR-NEXT: movb %al, 60(%rdx)
5400 ; SCALAR-NEXT: movb %cl, 63(%rdx)
5401 ; SCALAR-NEXT: movb %al, 62(%rdx)
5404 ; SSE2-ONLY-LABEL: vec512_v2i8:
5405 ; SSE2-ONLY: # %bb.0:
5406 ; SSE2-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
5407 ; SSE2-ONLY-NEXT: pxor (%rdi), %xmm0
5408 ; SSE2-ONLY-NEXT: movd %xmm0, %eax
5409 ; SSE2-ONLY-NEXT: movw %ax, (%rsi)
5410 ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5411 ; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5412 ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx)
5413 ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
5414 ; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
5415 ; SSE2-ONLY-NEXT: movdqa %xmm0, 48(%rdx)
5416 ; SSE2-ONLY-NEXT: retq
5418 ; SSE3-LABEL: vec512_v2i8:
5420 ; SSE3-NEXT: pcmpeqd %xmm0, %xmm0
5421 ; SSE3-NEXT: pxor (%rdi), %xmm0
5422 ; SSE3-NEXT: movd %xmm0, %eax
5423 ; SSE3-NEXT: movw %ax, (%rsi)
5424 ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5425 ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5426 ; SSE3-NEXT: movdqa %xmm0, (%rdx)
5427 ; SSE3-NEXT: movdqa %xmm0, 16(%rdx)
5428 ; SSE3-NEXT: movdqa %xmm0, 32(%rdx)
5429 ; SSE3-NEXT: movdqa %xmm0, 48(%rdx)
5432 ; SSSE3-ONLY-LABEL: vec512_v2i8:
5433 ; SSSE3-ONLY: # %bb.0:
5434 ; SSSE3-ONLY-NEXT: pcmpeqd %xmm0, %xmm0
5435 ; SSSE3-ONLY-NEXT: pxor (%rdi), %xmm0
5436 ; SSSE3-ONLY-NEXT: movd %xmm0, %eax
5437 ; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
5438 ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5439 ; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5440 ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx)
5441 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx)
5442 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx)
5443 ; SSSE3-ONLY-NEXT: movdqa %xmm0, 48(%rdx)
5444 ; SSSE3-ONLY-NEXT: retq
5446 ; SSE41-LABEL: vec512_v2i8:
5448 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
5449 ; SSE41-NEXT: pxor (%rdi), %xmm0
5450 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi)
5451 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5452 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5453 ; SSE41-NEXT: movdqa %xmm0, (%rdx)
5454 ; SSE41-NEXT: movdqa %xmm0, 16(%rdx)
5455 ; SSE41-NEXT: movdqa %xmm0, 32(%rdx)
5456 ; SSE41-NEXT: movdqa %xmm0, 48(%rdx)
5459 ; SSE42-LABEL: vec512_v2i8:
5461 ; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
5462 ; SSE42-NEXT: pxor (%rdi), %xmm0
5463 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
5464 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5465 ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5466 ; SSE42-NEXT: movdqa %xmm0, (%rdx)
5467 ; SSE42-NEXT: movdqa %xmm0, 16(%rdx)
5468 ; SSE42-NEXT: movdqa %xmm0, 32(%rdx)
5469 ; SSE42-NEXT: movdqa %xmm0, 48(%rdx)
5472 ; AVX1-LABEL: vec512_v2i8:
5474 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5475 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
5476 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi)
5477 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
5478 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5479 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5480 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5481 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5482 ; AVX1-NEXT: vzeroupper
5485 ; AVX2-ONLY-LABEL: vec512_v2i8:
5486 ; AVX2-ONLY: # %bb.0:
5487 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5488 ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0
5489 ; AVX2-ONLY-NEXT: vpextrw $0, %xmm0, (%rsi)
5490 ; AVX2-ONLY-NEXT: vpbroadcastw %xmm0, %ymm0
5491 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5492 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5493 ; AVX2-ONLY-NEXT: vzeroupper
5494 ; AVX2-ONLY-NEXT: retq
5496 ; AVX512F-LABEL: vec512_v2i8:
5498 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5499 ; AVX512F-NEXT: vpxor (%rdi), %xmm0, %xmm0
5500 ; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
5501 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
5502 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
5503 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rdx)
5504 ; AVX512F-NEXT: vzeroupper
5505 ; AVX512F-NEXT: retq
5507 ; AVX512BW-LABEL: vec512_v2i8:
5508 ; AVX512BW: # %bb.0:
5509 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5510 ; AVX512BW-NEXT: vpxor (%rdi), %xmm0, %xmm0
5511 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
5512 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0
5513 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
5514 ; AVX512BW-NEXT: vzeroupper
5515 ; AVX512BW-NEXT: retq
5516 %in.subvec.not = load <2 x i8>, ptr %in.subvec.ptr, align 64
5517 %in.subvec = xor <2 x i8> %in.subvec.not, <i8 -1, i8 -1>
5518 store <2 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
5519 %out.subvec0.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 0
5520 store <2 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
5521 %out.subvec1.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 1
5522 store <2 x i8> %in.subvec, ptr %out.subvec1.ptr, align 2
5523 %out.subvec2.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 2
5524 store <2 x i8> %in.subvec, ptr %out.subvec2.ptr, align 4
5525 %out.subvec3.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 3
5526 store <2 x i8> %in.subvec, ptr %out.subvec3.ptr, align 2
5527 %out.subvec4.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 4
5528 store <2 x i8> %in.subvec, ptr %out.subvec4.ptr, align 8
5529 %out.subvec5.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 5
5530 store <2 x i8> %in.subvec, ptr %out.subvec5.ptr, align 2
5531 %out.subvec6.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 6
5532 store <2 x i8> %in.subvec, ptr %out.subvec6.ptr, align 4
5533 %out.subvec7.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 7
5534 store <2 x i8> %in.subvec, ptr %out.subvec7.ptr, align 2
5535 %out.subvec8.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 8
5536 store <2 x i8> %in.subvec, ptr %out.subvec8.ptr, align 16
5537 %out.subvec9.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 9
5538 store <2 x i8> %in.subvec, ptr %out.subvec9.ptr, align 2
5539 %out.subvec10.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 10
5540 store <2 x i8> %in.subvec, ptr %out.subvec10.ptr, align 4
5541 %out.subvec11.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 11
5542 store <2 x i8> %in.subvec, ptr %out.subvec11.ptr, align 2
5543 %out.subvec12.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 12
5544 store <2 x i8> %in.subvec, ptr %out.subvec12.ptr, align 8
5545 %out.subvec13.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 13
5546 store <2 x i8> %in.subvec, ptr %out.subvec13.ptr, align 2
5547 %out.subvec14.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 14
5548 store <2 x i8> %in.subvec, ptr %out.subvec14.ptr, align 4
5549 %out.subvec15.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 15
5550 store <2 x i8> %in.subvec, ptr %out.subvec15.ptr, align 2
5551 %out.subvec16.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 16
5552 store <2 x i8> %in.subvec, ptr %out.subvec16.ptr, align 32
5553 %out.subvec17.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 17
5554 store <2 x i8> %in.subvec, ptr %out.subvec17.ptr, align 2
5555 %out.subvec18.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 18
5556 store <2 x i8> %in.subvec, ptr %out.subvec18.ptr, align 4
5557 %out.subvec19.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 19
5558 store <2 x i8> %in.subvec, ptr %out.subvec19.ptr, align 2
5559 %out.subvec20.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 20
5560 store <2 x i8> %in.subvec, ptr %out.subvec20.ptr, align 8
5561 %out.subvec21.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 21
5562 store <2 x i8> %in.subvec, ptr %out.subvec21.ptr, align 2
5563 %out.subvec22.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 22
5564 store <2 x i8> %in.subvec, ptr %out.subvec22.ptr, align 4
5565 %out.subvec23.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 23
5566 store <2 x i8> %in.subvec, ptr %out.subvec23.ptr, align 2
5567 %out.subvec24.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 24
5568 store <2 x i8> %in.subvec, ptr %out.subvec24.ptr, align 16
5569 %out.subvec25.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 25
5570 store <2 x i8> %in.subvec, ptr %out.subvec25.ptr, align 2
5571 %out.subvec26.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 26
5572 store <2 x i8> %in.subvec, ptr %out.subvec26.ptr, align 4
5573 %out.subvec27.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 27
5574 store <2 x i8> %in.subvec, ptr %out.subvec27.ptr, align 2
5575 %out.subvec28.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 28
5576 store <2 x i8> %in.subvec, ptr %out.subvec28.ptr, align 8
5577 %out.subvec29.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 29
5578 store <2 x i8> %in.subvec, ptr %out.subvec29.ptr, align 2
5579 %out.subvec30.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 30
5580 store <2 x i8> %in.subvec, ptr %out.subvec30.ptr, align 4
5581 %out.subvec31.ptr = getelementptr <2 x i8>, ptr %out.vec.ptr, i64 31
5582 store <2 x i8> %in.subvec, ptr %out.subvec31.ptr, align 2
5583 ret void
5584 }
5586 define void @vec512_v2i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5587 ; SCALAR-LABEL: vec512_v2i16:
5589 ; SCALAR-NEXT: movzwl 2(%rdi), %ecx
5590 ; SCALAR-NEXT: movl (%rdi), %eax
5591 ; SCALAR-NEXT: notl %eax
5592 ; SCALAR-NEXT: notl %ecx
5593 ; SCALAR-NEXT: movw %cx, 2(%rsi)
5594 ; SCALAR-NEXT: movw %ax, (%rsi)
5595 ; SCALAR-NEXT: movw %cx, 2(%rdx)
5596 ; SCALAR-NEXT: movw %ax, (%rdx)
5597 ; SCALAR-NEXT: movw %cx, 6(%rdx)
5598 ; SCALAR-NEXT: movw %ax, 4(%rdx)
5599 ; SCALAR-NEXT: movw %cx, 10(%rdx)
5600 ; SCALAR-NEXT: movw %ax, 8(%rdx)
5601 ; SCALAR-NEXT: movw %cx, 14(%rdx)
5602 ; SCALAR-NEXT: movw %ax, 12(%rdx)
5603 ; SCALAR-NEXT: movw %cx, 18(%rdx)
5604 ; SCALAR-NEXT: movw %ax, 16(%rdx)
5605 ; SCALAR-NEXT: movw %cx, 22(%rdx)
5606 ; SCALAR-NEXT: movw %ax, 20(%rdx)
5607 ; SCALAR-NEXT: movw %cx, 26(%rdx)
5608 ; SCALAR-NEXT: movw %ax, 24(%rdx)
5609 ; SCALAR-NEXT: movw %cx, 30(%rdx)
5610 ; SCALAR-NEXT: movw %ax, 28(%rdx)
5611 ; SCALAR-NEXT: movw %cx, 34(%rdx)
5612 ; SCALAR-NEXT: movw %ax, 32(%rdx)
5613 ; SCALAR-NEXT: movw %cx, 38(%rdx)
5614 ; SCALAR-NEXT: movw %ax, 36(%rdx)
5615 ; SCALAR-NEXT: movw %cx, 42(%rdx)
5616 ; SCALAR-NEXT: movw %ax, 40(%rdx)
5617 ; SCALAR-NEXT: movw %cx, 46(%rdx)
5618 ; SCALAR-NEXT: movw %ax, 44(%rdx)
5619 ; SCALAR-NEXT: movw %cx, 50(%rdx)
5620 ; SCALAR-NEXT: movw %ax, 48(%rdx)
5621 ; SCALAR-NEXT: movw %cx, 54(%rdx)
5622 ; SCALAR-NEXT: movw %ax, 52(%rdx)
5623 ; SCALAR-NEXT: movw %cx, 58(%rdx)
5624 ; SCALAR-NEXT: movw %ax, 56(%rdx)
5625 ; SCALAR-NEXT: movw %cx, 62(%rdx)
5626 ; SCALAR-NEXT: movw %ax, 60(%rdx)
5629 ; SSE2-LABEL: vec512_v2i16:
5631 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
5632 ; SSE2-NEXT: pxor (%rdi), %xmm0
5633 ; SSE2-NEXT: movd %xmm0, (%rsi)
5634 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5635 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5636 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5637 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5638 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5641 ; AVX1-LABEL: vec512_v2i16:
5643 ; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5644 ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
5645 ; AVX1-NEXT: vmovd %xmm0, (%rsi)
5646 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5647 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5648 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5649 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5650 ; AVX1-NEXT: vzeroupper
5653 ; AVX2-ONLY-LABEL: vec512_v2i16:
5654 ; AVX2-ONLY: # %bb.0:
5655 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5656 ; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0
5657 ; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
5658 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0
5659 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5660 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5661 ; AVX2-ONLY-NEXT: vzeroupper
5662 ; AVX2-ONLY-NEXT: retq
5664 ; AVX512-LABEL: vec512_v2i16:
5666 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
5667 ; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
5668 ; AVX512-NEXT: vmovd %xmm0, (%rsi)
5669 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
5670 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
5671 ; AVX512-NEXT: vzeroupper
5672 ; AVX512-NEXT: retq
5673 %in.subvec.not = load <2 x i16>, ptr %in.subvec.ptr, align 64
5674 %in.subvec = xor <2 x i16> %in.subvec.not, <i16 -1, i16 -1>
5675 store <2 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
5676 %out.subvec0.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 0
5677 store <2 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
5678 %out.subvec1.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 1
5679 store <2 x i16> %in.subvec, ptr %out.subvec1.ptr, align 4
5680 %out.subvec2.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 2
5681 store <2 x i16> %in.subvec, ptr %out.subvec2.ptr, align 8
5682 %out.subvec3.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 3
5683 store <2 x i16> %in.subvec, ptr %out.subvec3.ptr, align 4
5684 %out.subvec4.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 4
5685 store <2 x i16> %in.subvec, ptr %out.subvec4.ptr, align 16
5686 %out.subvec5.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 5
5687 store <2 x i16> %in.subvec, ptr %out.subvec5.ptr, align 4
5688 %out.subvec6.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 6
5689 store <2 x i16> %in.subvec, ptr %out.subvec6.ptr, align 8
5690 %out.subvec7.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 7
5691 store <2 x i16> %in.subvec, ptr %out.subvec7.ptr, align 4
5692 %out.subvec8.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 8
5693 store <2 x i16> %in.subvec, ptr %out.subvec8.ptr, align 32
5694 %out.subvec9.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 9
5695 store <2 x i16> %in.subvec, ptr %out.subvec9.ptr, align 4
5696 %out.subvec10.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 10
5697 store <2 x i16> %in.subvec, ptr %out.subvec10.ptr, align 8
5698 %out.subvec11.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 11
5699 store <2 x i16> %in.subvec, ptr %out.subvec11.ptr, align 4
5700 %out.subvec12.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 12
5701 store <2 x i16> %in.subvec, ptr %out.subvec12.ptr, align 16
5702 %out.subvec13.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 13
5703 store <2 x i16> %in.subvec, ptr %out.subvec13.ptr, align 4
5704 %out.subvec14.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 14
5705 store <2 x i16> %in.subvec, ptr %out.subvec14.ptr, align 8
5706 %out.subvec15.ptr = getelementptr <2 x i16>, ptr %out.vec.ptr, i64 15
5707 store <2 x i16> %in.subvec, ptr %out.subvec15.ptr, align 4
5708 ret void
5709 }
5711 define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
5712 ; SCALAR-LABEL: vec512_v2i32:
5714 ; SCALAR-NEXT: movl (%rdi), %eax
5715 ; SCALAR-NEXT: movl 4(%rdi), %ecx
5716 ; SCALAR-NEXT: notl %eax
5717 ; SCALAR-NEXT: notl %ecx
5718 ; SCALAR-NEXT: movl %ecx, 4(%rsi)
5719 ; SCALAR-NEXT: movl %eax, (%rsi)
5720 ; SCALAR-NEXT: movl %ecx, 4(%rdx)
5721 ; SCALAR-NEXT: movl %eax, (%rdx)
5722 ; SCALAR-NEXT: movl %ecx, 12(%rdx)
5723 ; SCALAR-NEXT: movl %eax, 8(%rdx)
5724 ; SCALAR-NEXT: movl %ecx, 20(%rdx)
5725 ; SCALAR-NEXT: movl %eax, 16(%rdx)
5726 ; SCALAR-NEXT: movl %ecx, 28(%rdx)
5727 ; SCALAR-NEXT: movl %eax, 24(%rdx)
5728 ; SCALAR-NEXT: movl %ecx, 36(%rdx)
5729 ; SCALAR-NEXT: movl %eax, 32(%rdx)
5730 ; SCALAR-NEXT: movl %ecx, 44(%rdx)
5731 ; SCALAR-NEXT: movl %eax, 40(%rdx)
5732 ; SCALAR-NEXT: movl %ecx, 52(%rdx)
5733 ; SCALAR-NEXT: movl %eax, 48(%rdx)
5734 ; SCALAR-NEXT: movl %ecx, 60(%rdx)
5735 ; SCALAR-NEXT: movl %eax, 56(%rdx)
5738 ; SSE2-LABEL: vec512_v2i32:
5740 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
5741 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
5742 ; SSE2-NEXT: pxor %xmm0, %xmm1
5743 ; SSE2-NEXT: movq %xmm1, (%rsi)
5744 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
5745 ; SSE2-NEXT: movdqa %xmm0, (%rdx)
5746 ; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
5747 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
5748 ; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
5751 ; AVX1-LABEL: vec512_v2i32:
5753 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5754 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5755 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
5756 ; AVX1-NEXT: vmovq %xmm0, (%rsi)
5757 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
5758 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5759 ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
5760 ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
5761 ; AVX1-NEXT: vzeroupper
5764 ; AVX2-ONLY-LABEL: vec512_v2i32:
5765 ; AVX2-ONLY: # %bb.0:
5766 ; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5767 ; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
5768 ; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
5769 ; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
5770 ; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
5771 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
5772 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
5773 ; AVX2-ONLY-NEXT: vzeroupper
5774 ; AVX2-ONLY-NEXT: retq
5776 ; AVX512-LABEL: vec512_v2i32:
5778 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
5779 ; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
5780 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
5781 ; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
5782 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
5783 ; AVX512-NEXT: vzeroupper
5784 ; AVX512-NEXT: retq
5785 %in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
5786 %in.subvec = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
5787 store <2 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
5788 %out.subvec0.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 0
5789 store <2 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
5790 %out.subvec1.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 1
5791 store <2 x i32> %in.subvec, ptr %out.subvec1.ptr, align 8
5792 %out.subvec2.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 2
5793 store <2 x i32> %in.subvec, ptr %out.subvec2.ptr, align 16
5794 %out.subvec3.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 3
5795 store <2 x i32> %in.subvec, ptr %out.subvec3.ptr, align 8
5796 %out.subvec4.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 4
5797 store <2 x i32> %in.subvec, ptr %out.subvec4.ptr, align 32
5798 %out.subvec5.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 5
5799 store <2 x i32> %in.subvec, ptr %out.subvec5.ptr, align 8
5800 %out.subvec6.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 6
5801 store <2 x i32> %in.subvec, ptr %out.subvec6.ptr, align 16
5802 %out.subvec7.ptr = getelementptr <2 x i32>, ptr %out.vec.ptr, i64 7
5803 store <2 x i32> %in.subvec, ptr %out.subvec7.ptr, align 8
define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v2f32:
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %ecx
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ecx, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ecx, 12(%rdx)
; SCALAR-NEXT: movl %eax, 8(%rdx)
; SCALAR-NEXT: movl %ecx, 20(%rdx)
; SCALAR-NEXT: movl %eax, 16(%rdx)
; SCALAR-NEXT: movl %ecx, 28(%rdx)
; SCALAR-NEXT: movl %eax, 24(%rdx)
; SCALAR-NEXT: movl %ecx, 36(%rdx)
; SCALAR-NEXT: movl %eax, 32(%rdx)
; SCALAR-NEXT: movl %ecx, 44(%rdx)
; SCALAR-NEXT: movl %eax, 40(%rdx)
; SCALAR-NEXT: movl %ecx, 52(%rdx)
; SCALAR-NEXT: movl %eax, 48(%rdx)
; SCALAR-NEXT: movl %ecx, 60(%rdx)
; SCALAR-NEXT: movl %eax, 56(%rdx)
; SSE2-LABEL: vec512_v2f32:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX1-LABEL: vec512_v2f32:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec512_v2f32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec512_v2f32:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <2 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <2 x i32> %in.subvec.not, <i32 -1, i32 -1>
%in.subvec = bitcast <2 x i32> %in.subvec.int to <2 x float>
store <2 x float> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 0
store <2 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 1
store <2 x float> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 2
store <2 x float> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 3
store <2 x float> %in.subvec, ptr %out.subvec3.ptr, align 8
%out.subvec4.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 4
store <2 x float> %in.subvec, ptr %out.subvec4.ptr, align 32
%out.subvec5.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 5
store <2 x float> %in.subvec, ptr %out.subvec5.ptr, align 8
%out.subvec6.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 6
store <2 x float> %in.subvec, ptr %out.subvec6.ptr, align 16
%out.subvec7.ptr = getelementptr <2 x float>, ptr %out.vec.ptr, i64 7
store <2 x float> %in.subvec, ptr %out.subvec7.ptr, align 8
define void @vec512_v2i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v2i64:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rdx)
; SCALAR-NEXT: movq %rax, (%rdx)
; SCALAR-NEXT: movq %rcx, 24(%rdx)
; SCALAR-NEXT: movq %rax, 16(%rdx)
; SCALAR-NEXT: movq %rcx, 40(%rdx)
; SCALAR-NEXT: movq %rax, 32(%rdx)
; SCALAR-NEXT: movq %rcx, 56(%rdx)
; SCALAR-NEXT: movq %rax, 48(%rdx)
; SSE2-LABEL: vec512_v2i64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX-LABEL: vec512_v2i64:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
%in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
store <2 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 0
store <2 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 1
store <2 x i64> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 2
store <2 x i64> %in.subvec, ptr %out.subvec2.ptr, align 32
%out.subvec3.ptr = getelementptr <2 x i64>, ptr %out.vec.ptr, i64 3
store <2 x i64> %in.subvec, ptr %out.subvec3.ptr, align 16
define void @vec512_v2f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v2f64:
; SCALAR-NEXT: movq (%rdi), %rax
; SCALAR-NEXT: movq 8(%rdi), %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: movq %rcx, 8(%rsi)
; SCALAR-NEXT: movq %rax, (%rsi)
; SCALAR-NEXT: movq %rcx, 8(%rdx)
; SCALAR-NEXT: movq %rax, (%rdx)
; SCALAR-NEXT: movq %rcx, 24(%rdx)
; SCALAR-NEXT: movq %rax, 16(%rdx)
; SCALAR-NEXT: movq %rcx, 40(%rdx)
; SCALAR-NEXT: movq %rax, 32(%rdx)
; SCALAR-NEXT: movq %rcx, 56(%rdx)
; SCALAR-NEXT: movq %rax, 48(%rdx)
; SSE2-LABEL: vec512_v2f64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX-LABEL: vec512_v2f64:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
%in.subvec.not = load <2 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <2 x i64> %in.subvec.not, <i64 -1, i64 -1>
%in.subvec = bitcast <2 x i64> %in.subvec.int to <2 x double>
store <2 x double> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 0
store <2 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 1
store <2 x double> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 2
store <2 x double> %in.subvec, ptr %out.subvec2.ptr, align 32
%out.subvec3.ptr = getelementptr <2 x double>, ptr %out.vec.ptr, i64 3
store <2 x double> %in.subvec, ptr %out.subvec3.ptr, align 16
define void @vec512_v2i128(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; ALL-LABEL: vec512_v2i128:
; ALL-NEXT: movq 16(%rdi), %rax
; ALL-NEXT: movq 24(%rdi), %rcx
; ALL-NEXT: movq (%rdi), %r8
; ALL-NEXT: movq 8(%rdi), %rdi
; ALL-NEXT: notq %rdi
; ALL-NEXT: notq %r8
; ALL-NEXT: notq %rcx
; ALL-NEXT: notq %rax
; ALL-NEXT: movq %rax, 16(%rsi)
; ALL-NEXT: movq %rcx, 24(%rsi)
; ALL-NEXT: movq %r8, (%rsi)
; ALL-NEXT: movq %rdi, 8(%rsi)
; ALL-NEXT: movq %rax, 16(%rdx)
; ALL-NEXT: movq %rcx, 24(%rdx)
; ALL-NEXT: movq %r8, (%rdx)
; ALL-NEXT: movq %rdi, 8(%rdx)
; ALL-NEXT: movq %rax, 48(%rdx)
; ALL-NEXT: movq %rcx, 56(%rdx)
; ALL-NEXT: movq %r8, 32(%rdx)
; ALL-NEXT: movq %rdi, 40(%rdx)
%in.subvec.not = load <2 x i128>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <2 x i128> %in.subvec.not, <i128 -1, i128 -1>
store <2 x i128> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 0
store <2 x i128> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <2 x i128>, ptr %out.vec.ptr, i64 1
store <2 x i128> %in.subvec, ptr %out.subvec1.ptr, align 32
define void @vec512_v4i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v4i8:
; SCALAR-NEXT: movzbl 3(%rdi), %r8d
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movb %r8b, 3(%rsi)
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %dil, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %r8b, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %dil, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %r8b, 7(%rdx)
; SCALAR-NEXT: movb %cl, 6(%rdx)
; SCALAR-NEXT: movb %dil, 5(%rdx)
; SCALAR-NEXT: movb %al, 4(%rdx)
; SCALAR-NEXT: movb %r8b, 11(%rdx)
; SCALAR-NEXT: movb %cl, 10(%rdx)
; SCALAR-NEXT: movb %dil, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: movb %r8b, 15(%rdx)
; SCALAR-NEXT: movb %cl, 14(%rdx)
; SCALAR-NEXT: movb %dil, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movb %r8b, 19(%rdx)
; SCALAR-NEXT: movb %cl, 18(%rdx)
; SCALAR-NEXT: movb %dil, 17(%rdx)
; SCALAR-NEXT: movb %al, 16(%rdx)
; SCALAR-NEXT: movb %r8b, 23(%rdx)
; SCALAR-NEXT: movb %cl, 22(%rdx)
; SCALAR-NEXT: movb %dil, 21(%rdx)
; SCALAR-NEXT: movb %al, 20(%rdx)
; SCALAR-NEXT: movb %r8b, 27(%rdx)
; SCALAR-NEXT: movb %cl, 26(%rdx)
; SCALAR-NEXT: movb %dil, 25(%rdx)
; SCALAR-NEXT: movb %al, 24(%rdx)
; SCALAR-NEXT: movb %r8b, 31(%rdx)
; SCALAR-NEXT: movb %cl, 30(%rdx)
; SCALAR-NEXT: movb %dil, 29(%rdx)
; SCALAR-NEXT: movb %al, 28(%rdx)
; SCALAR-NEXT: movb %r8b, 35(%rdx)
; SCALAR-NEXT: movb %cl, 34(%rdx)
; SCALAR-NEXT: movb %dil, 33(%rdx)
; SCALAR-NEXT: movb %al, 32(%rdx)
; SCALAR-NEXT: movb %r8b, 39(%rdx)
; SCALAR-NEXT: movb %cl, 38(%rdx)
; SCALAR-NEXT: movb %dil, 37(%rdx)
; SCALAR-NEXT: movb %al, 36(%rdx)
; SCALAR-NEXT: movb %r8b, 43(%rdx)
; SCALAR-NEXT: movb %cl, 42(%rdx)
; SCALAR-NEXT: movb %dil, 41(%rdx)
; SCALAR-NEXT: movb %al, 40(%rdx)
; SCALAR-NEXT: movb %r8b, 47(%rdx)
; SCALAR-NEXT: movb %cl, 46(%rdx)
; SCALAR-NEXT: movb %dil, 45(%rdx)
; SCALAR-NEXT: movb %al, 44(%rdx)
; SCALAR-NEXT: movb %r8b, 51(%rdx)
; SCALAR-NEXT: movb %cl, 50(%rdx)
; SCALAR-NEXT: movb %dil, 49(%rdx)
; SCALAR-NEXT: movb %al, 48(%rdx)
; SCALAR-NEXT: movb %r8b, 55(%rdx)
; SCALAR-NEXT: movb %cl, 54(%rdx)
; SCALAR-NEXT: movb %dil, 53(%rdx)
; SCALAR-NEXT: movb %al, 52(%rdx)
; SCALAR-NEXT: movb %r8b, 59(%rdx)
; SCALAR-NEXT: movb %cl, 58(%rdx)
; SCALAR-NEXT: movb %dil, 57(%rdx)
; SCALAR-NEXT: movb %al, 56(%rdx)
; SCALAR-NEXT: movb %r8b, 63(%rdx)
; SCALAR-NEXT: movb %cl, 62(%rdx)
; SCALAR-NEXT: movb %dil, 61(%rdx)
; SCALAR-NEXT: movb %al, 60(%rdx)
; SSE2-LABEL: vec512_v4i8:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movd %xmm0, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX1-LABEL: vec512_v4i8:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec512_v4i8:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovd %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec512_v4i8:
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <4 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1>
store <4 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 0
store <4 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 1
store <4 x i8> %in.subvec, ptr %out.subvec1.ptr, align 4
%out.subvec2.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 2
store <4 x i8> %in.subvec, ptr %out.subvec2.ptr, align 8
%out.subvec3.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 3
store <4 x i8> %in.subvec, ptr %out.subvec3.ptr, align 4
%out.subvec4.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 4
store <4 x i8> %in.subvec, ptr %out.subvec4.ptr, align 16
%out.subvec5.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 5
store <4 x i8> %in.subvec, ptr %out.subvec5.ptr, align 4
%out.subvec6.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 6
store <4 x i8> %in.subvec, ptr %out.subvec6.ptr, align 8
%out.subvec7.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 7
store <4 x i8> %in.subvec, ptr %out.subvec7.ptr, align 4
%out.subvec8.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 8
store <4 x i8> %in.subvec, ptr %out.subvec8.ptr, align 32
%out.subvec9.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 9
store <4 x i8> %in.subvec, ptr %out.subvec9.ptr, align 4
%out.subvec10.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 10
store <4 x i8> %in.subvec, ptr %out.subvec10.ptr, align 8
%out.subvec11.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 11
store <4 x i8> %in.subvec, ptr %out.subvec11.ptr, align 4
%out.subvec12.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 12
store <4 x i8> %in.subvec, ptr %out.subvec12.ptr, align 16
%out.subvec13.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 13
store <4 x i8> %in.subvec, ptr %out.subvec13.ptr, align 4
%out.subvec14.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 14
store <4 x i8> %in.subvec, ptr %out.subvec14.ptr, align 8
%out.subvec15.ptr = getelementptr <4 x i8>, ptr %out.vec.ptr, i64 15
store <4 x i8> %in.subvec, ptr %out.subvec15.ptr, align 4
define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v4i16:
; SCALAR-NEXT: movzwl 6(%rdi), %r8d
; SCALAR-NEXT: movzwl 2(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: movw %r8w, 6(%rsi)
; SCALAR-NEXT: movw %di, 4(%rsi)
; SCALAR-NEXT: movw %cx, 2(%rsi)
; SCALAR-NEXT: movw %ax, (%rsi)
; SCALAR-NEXT: movw %r8w, 6(%rdx)
; SCALAR-NEXT: movw %di, 4(%rdx)
; SCALAR-NEXT: movw %cx, 2(%rdx)
; SCALAR-NEXT: movw %ax, (%rdx)
; SCALAR-NEXT: movw %r8w, 14(%rdx)
; SCALAR-NEXT: movw %di, 12(%rdx)
; SCALAR-NEXT: movw %cx, 10(%rdx)
; SCALAR-NEXT: movw %ax, 8(%rdx)
; SCALAR-NEXT: movw %r8w, 22(%rdx)
; SCALAR-NEXT: movw %di, 20(%rdx)
; SCALAR-NEXT: movw %cx, 18(%rdx)
; SCALAR-NEXT: movw %ax, 16(%rdx)
; SCALAR-NEXT: movw %r8w, 30(%rdx)
; SCALAR-NEXT: movw %di, 28(%rdx)
; SCALAR-NEXT: movw %cx, 26(%rdx)
; SCALAR-NEXT: movw %ax, 24(%rdx)
; SCALAR-NEXT: movw %r8w, 38(%rdx)
; SCALAR-NEXT: movw %di, 36(%rdx)
; SCALAR-NEXT: movw %cx, 34(%rdx)
; SCALAR-NEXT: movw %ax, 32(%rdx)
; SCALAR-NEXT: movw %r8w, 46(%rdx)
; SCALAR-NEXT: movw %di, 44(%rdx)
; SCALAR-NEXT: movw %cx, 42(%rdx)
; SCALAR-NEXT: movw %ax, 40(%rdx)
; SCALAR-NEXT: movw %r8w, 54(%rdx)
; SCALAR-NEXT: movw %di, 52(%rdx)
; SCALAR-NEXT: movw %cx, 50(%rdx)
; SCALAR-NEXT: movw %ax, 48(%rdx)
; SCALAR-NEXT: movw %r8w, 62(%rdx)
; SCALAR-NEXT: movw %di, 60(%rdx)
; SCALAR-NEXT: movw %cx, 58(%rdx)
; SCALAR-NEXT: movw %ax, 56(%rdx)
; SSE2-LABEL: vec512_v4i16:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX1-LABEL: vec512_v4i16:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec512_v4i16:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec512_v4i16:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <4 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1>
store <4 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 0
store <4 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 1
store <4 x i16> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 2
store <4 x i16> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 3
store <4 x i16> %in.subvec, ptr %out.subvec3.ptr, align 8
%out.subvec4.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 4
store <4 x i16> %in.subvec, ptr %out.subvec4.ptr, align 32
%out.subvec5.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 5
store <4 x i16> %in.subvec, ptr %out.subvec5.ptr, align 8
%out.subvec6.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 6
store <4 x i16> %in.subvec, ptr %out.subvec6.ptr, align 16
%out.subvec7.ptr = getelementptr <4 x i16>, ptr %out.vec.ptr, i64 7
store <4 x i16> %in.subvec, ptr %out.subvec7.ptr, align 8
define void @vec512_v4i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v4i32:
; SCALAR-NEXT: movaps (%rdi), %xmm0
; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SCALAR-NEXT: movaps %xmm0, (%rsi)
; SCALAR-NEXT: movaps %xmm0, (%rdx)
; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
; SCALAR-NEXT: movaps %xmm0, 48(%rdx)
; SSE2-LABEL: vec512_v4i32:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX-LABEL: vec512_v4i32:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
%in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
store <4 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 0
store <4 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 1
store <4 x i32> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 2
store <4 x i32> %in.subvec, ptr %out.subvec2.ptr, align 32
%out.subvec3.ptr = getelementptr <4 x i32>, ptr %out.vec.ptr, i64 3
store <4 x i32> %in.subvec, ptr %out.subvec3.ptr, align 16
define void @vec512_v4f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v4f32:
; SCALAR-NEXT: movaps (%rdi), %xmm0
; SCALAR-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SCALAR-NEXT: movaps %xmm0, (%rsi)
; SCALAR-NEXT: movaps %xmm0, (%rdx)
; SCALAR-NEXT: movaps %xmm0, 16(%rdx)
; SCALAR-NEXT: movaps %xmm0, 32(%rdx)
; SCALAR-NEXT: movaps %xmm0, 48(%rdx)
; SSE2-LABEL: vec512_v4f32:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX-LABEL: vec512_v4f32:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
%in.subvec.not = load <4 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <4 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1>
%in.subvec = bitcast <4 x i32> %in.subvec.int to <4 x float>
store <4 x float> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 0
store <4 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 1
store <4 x float> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 2
store <4 x float> %in.subvec, ptr %out.subvec2.ptr, align 32
%out.subvec3.ptr = getelementptr <4 x float>, ptr %out.vec.ptr, i64 3
store <4 x float> %in.subvec, ptr %out.subvec3.ptr, align 16
define void @vec512_v4i64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v4i64:
; SCALAR-NEXT: movq 24(%rdi), %rax
; SCALAR-NEXT: movq 16(%rdi), %rcx
; SCALAR-NEXT: movq (%rdi), %r8
; SCALAR-NEXT: movq 8(%rdi), %rdi
; SCALAR-NEXT: notq %r8
; SCALAR-NEXT: notq %rdi
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: movq %rax, 24(%rsi)
; SCALAR-NEXT: movq %rcx, 16(%rsi)
; SCALAR-NEXT: movq %rdi, 8(%rsi)
; SCALAR-NEXT: movq %r8, (%rsi)
; SCALAR-NEXT: movq %rax, 24(%rdx)
; SCALAR-NEXT: movq %rcx, 16(%rdx)
; SCALAR-NEXT: movq %rdi, 8(%rdx)
; SCALAR-NEXT: movq %r8, (%rdx)
; SCALAR-NEXT: movq %rax, 56(%rdx)
; SCALAR-NEXT: movq %rcx, 48(%rdx)
; SCALAR-NEXT: movq %rdi, 40(%rdx)
; SCALAR-NEXT: movq %r8, 32(%rdx)
; SSE2-LABEL: vec512_v4i64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec512_v4i64:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec512_v4i64:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1>
store <4 x i64> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 0
store <4 x i64> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x i64>, ptr %out.vec.ptr, i64 1
store <4 x i64> %in.subvec, ptr %out.subvec1.ptr, align 32
define void @vec512_v4f64(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v4f64:
; SCALAR-NEXT: movq 24(%rdi), %rax
; SCALAR-NEXT: movq 16(%rdi), %rcx
; SCALAR-NEXT: movq (%rdi), %r8
; SCALAR-NEXT: movq 8(%rdi), %rdi
; SCALAR-NEXT: notq %r8
; SCALAR-NEXT: notq %rdi
; SCALAR-NEXT: notq %rcx
; SCALAR-NEXT: notq %rax
; SCALAR-NEXT: movq %rax, 24(%rsi)
; SCALAR-NEXT: movq %rcx, 16(%rsi)
; SCALAR-NEXT: movq %rdi, 8(%rsi)
; SCALAR-NEXT: movq %r8, (%rsi)
; SCALAR-NEXT: movq %rax, 24(%rdx)
; SCALAR-NEXT: movq %rcx, 16(%rdx)
; SCALAR-NEXT: movq %rdi, 8(%rdx)
; SCALAR-NEXT: movq %r8, (%rdx)
; SCALAR-NEXT: movq %rax, 56(%rdx)
; SCALAR-NEXT: movq %rcx, 48(%rdx)
; SCALAR-NEXT: movq %rdi, 40(%rdx)
; SCALAR-NEXT: movq %r8, 32(%rdx)
; SSE2-LABEL: vec512_v4f64:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec512_v4f64:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec512_v4f64:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <4 x i64>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <4 x i64> %in.subvec.not, <i64 -1, i64 -1, i64 -1, i64 -1>
%in.subvec = bitcast <4 x i64> %in.subvec.int to <4 x double>
store <4 x double> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 0
store <4 x double> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <4 x double>, ptr %out.vec.ptr, i64 1
store <4 x double> %in.subvec, ptr %out.subvec1.ptr, align 32
define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v8i8:
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzbl 7(%rdi), %ebx
; SCALAR-NEXT: movzbl 6(%rdi), %r11d
; SCALAR-NEXT: movzbl 5(%rdi), %r10d
; SCALAR-NEXT: movzbl 4(%rdi), %r9d
; SCALAR-NEXT: movzbl 3(%rdi), %r8d
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: movb %bl, 7(%rsi)
; SCALAR-NEXT: movb %r11b, 6(%rsi)
; SCALAR-NEXT: movb %r10b, 5(%rsi)
; SCALAR-NEXT: movb %r9b, 4(%rsi)
; SCALAR-NEXT: movb %r8b, 3(%rsi)
; SCALAR-NEXT: movb %cl, 2(%rsi)
; SCALAR-NEXT: movb %dil, 1(%rsi)
; SCALAR-NEXT: movb %al, (%rsi)
; SCALAR-NEXT: movb %bl, 7(%rdx)
; SCALAR-NEXT: movb %r11b, 6(%rdx)
; SCALAR-NEXT: movb %r10b, 5(%rdx)
; SCALAR-NEXT: movb %r9b, 4(%rdx)
; SCALAR-NEXT: movb %r8b, 3(%rdx)
; SCALAR-NEXT: movb %cl, 2(%rdx)
; SCALAR-NEXT: movb %dil, 1(%rdx)
; SCALAR-NEXT: movb %al, (%rdx)
; SCALAR-NEXT: movb %bl, 15(%rdx)
; SCALAR-NEXT: movb %r11b, 14(%rdx)
; SCALAR-NEXT: movb %r10b, 13(%rdx)
; SCALAR-NEXT: movb %r9b, 12(%rdx)
; SCALAR-NEXT: movb %r8b, 11(%rdx)
; SCALAR-NEXT: movb %cl, 10(%rdx)
; SCALAR-NEXT: movb %dil, 9(%rdx)
; SCALAR-NEXT: movb %al, 8(%rdx)
; SCALAR-NEXT: movb %bl, 23(%rdx)
; SCALAR-NEXT: movb %r11b, 22(%rdx)
; SCALAR-NEXT: movb %r10b, 21(%rdx)
; SCALAR-NEXT: movb %r9b, 20(%rdx)
; SCALAR-NEXT: movb %r8b, 19(%rdx)
; SCALAR-NEXT: movb %cl, 18(%rdx)
; SCALAR-NEXT: movb %dil, 17(%rdx)
; SCALAR-NEXT: movb %al, 16(%rdx)
; SCALAR-NEXT: movb %bl, 31(%rdx)
; SCALAR-NEXT: movb %r11b, 30(%rdx)
; SCALAR-NEXT: movb %r10b, 29(%rdx)
; SCALAR-NEXT: movb %r9b, 28(%rdx)
; SCALAR-NEXT: movb %r8b, 27(%rdx)
; SCALAR-NEXT: movb %cl, 26(%rdx)
; SCALAR-NEXT: movb %dil, 25(%rdx)
; SCALAR-NEXT: movb %al, 24(%rdx)
; SCALAR-NEXT: movb %bl, 39(%rdx)
; SCALAR-NEXT: movb %r11b, 38(%rdx)
; SCALAR-NEXT: movb %r10b, 37(%rdx)
; SCALAR-NEXT: movb %r9b, 36(%rdx)
; SCALAR-NEXT: movb %r8b, 35(%rdx)
; SCALAR-NEXT: movb %cl, 34(%rdx)
; SCALAR-NEXT: movb %dil, 33(%rdx)
; SCALAR-NEXT: movb %al, 32(%rdx)
; SCALAR-NEXT: movb %bl, 47(%rdx)
; SCALAR-NEXT: movb %r11b, 46(%rdx)
; SCALAR-NEXT: movb %r10b, 45(%rdx)
; SCALAR-NEXT: movb %r9b, 44(%rdx)
; SCALAR-NEXT: movb %r8b, 43(%rdx)
; SCALAR-NEXT: movb %cl, 42(%rdx)
; SCALAR-NEXT: movb %dil, 41(%rdx)
; SCALAR-NEXT: movb %al, 40(%rdx)
; SCALAR-NEXT: movb %bl, 55(%rdx)
; SCALAR-NEXT: movb %r11b, 54(%rdx)
; SCALAR-NEXT: movb %r10b, 53(%rdx)
; SCALAR-NEXT: movb %r9b, 52(%rdx)
; SCALAR-NEXT: movb %r8b, 51(%rdx)
; SCALAR-NEXT: movb %cl, 50(%rdx)
; SCALAR-NEXT: movb %dil, 49(%rdx)
; SCALAR-NEXT: movb %al, 48(%rdx)
; SCALAR-NEXT: movb %bl, 63(%rdx)
; SCALAR-NEXT: movb %r11b, 62(%rdx)
; SCALAR-NEXT: movb %r10b, 61(%rdx)
; SCALAR-NEXT: movb %r9b, 60(%rdx)
; SCALAR-NEXT: movb %r8b, 59(%rdx)
; SCALAR-NEXT: movb %cl, 58(%rdx)
; SCALAR-NEXT: movb %dil, 57(%rdx)
; SCALAR-NEXT: movb %al, 56(%rdx)
; SCALAR-NEXT: popq %rbx
; SSE2-LABEL: vec512_v8i8:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movq %xmm1, (%rsi)
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX1-LABEL: vec512_v8i8:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-ONLY-LABEL: vec512_v8i8:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-ONLY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
; AVX512-LABEL: vec512_v8i8:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512-NEXT: vzeroupper
%in.subvec.not = load <8 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <8 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
store <8 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 0
store <8 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 1
store <8 x i8> %in.subvec, ptr %out.subvec1.ptr, align 8
%out.subvec2.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 2
store <8 x i8> %in.subvec, ptr %out.subvec2.ptr, align 16
%out.subvec3.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 3
store <8 x i8> %in.subvec, ptr %out.subvec3.ptr, align 8
%out.subvec4.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 4
store <8 x i8> %in.subvec, ptr %out.subvec4.ptr, align 32
%out.subvec5.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 5
store <8 x i8> %in.subvec, ptr %out.subvec5.ptr, align 8
%out.subvec6.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 6
store <8 x i8> %in.subvec, ptr %out.subvec6.ptr, align 16
%out.subvec7.ptr = getelementptr <8 x i8>, ptr %out.vec.ptr, i64 7
store <8 x i8> %in.subvec, ptr %out.subvec7.ptr, align 8
define void @vec512_v8i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v8i16:
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzwl 14(%rdi), %ebx
; SCALAR-NEXT: movl 12(%rdi), %r11d
; SCALAR-NEXT: movzwl 10(%rdi), %r10d
; SCALAR-NEXT: movl 8(%rdi), %r9d
; SCALAR-NEXT: movzwl 6(%rdi), %r8d
; SCALAR-NEXT: movzwl 2(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: notl %r9d
; SCALAR-NEXT: notl %r10d
; SCALAR-NEXT: notl %r11d
; SCALAR-NEXT: notl %ebx
; SCALAR-NEXT: movw %bx, 14(%rsi)
; SCALAR-NEXT: movw %r11w, 12(%rsi)
; SCALAR-NEXT: movw %r10w, 10(%rsi)
; SCALAR-NEXT: movw %r9w, 8(%rsi)
; SCALAR-NEXT: movw %r8w, 6(%rsi)
; SCALAR-NEXT: movw %di, 4(%rsi)
; SCALAR-NEXT: movw %cx, 2(%rsi)
; SCALAR-NEXT: movw %ax, (%rsi)
; SCALAR-NEXT: movw %bx, 14(%rdx)
; SCALAR-NEXT: movw %r11w, 12(%rdx)
; SCALAR-NEXT: movw %r10w, 10(%rdx)
; SCALAR-NEXT: movw %r9w, 8(%rdx)
; SCALAR-NEXT: movw %r8w, 6(%rdx)
; SCALAR-NEXT: movw %di, 4(%rdx)
; SCALAR-NEXT: movw %cx, 2(%rdx)
; SCALAR-NEXT: movw %ax, (%rdx)
; SCALAR-NEXT: movw %bx, 30(%rdx)
; SCALAR-NEXT: movw %r11w, 28(%rdx)
; SCALAR-NEXT: movw %r10w, 26(%rdx)
; SCALAR-NEXT: movw %r9w, 24(%rdx)
; SCALAR-NEXT: movw %r8w, 22(%rdx)
; SCALAR-NEXT: movw %di, 20(%rdx)
; SCALAR-NEXT: movw %cx, 18(%rdx)
; SCALAR-NEXT: movw %ax, 16(%rdx)
; SCALAR-NEXT: movw %bx, 46(%rdx)
; SCALAR-NEXT: movw %r11w, 44(%rdx)
; SCALAR-NEXT: movw %r10w, 42(%rdx)
; SCALAR-NEXT: movw %r9w, 40(%rdx)
; SCALAR-NEXT: movw %r8w, 38(%rdx)
; SCALAR-NEXT: movw %di, 36(%rdx)
; SCALAR-NEXT: movw %cx, 34(%rdx)
; SCALAR-NEXT: movw %ax, 32(%rdx)
; SCALAR-NEXT: movw %bx, 62(%rdx)
; SCALAR-NEXT: movw %r11w, 60(%rdx)
; SCALAR-NEXT: movw %r10w, 58(%rdx)
; SCALAR-NEXT: movw %r9w, 56(%rdx)
; SCALAR-NEXT: movw %r8w, 54(%rdx)
; SCALAR-NEXT: movw %di, 52(%rdx)
; SCALAR-NEXT: movw %cx, 50(%rdx)
; SCALAR-NEXT: movw %ax, 48(%rdx)
; SCALAR-NEXT: popq %rbx
; SSE2-LABEL: vec512_v8i16:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX-LABEL: vec512_v8i16:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
%in.subvec.not = load <8 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <8 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
store <8 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 0
store <8 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 1
store <8 x i16> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 2
store <8 x i16> %in.subvec, ptr %out.subvec2.ptr, align 32
%out.subvec3.ptr = getelementptr <8 x i16>, ptr %out.vec.ptr, i64 3
store <8 x i16> %in.subvec, ptr %out.subvec3.ptr, align 16
define void @vec512_v8i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v8i32:
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movl 28(%rdi), %ebx
; SCALAR-NEXT: movl 24(%rdi), %r11d
; SCALAR-NEXT: movl 20(%rdi), %r10d
; SCALAR-NEXT: movl 16(%rdi), %r9d
; SCALAR-NEXT: movl 12(%rdi), %r8d
; SCALAR-NEXT: movl 8(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: notl %r9d
; SCALAR-NEXT: notl %r10d
; SCALAR-NEXT: notl %r11d
; SCALAR-NEXT: notl %ebx
; SCALAR-NEXT: movl %ebx, 28(%rsi)
; SCALAR-NEXT: movl %r11d, 24(%rsi)
; SCALAR-NEXT: movl %r10d, 20(%rsi)
; SCALAR-NEXT: movl %r9d, 16(%rsi)
; SCALAR-NEXT: movl %r8d, 12(%rsi)
; SCALAR-NEXT: movl %ecx, 8(%rsi)
; SCALAR-NEXT: movl %edi, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ebx, 28(%rdx)
; SCALAR-NEXT: movl %r11d, 24(%rdx)
; SCALAR-NEXT: movl %r10d, 20(%rdx)
; SCALAR-NEXT: movl %r9d, 16(%rdx)
; SCALAR-NEXT: movl %r8d, 12(%rdx)
; SCALAR-NEXT: movl %ecx, 8(%rdx)
; SCALAR-NEXT: movl %edi, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ebx, 60(%rdx)
; SCALAR-NEXT: movl %r11d, 56(%rdx)
; SCALAR-NEXT: movl %r10d, 52(%rdx)
; SCALAR-NEXT: movl %r9d, 48(%rdx)
; SCALAR-NEXT: movl %r8d, 44(%rdx)
; SCALAR-NEXT: movl %ecx, 40(%rdx)
; SCALAR-NEXT: movl %edi, 36(%rdx)
; SCALAR-NEXT: movl %eax, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SSE2-LABEL: vec512_v8i32:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec512_v8i32:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec512_v8i32:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
store <8 x i32> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 0
store <8 x i32> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <8 x i32>, ptr %out.vec.ptr, i64 1
store <8 x i32> %in.subvec, ptr %out.subvec1.ptr, align 32
define void @vec512_v8f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v8f32:
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movl 28(%rdi), %ebx
; SCALAR-NEXT: movl 24(%rdi), %r11d
; SCALAR-NEXT: movl 20(%rdi), %r10d
; SCALAR-NEXT: movl 16(%rdi), %r9d
; SCALAR-NEXT: movl 12(%rdi), %r8d
; SCALAR-NEXT: movl 8(%rdi), %ecx
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: notl %r9d
; SCALAR-NEXT: notl %r10d
; SCALAR-NEXT: notl %r11d
; SCALAR-NEXT: notl %ebx
; SCALAR-NEXT: movl %ebx, 28(%rsi)
; SCALAR-NEXT: movl %r11d, 24(%rsi)
; SCALAR-NEXT: movl %r10d, 20(%rsi)
; SCALAR-NEXT: movl %r9d, 16(%rsi)
; SCALAR-NEXT: movl %r8d, 12(%rsi)
; SCALAR-NEXT: movl %ecx, 8(%rsi)
; SCALAR-NEXT: movl %edi, 4(%rsi)
; SCALAR-NEXT: movl %eax, (%rsi)
; SCALAR-NEXT: movl %ebx, 28(%rdx)
; SCALAR-NEXT: movl %r11d, 24(%rdx)
; SCALAR-NEXT: movl %r10d, 20(%rdx)
; SCALAR-NEXT: movl %r9d, 16(%rdx)
; SCALAR-NEXT: movl %r8d, 12(%rdx)
; SCALAR-NEXT: movl %ecx, 8(%rdx)
; SCALAR-NEXT: movl %edi, 4(%rdx)
; SCALAR-NEXT: movl %eax, (%rdx)
; SCALAR-NEXT: movl %ebx, 60(%rdx)
; SCALAR-NEXT: movl %r11d, 56(%rdx)
; SCALAR-NEXT: movl %r10d, 52(%rdx)
; SCALAR-NEXT: movl %r9d, 48(%rdx)
; SCALAR-NEXT: movl %r8d, 44(%rdx)
; SCALAR-NEXT: movl %ecx, 40(%rdx)
; SCALAR-NEXT: movl %edi, 36(%rdx)
; SCALAR-NEXT: movl %eax, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SSE2-LABEL: vec512_v8f32:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec512_v8f32:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec512_v8f32:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <8 x i32>, ptr %in.subvec.ptr, align 64
%in.subvec.int = xor <8 x i32> %in.subvec.not, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%in.subvec = bitcast <8 x i32> %in.subvec.int to <8 x float>
store <8 x float> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 0
store <8 x float> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <8 x float>, ptr %out.vec.ptr, i64 1
store <8 x float> %in.subvec, ptr %out.subvec1.ptr, align 32
define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v16i8:
; SCALAR-NEXT: pushq %rbp
; SCALAR-NEXT: pushq %r15
; SCALAR-NEXT: pushq %r14
; SCALAR-NEXT: pushq %r13
; SCALAR-NEXT: pushq %r12
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzbl 15(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 14(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 13(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 12(%rdi), %r10d
; SCALAR-NEXT: movzbl 11(%rdi), %r13d
; SCALAR-NEXT: movzbl 10(%rdi), %r12d
; SCALAR-NEXT: movzbl 9(%rdi), %r15d
; SCALAR-NEXT: movzbl 8(%rdi), %r14d
; SCALAR-NEXT: movzbl 7(%rdi), %ebp
; SCALAR-NEXT: movzbl 6(%rdi), %r11d
; SCALAR-NEXT: movzbl 5(%rdi), %ebx
; SCALAR-NEXT: movzbl 4(%rdi), %r9d
; SCALAR-NEXT: movzbl 3(%rdi), %r8d
; SCALAR-NEXT: movzbl 2(%rdi), %ecx
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %edi
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movl %ebx, %r9d
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: movl %r11d, %ebx
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: movb %r11b, 15(%rsi)
; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %r8b, 14(%rsi)
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movl %edi, %eax
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %dil, 13(%rsi)
; SCALAR-NEXT: movb %r10b, 12(%rsi)
; SCALAR-NEXT: movb %r13b, 11(%rsi)
; SCALAR-NEXT: movb %r12b, 10(%rsi)
; SCALAR-NEXT: movb %r15b, 9(%rsi)
; SCALAR-NEXT: movb %r14b, 8(%rsi)
; SCALAR-NEXT: movl %r14d, %r12d
; SCALAR-NEXT: movb %bpl, 7(%rsi)
; SCALAR-NEXT: movl %ebp, %r14d
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %bl, 6(%rsi)
; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %r9b, 5(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 4(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: movb %bpl, 3(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 2(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 1(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, (%rsi)
; SCALAR-NEXT: movb %r11b, 15(%rdx)
; SCALAR-NEXT: movb %r8b, 14(%rdx)
; SCALAR-NEXT: movb %al, 13(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movb %r13b, 11(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r15b, 10(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 9(%rdx)
; SCALAR-NEXT: movb %r12b, 8(%rdx)
; SCALAR-NEXT: movb %r14b, 7(%rdx)
; SCALAR-NEXT: movb %bl, 6(%rdx)
; SCALAR-NEXT: movb %r9b, 5(%rdx)
; SCALAR-NEXT: movl %r9d, %r11d
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r8b, 4(%rdx)
; SCALAR-NEXT: movb %bpl, 3(%rdx)
; SCALAR-NEXT: movb %dil, 2(%rdx)
; SCALAR-NEXT: movb %cl, 1(%rdx)
; SCALAR-NEXT: movl %ecx, %r14d
; SCALAR-NEXT: movl %r10d, %esi
; SCALAR-NEXT: movb %r10b, (%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 31(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 30(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 29(%rdx)
; SCALAR-NEXT: movb %al, 28(%rdx)
; SCALAR-NEXT: movl %eax, %r10d
; SCALAR-NEXT: movb %r13b, 27(%rdx)
; SCALAR-NEXT: movb %r15b, 26(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r15b, 25(%rdx)
; SCALAR-NEXT: movl %r12d, %ebp
; SCALAR-NEXT: movb %r12b, 24(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 23(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 22(%rdx)
; SCALAR-NEXT: movb %r11b, 21(%rdx)
; SCALAR-NEXT: movb %r8b, 20(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r8b, 19(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r8b, 18(%rdx)
; SCALAR-NEXT: movb %r14b, 17(%rdx)
; SCALAR-NEXT: movb %sil, 16(%rdx)
; SCALAR-NEXT: movl %esi, %r11d
; SCALAR-NEXT: movb %cl, 47(%rdx)
; SCALAR-NEXT: movb %r9b, 46(%rdx)
; SCALAR-NEXT: movb %dil, 45(%rdx)
; SCALAR-NEXT: movb %r10b, 44(%rdx)
; SCALAR-NEXT: movb %r13b, 43(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r12b, 42(%rdx)
; SCALAR-NEXT: movb %r15b, 41(%rdx)
; SCALAR-NEXT: movl %ebp, %r14d
; SCALAR-NEXT: movb %bpl, 40(%rdx)
; SCALAR-NEXT: movl %ebx, %ebp
; SCALAR-NEXT: movb %bl, 39(%rdx)
; SCALAR-NEXT: movl %eax, %ebx
; SCALAR-NEXT: movb %al, 38(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 37(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 36(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 35(%rdx)
; SCALAR-NEXT: movb %r8b, 34(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 33(%rdx)
; SCALAR-NEXT: movb %r11b, 32(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r11b, 63(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r11b, 62(%rdx)
; SCALAR-NEXT: movb %dil, 61(%rdx)
; SCALAR-NEXT: movb %r10b, 60(%rdx)
; SCALAR-NEXT: movb %r13b, 59(%rdx)
; SCALAR-NEXT: movb %r12b, 58(%rdx)
; SCALAR-NEXT: movb %r15b, 57(%rdx)
; SCALAR-NEXT: movb %r14b, 56(%rdx)
; SCALAR-NEXT: movb %bpl, 55(%rdx)
; SCALAR-NEXT: movb %bl, 54(%rdx)
; SCALAR-NEXT: movb %cl, 53(%rdx)
; SCALAR-NEXT: movb %al, 52(%rdx)
; SCALAR-NEXT: movb %sil, 51(%rdx)
; SCALAR-NEXT: movb %r8b, 50(%rdx)
; SCALAR-NEXT: movb %r9b, 49(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 48(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
; SCALAR-NEXT: popq %r14
; SCALAR-NEXT: popq %r15
; SCALAR-NEXT: popq %rbp
; SSE2-LABEL: vec512_v16i8:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm0, 16(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; SSE2-NEXT: movdqa %xmm0, 48(%rdx)
; AVX-LABEL: vec512_v16i8:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsi)
; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 48(%rdx)
%in.subvec.not = load <16 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <16 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
store <16 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 0
store <16 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 1
store <16 x i8> %in.subvec, ptr %out.subvec1.ptr, align 16
%out.subvec2.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 2
store <16 x i8> %in.subvec, ptr %out.subvec2.ptr, align 32
%out.subvec3.ptr = getelementptr <16 x i8>, ptr %out.vec.ptr, i64 3
store <16 x i8> %in.subvec, ptr %out.subvec3.ptr, align 16
define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v16i16:
; SCALAR-NEXT: pushq %rbp
; SCALAR-NEXT: pushq %r15
; SCALAR-NEXT: pushq %r14
; SCALAR-NEXT: pushq %r13
; SCALAR-NEXT: pushq %r12
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzwl 30(%rdi), %eax
; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movl 28(%rdi), %eax
; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movzwl 26(%rdi), %eax
; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movl 24(%rdi), %r13d
; SCALAR-NEXT: movzwl 22(%rdi), %r12d
; SCALAR-NEXT: movl 20(%rdi), %r15d
; SCALAR-NEXT: movzwl 18(%rdi), %r14d
; SCALAR-NEXT: movl 16(%rdi), %ebx
; SCALAR-NEXT: movzwl 14(%rdi), %r11d
; SCALAR-NEXT: movl 12(%rdi), %r10d
; SCALAR-NEXT: movzwl 10(%rdi), %r9d
; SCALAR-NEXT: movl 8(%rdi), %r8d
; SCALAR-NEXT: movzwl 6(%rdi), %ecx
; SCALAR-NEXT: movzwl 2(%rdi), %ebp
; SCALAR-NEXT: movl (%rdi), %eax
; SCALAR-NEXT: movl 4(%rdi), %edi
; SCALAR-NEXT: notl %eax
; SCALAR-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: notl %ebp
; SCALAR-NEXT: movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: notl %ecx
; SCALAR-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: notl %r9d
; SCALAR-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movl %r10d, %edi
; SCALAR-NEXT: notl %edi
; SCALAR-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: notl %r11d
; SCALAR-NEXT: movl %r11d, %r9d
; SCALAR-NEXT: notl %ebx
; SCALAR-NEXT: movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: notl %r14d
; SCALAR-NEXT: notl %r15d
; SCALAR-NEXT: notl %r12d
; SCALAR-NEXT: notl %r13d
; SCALAR-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
; SCALAR-NEXT: notl %r10d
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
; SCALAR-NEXT: notl %r11d
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
; SCALAR-NEXT: notl %r8d
; SCALAR-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movw %r8w, 30(%rsi)
; SCALAR-NEXT: movw %r11w, 28(%rsi)
; SCALAR-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movw %r10w, 26(%rsi)
; SCALAR-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SCALAR-NEXT: movw %r13w, 24(%rsi)
; SCALAR-NEXT: movw %r12w, 22(%rsi)
; SCALAR-NEXT: movw %r15w, 20(%rsi)
; SCALAR-NEXT: movw %r14w, 18(%rsi)
; SCALAR-NEXT: movw %bx, 16(%rsi)
; SCALAR-NEXT: movw %r9w, 14(%rsi)
; SCALAR-NEXT: movw %di, 12(%rsi)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Reload
; SCALAR-NEXT: movw %bp, 10(%rsi)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
; SCALAR-NEXT: movw %di, 8(%rsi)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
; SCALAR-NEXT: movw %cx, 6(%rsi)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
; SCALAR-NEXT: movw %r8w, 4(%rsi)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; SCALAR-NEXT: movw %ax, 2(%rsi)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
; SCALAR-NEXT: movw %bx, (%rsi)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
; SCALAR-NEXT: movw %r13w, 30(%rdx)
; SCALAR-NEXT: movw %r11w, 28(%rdx)
; SCALAR-NEXT: movw %r10w, 26(%rdx)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
; SCALAR-NEXT: movw %si, 24(%rdx)
; SCALAR-NEXT: movw %r12w, 22(%rdx)
; SCALAR-NEXT: movw %r15w, 20(%rdx)
; SCALAR-NEXT: movw %r14w, 18(%rdx)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
; SCALAR-NEXT: movw %r11w, 16(%rdx)
; SCALAR-NEXT: movw %r9w, 14(%rdx)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
; SCALAR-NEXT: movw %r10w, 12(%rdx)
; SCALAR-NEXT: movw %bp, 10(%rdx)
; SCALAR-NEXT: movw %di, 8(%rdx)
; SCALAR-NEXT: movw %cx, 6(%rdx)
; SCALAR-NEXT: movw %r8w, 4(%rdx)
; SCALAR-NEXT: movw %ax, 2(%rdx)
; SCALAR-NEXT: movl %ebx, %esi
; SCALAR-NEXT: movw %si, (%rdx)
; SCALAR-NEXT: movw %r13w, 62(%rdx)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
; SCALAR-NEXT: movw %bx, 60(%rdx)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
; SCALAR-NEXT: movw %bx, 58(%rdx)
; SCALAR-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
; SCALAR-NEXT: movw %bx, 56(%rdx)
; SCALAR-NEXT: movw %r12w, 54(%rdx)
; SCALAR-NEXT: movw %r15w, 52(%rdx)
; SCALAR-NEXT: movw %r14w, 50(%rdx)
; SCALAR-NEXT: movw %r11w, 48(%rdx)
; SCALAR-NEXT: movw %r9w, 46(%rdx)
; SCALAR-NEXT: movw %r10w, 44(%rdx)
; SCALAR-NEXT: movw %bp, 42(%rdx)
; SCALAR-NEXT: movw %di, 40(%rdx)
; SCALAR-NEXT: movw %cx, 38(%rdx)
; SCALAR-NEXT: movw %r8w, 36(%rdx)
; SCALAR-NEXT: movw %ax, 34(%rdx)
; SCALAR-NEXT: movw %si, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
; SCALAR-NEXT: popq %r14
; SCALAR-NEXT: popq %r15
; SCALAR-NEXT: popq %rbp
; SSE2-LABEL: vec512_v16i16:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec512_v16i16:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec512_v16i16:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <16 x i16>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <16 x i16> %in.subvec.not, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
store <16 x i16> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 0
store <16 x i16> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <16 x i16>, ptr %out.vec.ptr, i64 1
store <16 x i16> %in.subvec, ptr %out.subvec1.ptr, align 32
define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.ptr) nounwind {
; SCALAR-LABEL: vec512_v32i8:
; SCALAR-NEXT: pushq %rbp
; SCALAR-NEXT: pushq %r15
; SCALAR-NEXT: pushq %r14
; SCALAR-NEXT: pushq %r13
; SCALAR-NEXT: pushq %r12
; SCALAR-NEXT: pushq %rbx
; SCALAR-NEXT: movzbl 16(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 15(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 14(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 13(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 12(%rdi), %r13d
; SCALAR-NEXT: movzbl 11(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 10(%rdi), %r12d
; SCALAR-NEXT: movzbl 9(%rdi), %r15d
; SCALAR-NEXT: movzbl 8(%rdi), %r14d
; SCALAR-NEXT: movzbl 7(%rdi), %ebp
; SCALAR-NEXT: movzbl 6(%rdi), %ebx
; SCALAR-NEXT: movzbl 5(%rdi), %r11d
; SCALAR-NEXT: movzbl 4(%rdi), %r10d
; SCALAR-NEXT: movzbl 3(%rdi), %r9d
; SCALAR-NEXT: movzbl 2(%rdi), %r8d
; SCALAR-NEXT: movzbl (%rdi), %eax
; SCALAR-NEXT: movzbl 1(%rdi), %ecx
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r11b
; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r8b
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: movzbl 17(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 18(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 19(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 20(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 21(%rdi), %ebp
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 22(%rdi), %ebx
; SCALAR-NEXT: notb %bl
; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 23(%rdi), %r10d
; SCALAR-NEXT: notb %r10b
; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 24(%rdi), %r9d
; SCALAR-NEXT: notb %r9b
; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 25(%rdi), %ecx
; SCALAR-NEXT: notb %cl
; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 26(%rdi), %r14d
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 27(%rdi), %r15d
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 28(%rdi), %r12d
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 29(%rdi), %r13d
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 30(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 31(%rdi), %edi
; SCALAR-NEXT: notb %dil
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %dil, 31(%rsi)
; SCALAR-NEXT: movb %al, 30(%rsi)
; SCALAR-NEXT: movb %r13b, 29(%rsi)
; SCALAR-NEXT: movb %r12b, 28(%rsi)
; SCALAR-NEXT: movb %r15b, 27(%rsi)
; SCALAR-NEXT: movb %r14b, 26(%rsi)
; SCALAR-NEXT: movb %cl, 25(%rsi)
; SCALAR-NEXT: movb %r9b, 24(%rsi)
; SCALAR-NEXT: movb %r10b, 23(%rsi)
; SCALAR-NEXT: movb %bl, 22(%rsi)
; SCALAR-NEXT: movb %bpl, 21(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: movb %bpl, 20(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 19(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 18(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 17(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 16(%rsi)
; SCALAR-NEXT: movb %r8b, 15(%rsi)
; SCALAR-NEXT: movl %r8d, %r14d
; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 14(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 13(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 12(%rsi)
; SCALAR-NEXT: movb %r11b, 11(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 10(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 9(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, 8(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r11b, 7(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r13b, 6(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r10b, 5(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r12b, 4(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 3(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r15b, 2(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r8b, 1(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
; SCALAR-NEXT: movb %dil, (%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 31(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 30(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 29(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 28(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 27(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 26(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 25(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 24(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 23(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 22(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 21(%rdx)
; SCALAR-NEXT: movb %bpl, 20(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 19(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 18(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 17(%rdx)
; SCALAR-NEXT: movb %cl, 16(%rdx)
; SCALAR-NEXT: movb %r14b, 15(%rdx)
; SCALAR-NEXT: movb %bl, 14(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; SCALAR-NEXT: movb %cl, 13(%rdx)
; SCALAR-NEXT: movb %al, 12(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 11(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 10(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r14b, 9(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
; SCALAR-NEXT: movb %bpl, 8(%rdx)
; SCALAR-NEXT: movb %r11b, 7(%rdx)
; SCALAR-NEXT: movb %r13b, 6(%rdx)
; SCALAR-NEXT: movb %r10b, 5(%rdx)
; SCALAR-NEXT: movb %r12b, 4(%rdx)
; SCALAR-NEXT: movb %r9b, 3(%rdx)
; SCALAR-NEXT: movb %r15b, 2(%rdx)
; SCALAR-NEXT: movb %r8b, 1(%rdx)
; SCALAR-NEXT: movb %dil, (%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 63(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 62(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 61(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 60(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 59(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 58(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 57(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 56(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 55(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 54(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 53(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 52(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 51(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 50(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 49(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 48(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 47(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 46(%rdx)
; SCALAR-NEXT: movb %cl, 45(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 44(%rdx)
; SCALAR-NEXT: movb %sil, 43(%rdx)
; SCALAR-NEXT: movb %bl, 42(%rdx)
; SCALAR-NEXT: movb %r14b, 41(%rdx)
; SCALAR-NEXT: movb %bpl, 40(%rdx)
; SCALAR-NEXT: movb %r11b, 39(%rdx)
; SCALAR-NEXT: movb %r13b, 38(%rdx)
; SCALAR-NEXT: movb %r10b, 37(%rdx)
; SCALAR-NEXT: movb %r12b, 36(%rdx)
; SCALAR-NEXT: movb %r9b, 35(%rdx)
; SCALAR-NEXT: movb %r15b, 34(%rdx)
; SCALAR-NEXT: movb %r8b, 33(%rdx)
; SCALAR-NEXT: movb %dil, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
; SCALAR-NEXT: popq %r14
; SCALAR-NEXT: popq %r15
; SCALAR-NEXT: popq %rbp
; SSE2-LABEL: vec512_v32i8:
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: movdqa 16(%rdi), %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: pxor (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, (%rsi)
; SSE2-NEXT: movdqa %xmm1, 16(%rsi)
; SSE2-NEXT: movdqa %xmm0, (%rdx)
; SSE2-NEXT: movdqa %xmm1, 16(%rdx)
; SSE2-NEXT: movdqa %xmm1, 48(%rdx)
; SSE2-NEXT: movdqa %xmm0, 32(%rdx)
; AVX1-LABEL: vec512_v32i8:
; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vxorps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX2-LABEL: vec512_v32i8:
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpxor (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
%in.subvec.not = load <32 x i8>, ptr %in.subvec.ptr, align 64
%in.subvec = xor <32 x i8> %in.subvec.not, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
store <32 x i8> %in.subvec, ptr %out.subvec.ptr, align 64
%out.subvec0.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 0
store <32 x i8> %in.subvec, ptr %out.subvec0.ptr, align 64
%out.subvec1.ptr = getelementptr <32 x i8>, ptr %out.vec.ptr, i64 1
store <32 x i8> %in.subvec, ptr %out.subvec1.ptr, align 32
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: