1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
11 ; Make sure we don't crash with avx512bw and xop
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
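; Expected lowering, as asserted by the checks below:
; * Scalar types first reverse their bytes (rolb $4 swaps the two nibbles of an i8,
;   rolw $8 / bswap reverse the bytes of i16/i32/i64) and then swap progressively
;   smaller bit groups with mask-and-shift pairs built from the 0x0F0F..., 0x3333...
;   and 0x5555... constants, i.e. x = ((x & M) << s) | ((x & ~M) >> s) at each step.
; * SSE2 vectors use the same shift/mask sequence on bytes, SSSE3/AVX use pshufb
;   nibble lookup tables, and XOP reverses each byte's bits with a single vpperm.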
14 define i8 @test_bitreverse_i8(i8 %a) nounwind {
15 ; SSE-LABEL: test_bitreverse_i8:
17 ; SSE-NEXT: # kill: def $edi killed $edi def $rdi
18 ; SSE-NEXT: rolb $4, %dil
19 ; SSE-NEXT: movl %edi, %eax
20 ; SSE-NEXT: andb $51, %al
21 ; SSE-NEXT: shlb $2, %al
22 ; SSE-NEXT: andb $-52, %dil
23 ; SSE-NEXT: shrb $2, %dil
24 ; SSE-NEXT: orb %al, %dil
25 ; SSE-NEXT: movl %edi, %eax
26 ; SSE-NEXT: andb $85, %al
27 ; SSE-NEXT: addb %al, %al
28 ; SSE-NEXT: andb $-86, %dil
29 ; SSE-NEXT: shrb %dil
30 ; SSE-NEXT: addl %edi, %eax
31 ; SSE-NEXT: # kill: def $al killed $al killed $eax
34 ; AVX-LABEL: test_bitreverse_i8:
36 ; AVX-NEXT: # kill: def $edi killed $edi def $rdi
37 ; AVX-NEXT: rolb $4, %dil
38 ; AVX-NEXT: movl %edi, %eax
39 ; AVX-NEXT: andb $51, %al
40 ; AVX-NEXT: shlb $2, %al
41 ; AVX-NEXT: andb $-52, %dil
42 ; AVX-NEXT: shrb $2, %dil
43 ; AVX-NEXT: orb %al, %dil
44 ; AVX-NEXT: movl %edi, %eax
45 ; AVX-NEXT: andb $85, %al
46 ; AVX-NEXT: addb %al, %al
47 ; AVX-NEXT: andb $-86, %dil
48 ; AVX-NEXT: shrb %dil
49 ; AVX-NEXT: addl %edi, %eax
50 ; AVX-NEXT: # kill: def $al killed $al killed $eax
53 ; XOP-LABEL: test_bitreverse_i8:
55 ; XOP-NEXT: vmovd %edi, %xmm0
56 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
57 ; XOP-NEXT: vpextrb $0, %xmm0, %eax
58 ; XOP-NEXT: # kill: def $al killed $al killed $eax
60 %b = call i8 @llvm.bitreverse.i8(i8 %a)
61 ret i8 %b
62 }
64 define i16 @test_bitreverse_i16(i16 %a) nounwind {
65 ; SSE-LABEL: test_bitreverse_i16:
67 ; SSE-NEXT: # kill: def $edi killed $edi def $rdi
68 ; SSE-NEXT: rolw $8, %di
69 ; SSE-NEXT: movl %edi, %eax
70 ; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
71 ; SSE-NEXT: shll $4, %eax
72 ; SSE-NEXT: andl $61680, %edi # imm = 0xF0F0
73 ; SSE-NEXT: shrl $4, %edi
74 ; SSE-NEXT: orl %eax, %edi
75 ; SSE-NEXT: movl %edi, %eax
76 ; SSE-NEXT: andl $13107, %eax # imm = 0x3333
77 ; SSE-NEXT: andl $52428, %edi # imm = 0xCCCC
78 ; SSE-NEXT: shrl $2, %edi
79 ; SSE-NEXT: leal (%rdi,%rax,4), %eax
80 ; SSE-NEXT: movl %eax, %ecx
81 ; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
82 ; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA
84 ; SSE-NEXT: leal (%rax,%rcx,2), %eax
85 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
88 ; AVX-LABEL: test_bitreverse_i16:
90 ; AVX-NEXT: # kill: def $edi killed $edi def $rdi
91 ; AVX-NEXT: rolw $8, %di
92 ; AVX-NEXT: movl %edi, %eax
93 ; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
94 ; AVX-NEXT: shll $4, %eax
95 ; AVX-NEXT: andl $61680, %edi # imm = 0xF0F0
96 ; AVX-NEXT: shrl $4, %edi
97 ; AVX-NEXT: orl %eax, %edi
98 ; AVX-NEXT: movl %edi, %eax
99 ; AVX-NEXT: andl $13107, %eax # imm = 0x3333
100 ; AVX-NEXT: andl $52428, %edi # imm = 0xCCCC
101 ; AVX-NEXT: shrl $2, %edi
102 ; AVX-NEXT: leal (%rdi,%rax,4), %eax
103 ; AVX-NEXT: movl %eax, %ecx
104 ; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
105 ; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA
106 ; AVX-NEXT: shrl %eax
107 ; AVX-NEXT: leal (%rax,%rcx,2), %eax
108 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
111 ; XOP-LABEL: test_bitreverse_i16:
113 ; XOP-NEXT: vmovd %edi, %xmm0
114 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
115 ; XOP-NEXT: vmovd %xmm0, %eax
116 ; XOP-NEXT: # kill: def $ax killed $ax killed $eax
118 %b = call i16 @llvm.bitreverse.i16(i16 %a)
119 ret i16 %b
120 }
122 define i32 @test_bitreverse_i32(i32 %a) nounwind {
123 ; SSE-LABEL: test_bitreverse_i32:
125 ; SSE-NEXT: # kill: def $edi killed $edi def $rdi
126 ; SSE-NEXT: bswapl %edi
127 ; SSE-NEXT: movl %edi, %eax
128 ; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
129 ; SSE-NEXT: shll $4, %eax
130 ; SSE-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
131 ; SSE-NEXT: shrl $4, %edi
132 ; SSE-NEXT: orl %eax, %edi
133 ; SSE-NEXT: movl %edi, %eax
134 ; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
135 ; SSE-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
136 ; SSE-NEXT: shrl $2, %edi
137 ; SSE-NEXT: leal (%rdi,%rax,4), %eax
138 ; SSE-NEXT: movl %eax, %ecx
139 ; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
140 ; SSE-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
141 ; SSE-NEXT: shrl %eax
142 ; SSE-NEXT: leal (%rax,%rcx,2), %eax
145 ; AVX-LABEL: test_bitreverse_i32:
147 ; AVX-NEXT: # kill: def $edi killed $edi def $rdi
148 ; AVX-NEXT: bswapl %edi
149 ; AVX-NEXT: movl %edi, %eax
150 ; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
151 ; AVX-NEXT: shll $4, %eax
152 ; AVX-NEXT: andl $-252645136, %edi # imm = 0xF0F0F0F0
153 ; AVX-NEXT: shrl $4, %edi
154 ; AVX-NEXT: orl %eax, %edi
155 ; AVX-NEXT: movl %edi, %eax
156 ; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
157 ; AVX-NEXT: andl $-858993460, %edi # imm = 0xCCCCCCCC
158 ; AVX-NEXT: shrl $2, %edi
159 ; AVX-NEXT: leal (%rdi,%rax,4), %eax
160 ; AVX-NEXT: movl %eax, %ecx
161 ; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
162 ; AVX-NEXT: andl $-1431655766, %eax # imm = 0xAAAAAAAA
163 ; AVX-NEXT: shrl %eax
164 ; AVX-NEXT: leal (%rax,%rcx,2), %eax
167 ; XOP-LABEL: test_bitreverse_i32:
169 ; XOP-NEXT: vmovd %edi, %xmm0
170 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
171 ; XOP-NEXT: vmovd %xmm0, %eax
173 %b = call i32 @llvm.bitreverse.i32(i32 %a)
174 ret i32 %b
175 }
177 define i64 @test_bitreverse_i64(i64 %a) nounwind {
178 ; SSE-LABEL: test_bitreverse_i64:
180 ; SSE-NEXT: bswapq %rdi
181 ; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
182 ; SSE-NEXT: andq %rdi, %rax
183 ; SSE-NEXT: shlq $4, %rax
184 ; SSE-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
185 ; SSE-NEXT: andq %rdi, %rcx
186 ; SSE-NEXT: shrq $4, %rcx
187 ; SSE-NEXT: orq %rax, %rcx
188 ; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
189 ; SSE-NEXT: andq %rcx, %rax
190 ; SSE-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
191 ; SSE-NEXT: andq %rcx, %rdx
192 ; SSE-NEXT: shrq $2, %rdx
193 ; SSE-NEXT: leaq (%rdx,%rax,4), %rax
194 ; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
195 ; SSE-NEXT: andq %rax, %rcx
196 ; SSE-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
197 ; SSE-NEXT: andq %rax, %rdx
198 ; SSE-NEXT: shrq %rdx
199 ; SSE-NEXT: leaq (%rdx,%rcx,2), %rax
202 ; AVX-LABEL: test_bitreverse_i64:
204 ; AVX-NEXT: bswapq %rdi
205 ; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
206 ; AVX-NEXT: andq %rdi, %rax
207 ; AVX-NEXT: shlq $4, %rax
208 ; AVX-NEXT: movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
209 ; AVX-NEXT: andq %rdi, %rcx
210 ; AVX-NEXT: shrq $4, %rcx
211 ; AVX-NEXT: orq %rax, %rcx
212 ; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
213 ; AVX-NEXT: andq %rcx, %rax
214 ; AVX-NEXT: movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
215 ; AVX-NEXT: andq %rcx, %rdx
216 ; AVX-NEXT: shrq $2, %rdx
217 ; AVX-NEXT: leaq (%rdx,%rax,4), %rax
218 ; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
219 ; AVX-NEXT: andq %rax, %rcx
220 ; AVX-NEXT: movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
221 ; AVX-NEXT: andq %rax, %rdx
222 ; AVX-NEXT: shrq %rdx
223 ; AVX-NEXT: leaq (%rdx,%rcx,2), %rax
226 ; XOP-LABEL: test_bitreverse_i64:
228 ; XOP-NEXT: vmovq %rdi, %xmm0
229 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
230 ; XOP-NEXT: vmovq %xmm0, %rax
232 %b = call i64 @llvm.bitreverse.i64(i64 %a)
233 ret i64 %b
234 }
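; 128-bit vector tests. SSE2 has no byte shuffle wide enough for the per-byte work, so
; it falls back to psllw/psrlw with byte masks (plus punpck/pshuflw/pshufhw byte reversal
; for the wider element types). SSSE3 and AVX split each byte into nibbles and run them
; through two 16-entry pshufb lookup tables ([0,128,64,...] and [0,8,4,...]); XOP does
; the whole per-byte reversal with one vpperm.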
236 define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
237 ; SSE2-LABEL: test_bitreverse_v16i8:
239 ; SSE2-NEXT: movdqa %xmm0, %xmm1
240 ; SSE2-NEXT: psllw $4, %xmm1
241 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
242 ; SSE2-NEXT: psrlw $4, %xmm0
243 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
244 ; SSE2-NEXT: por %xmm1, %xmm0
245 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
246 ; SSE2-NEXT: pand %xmm0, %xmm1
247 ; SSE2-NEXT: psllw $2, %xmm1
248 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
249 ; SSE2-NEXT: psrlw $2, %xmm0
250 ; SSE2-NEXT: por %xmm1, %xmm0
251 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
252 ; SSE2-NEXT: pand %xmm0, %xmm1
253 ; SSE2-NEXT: paddb %xmm1, %xmm1
254 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
255 ; SSE2-NEXT: psrlw $1, %xmm0
256 ; SSE2-NEXT: por %xmm1, %xmm0
259 ; SSSE3-LABEL: test_bitreverse_v16i8:
261 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
262 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
263 ; SSSE3-NEXT: pand %xmm1, %xmm2
264 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
265 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
266 ; SSSE3-NEXT: psrlw $4, %xmm0
267 ; SSSE3-NEXT: pand %xmm1, %xmm0
268 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
269 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
270 ; SSSE3-NEXT: por %xmm3, %xmm1
271 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
274 ; AVX-LABEL: test_bitreverse_v16i8:
276 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
277 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
278 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
279 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
280 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
281 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
282 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
283 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
284 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
287 ; XOP-LABEL: test_bitreverse_v16i8:
289 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
291 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
292 ret <16 x i8> %b
293 }
295 define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
296 ; SSE2-LABEL: test_bitreverse_v8i16:
298 ; SSE2-NEXT: pxor %xmm1, %xmm1
299 ; SSE2-NEXT: movdqa %xmm0, %xmm2
300 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
301 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
302 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
303 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
304 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
305 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
306 ; SSE2-NEXT: packuswb %xmm2, %xmm0
307 ; SSE2-NEXT: movdqa %xmm0, %xmm1
308 ; SSE2-NEXT: psllw $4, %xmm1
309 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
310 ; SSE2-NEXT: psrlw $4, %xmm0
311 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
312 ; SSE2-NEXT: por %xmm1, %xmm0
313 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
314 ; SSE2-NEXT: pand %xmm0, %xmm1
315 ; SSE2-NEXT: psllw $2, %xmm1
316 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
317 ; SSE2-NEXT: psrlw $2, %xmm0
318 ; SSE2-NEXT: por %xmm1, %xmm0
319 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
320 ; SSE2-NEXT: pand %xmm0, %xmm1
321 ; SSE2-NEXT: paddb %xmm1, %xmm1
322 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
323 ; SSE2-NEXT: psrlw $1, %xmm0
324 ; SSE2-NEXT: por %xmm1, %xmm0
327 ; SSSE3-LABEL: test_bitreverse_v8i16:
329 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
330 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
331 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
332 ; SSSE3-NEXT: pand %xmm1, %xmm2
333 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
334 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
335 ; SSSE3-NEXT: psrlw $4, %xmm0
336 ; SSSE3-NEXT: pand %xmm1, %xmm0
337 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
338 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
339 ; SSSE3-NEXT: por %xmm3, %xmm1
340 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
343 ; AVX-LABEL: test_bitreverse_v8i16:
345 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
346 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
347 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
348 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
349 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
350 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
351 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
352 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
353 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
354 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
357 ; XOP-LABEL: test_bitreverse_v8i16:
359 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
361 %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
362 ret <8 x i16> %b
363 }
365 define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
366 ; SSE2-LABEL: test_bitreverse_v4i32:
368 ; SSE2-NEXT: pxor %xmm1, %xmm1
369 ; SSE2-NEXT: movdqa %xmm0, %xmm2
370 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
371 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
372 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
373 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
374 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
375 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
376 ; SSE2-NEXT: packuswb %xmm2, %xmm0
377 ; SSE2-NEXT: movdqa %xmm0, %xmm1
378 ; SSE2-NEXT: psllw $4, %xmm1
379 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
380 ; SSE2-NEXT: psrlw $4, %xmm0
381 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
382 ; SSE2-NEXT: por %xmm1, %xmm0
383 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
384 ; SSE2-NEXT: pand %xmm0, %xmm1
385 ; SSE2-NEXT: psllw $2, %xmm1
386 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
387 ; SSE2-NEXT: psrlw $2, %xmm0
388 ; SSE2-NEXT: por %xmm1, %xmm0
389 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
390 ; SSE2-NEXT: pand %xmm0, %xmm1
391 ; SSE2-NEXT: paddb %xmm1, %xmm1
392 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
393 ; SSE2-NEXT: psrlw $1, %xmm0
394 ; SSE2-NEXT: por %xmm1, %xmm0
397 ; SSSE3-LABEL: test_bitreverse_v4i32:
399 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
400 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
401 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
402 ; SSSE3-NEXT: pand %xmm1, %xmm2
403 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
404 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
405 ; SSSE3-NEXT: psrlw $4, %xmm0
406 ; SSSE3-NEXT: pand %xmm1, %xmm0
407 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
408 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
409 ; SSSE3-NEXT: por %xmm3, %xmm1
410 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
413 ; AVX-LABEL: test_bitreverse_v4i32:
415 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
416 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
417 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
418 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
419 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
420 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
421 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
422 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
423 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
424 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
427 ; XOP-LABEL: test_bitreverse_v4i32:
429 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
431 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
432 ret <4 x i32> %b
433 }
435 define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
436 ; SSE2-LABEL: test_bitreverse_v2i64:
438 ; SSE2-NEXT: pxor %xmm1, %xmm1
439 ; SSE2-NEXT: movdqa %xmm0, %xmm2
440 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
441 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
442 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
443 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
444 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
445 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
446 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
447 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
448 ; SSE2-NEXT: packuswb %xmm2, %xmm0
449 ; SSE2-NEXT: movdqa %xmm0, %xmm1
450 ; SSE2-NEXT: psllw $4, %xmm1
451 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
452 ; SSE2-NEXT: psrlw $4, %xmm0
453 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
454 ; SSE2-NEXT: por %xmm1, %xmm0
455 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
456 ; SSE2-NEXT: pand %xmm0, %xmm1
457 ; SSE2-NEXT: psllw $2, %xmm1
458 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
459 ; SSE2-NEXT: psrlw $2, %xmm0
460 ; SSE2-NEXT: por %xmm1, %xmm0
461 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
462 ; SSE2-NEXT: pand %xmm0, %xmm1
463 ; SSE2-NEXT: paddb %xmm1, %xmm1
464 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
465 ; SSE2-NEXT: psrlw $1, %xmm0
466 ; SSE2-NEXT: por %xmm1, %xmm0
469 ; SSSE3-LABEL: test_bitreverse_v2i64:
471 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
472 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
473 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
474 ; SSSE3-NEXT: pand %xmm1, %xmm2
475 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
476 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
477 ; SSSE3-NEXT: psrlw $4, %xmm0
478 ; SSSE3-NEXT: pand %xmm1, %xmm0
479 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
480 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
481 ; SSSE3-NEXT: por %xmm3, %xmm1
482 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
485 ; AVX-LABEL: test_bitreverse_v2i64:
487 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
488 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
489 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
490 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
491 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
492 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
493 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
494 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
495 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
496 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
499 ; XOP-LABEL: test_bitreverse_v2i64:
501 ; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
503 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
504 ret <2 x i64> %b
505 }
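; 256-bit vector tests. AVX1 and XOP only have 128-bit integer operations, so the checks
; expect the ymm input to be split with vextractf128/vextracti128, each half handled as
; above, and the result reassembled with vinsertf128/vinserti128; AVX2 and AVX512 keep
; the value in one ymm register and use 32-byte pshufb lookup tables.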
507 define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
508 ; SSE2-LABEL: test_bitreverse_v32i8:
510 ; SSE2-NEXT: movdqa %xmm1, %xmm2
511 ; SSE2-NEXT: movdqa %xmm0, %xmm3
512 ; SSE2-NEXT: psllw $4, %xmm3
513 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
514 ; SSE2-NEXT: movdqa %xmm1, %xmm4
515 ; SSE2-NEXT: pandn %xmm3, %xmm4
516 ; SSE2-NEXT: psrlw $4, %xmm0
517 ; SSE2-NEXT: pand %xmm1, %xmm0
518 ; SSE2-NEXT: por %xmm4, %xmm0
519 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
520 ; SSE2-NEXT: movdqa %xmm0, %xmm4
521 ; SSE2-NEXT: pand %xmm3, %xmm4
522 ; SSE2-NEXT: psllw $2, %xmm4
523 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
524 ; SSE2-NEXT: pand %xmm5, %xmm0
525 ; SSE2-NEXT: psrlw $2, %xmm0
526 ; SSE2-NEXT: por %xmm4, %xmm0
527 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
528 ; SSE2-NEXT: movdqa %xmm0, %xmm6
529 ; SSE2-NEXT: pand %xmm4, %xmm6
530 ; SSE2-NEXT: paddb %xmm6, %xmm6
531 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
532 ; SSE2-NEXT: pand %xmm7, %xmm0
533 ; SSE2-NEXT: psrlw $1, %xmm0
534 ; SSE2-NEXT: por %xmm6, %xmm0
535 ; SSE2-NEXT: movdqa %xmm2, %xmm6
536 ; SSE2-NEXT: psllw $4, %xmm6
537 ; SSE2-NEXT: psrlw $4, %xmm2
538 ; SSE2-NEXT: pand %xmm1, %xmm2
539 ; SSE2-NEXT: pandn %xmm6, %xmm1
540 ; SSE2-NEXT: por %xmm2, %xmm1
541 ; SSE2-NEXT: pand %xmm1, %xmm3
542 ; SSE2-NEXT: psllw $2, %xmm3
543 ; SSE2-NEXT: pand %xmm5, %xmm1
544 ; SSE2-NEXT: psrlw $2, %xmm1
545 ; SSE2-NEXT: por %xmm3, %xmm1
546 ; SSE2-NEXT: pand %xmm1, %xmm4
547 ; SSE2-NEXT: paddb %xmm4, %xmm4
548 ; SSE2-NEXT: pand %xmm7, %xmm1
549 ; SSE2-NEXT: psrlw $1, %xmm1
550 ; SSE2-NEXT: por %xmm4, %xmm1
553 ; SSSE3-LABEL: test_bitreverse_v32i8:
555 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
556 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
557 ; SSSE3-NEXT: pand %xmm4, %xmm2
558 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
559 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
560 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
561 ; SSSE3-NEXT: psrlw $4, %xmm0
562 ; SSSE3-NEXT: pand %xmm4, %xmm0
563 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
564 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
565 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
566 ; SSSE3-NEXT: por %xmm6, %xmm3
567 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
568 ; SSSE3-NEXT: pand %xmm4, %xmm0
569 ; SSSE3-NEXT: pshufb %xmm0, %xmm5
570 ; SSSE3-NEXT: psrlw $4, %xmm1
571 ; SSSE3-NEXT: pand %xmm4, %xmm1
572 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
573 ; SSSE3-NEXT: por %xmm5, %xmm2
574 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
575 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
578 ; AVX1-LABEL: test_bitreverse_v32i8:
580 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
581 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
582 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
583 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
584 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
585 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
586 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
587 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
588 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
589 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
590 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
591 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
592 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
593 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
594 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
595 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
596 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
599 ; AVX2-LABEL: test_bitreverse_v32i8:
601 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
602 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
603 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
604 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
605 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
606 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
607 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
608 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
609 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
612 ; AVX512-LABEL: test_bitreverse_v32i8:
614 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
615 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
616 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
617 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
618 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
619 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
620 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
621 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
622 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
625 ; XOPAVX1-LABEL: test_bitreverse_v32i8:
627 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
628 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
629 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
630 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
631 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
634 ; XOPAVX2-LABEL: test_bitreverse_v32i8:
636 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
637 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
638 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
639 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
640 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
642 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
643 ret <32 x i8> %b
644 }
646 define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
647 ; SSE2-LABEL: test_bitreverse_v16i16:
649 ; SSE2-NEXT: movdqa %xmm1, %xmm2
650 ; SSE2-NEXT: pxor %xmm4, %xmm4
651 ; SSE2-NEXT: movdqa %xmm0, %xmm1
652 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
653 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
654 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
655 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
656 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
657 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
658 ; SSE2-NEXT: packuswb %xmm1, %xmm0
659 ; SSE2-NEXT: movdqa %xmm0, %xmm3
660 ; SSE2-NEXT: psllw $4, %xmm3
661 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
662 ; SSE2-NEXT: movdqa %xmm1, %xmm5
663 ; SSE2-NEXT: pandn %xmm3, %xmm5
664 ; SSE2-NEXT: psrlw $4, %xmm0
665 ; SSE2-NEXT: pand %xmm1, %xmm0
666 ; SSE2-NEXT: por %xmm5, %xmm0
667 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
668 ; SSE2-NEXT: movdqa %xmm0, %xmm5
669 ; SSE2-NEXT: pand %xmm3, %xmm5
670 ; SSE2-NEXT: psllw $2, %xmm5
671 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
672 ; SSE2-NEXT: pand %xmm8, %xmm0
673 ; SSE2-NEXT: psrlw $2, %xmm0
674 ; SSE2-NEXT: por %xmm5, %xmm0
675 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
676 ; SSE2-NEXT: movdqa %xmm0, %xmm6
677 ; SSE2-NEXT: pand %xmm5, %xmm6
678 ; SSE2-NEXT: paddb %xmm6, %xmm6
679 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
680 ; SSE2-NEXT: pand %xmm7, %xmm0
681 ; SSE2-NEXT: psrlw $1, %xmm0
682 ; SSE2-NEXT: por %xmm6, %xmm0
683 ; SSE2-NEXT: movdqa %xmm2, %xmm6
684 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
685 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
686 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
687 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
688 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
689 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
690 ; SSE2-NEXT: packuswb %xmm6, %xmm2
691 ; SSE2-NEXT: movdqa %xmm2, %xmm4
692 ; SSE2-NEXT: psllw $4, %xmm4
693 ; SSE2-NEXT: psrlw $4, %xmm2
694 ; SSE2-NEXT: pand %xmm1, %xmm2
695 ; SSE2-NEXT: pandn %xmm4, %xmm1
696 ; SSE2-NEXT: por %xmm2, %xmm1
697 ; SSE2-NEXT: pand %xmm1, %xmm3
698 ; SSE2-NEXT: psllw $2, %xmm3
699 ; SSE2-NEXT: pand %xmm8, %xmm1
700 ; SSE2-NEXT: psrlw $2, %xmm1
701 ; SSE2-NEXT: por %xmm3, %xmm1
702 ; SSE2-NEXT: pand %xmm1, %xmm5
703 ; SSE2-NEXT: paddb %xmm5, %xmm5
704 ; SSE2-NEXT: pand %xmm7, %xmm1
705 ; SSE2-NEXT: psrlw $1, %xmm1
706 ; SSE2-NEXT: por %xmm5, %xmm1
709 ; SSSE3-LABEL: test_bitreverse_v16i16:
711 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
712 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
713 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
714 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
715 ; SSSE3-NEXT: pand %xmm5, %xmm2
716 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
717 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
718 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
719 ; SSSE3-NEXT: psrlw $4, %xmm0
720 ; SSSE3-NEXT: pand %xmm5, %xmm0
721 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
722 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
723 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
724 ; SSSE3-NEXT: por %xmm7, %xmm3
725 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
726 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
727 ; SSSE3-NEXT: pand %xmm5, %xmm0
728 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
729 ; SSSE3-NEXT: psrlw $4, %xmm1
730 ; SSSE3-NEXT: pand %xmm5, %xmm1
731 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
732 ; SSSE3-NEXT: por %xmm6, %xmm2
733 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
734 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
737 ; AVX1-LABEL: test_bitreverse_v16i16:
739 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
740 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
741 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
742 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
743 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
744 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
745 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
746 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
747 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
748 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
749 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
750 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
751 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
752 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
753 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
754 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
755 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
756 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
757 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
758 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
761 ; AVX2-LABEL: test_bitreverse_v16i16:
763 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
764 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
765 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
766 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
767 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
768 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
769 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
770 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
771 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
772 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
775 ; AVX512-LABEL: test_bitreverse_v16i16:
777 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
778 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
779 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
780 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
781 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
782 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
783 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
784 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
785 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
786 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
789 ; XOPAVX1-LABEL: test_bitreverse_v16i16:
791 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
792 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
793 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
794 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
795 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
798 ; XOPAVX2-LABEL: test_bitreverse_v16i16:
800 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
801 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
802 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
803 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
804 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
806 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
807 ret <16 x i16> %b
808 }
810 define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
811 ; SSE2-LABEL: test_bitreverse_v8i32:
813 ; SSE2-NEXT: movdqa %xmm1, %xmm2
814 ; SSE2-NEXT: pxor %xmm4, %xmm4
815 ; SSE2-NEXT: movdqa %xmm0, %xmm1
816 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
817 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
818 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
819 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
820 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
821 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
822 ; SSE2-NEXT: packuswb %xmm1, %xmm0
823 ; SSE2-NEXT: movdqa %xmm0, %xmm3
824 ; SSE2-NEXT: psllw $4, %xmm3
825 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
826 ; SSE2-NEXT: movdqa %xmm1, %xmm5
827 ; SSE2-NEXT: pandn %xmm3, %xmm5
828 ; SSE2-NEXT: psrlw $4, %xmm0
829 ; SSE2-NEXT: pand %xmm1, %xmm0
830 ; SSE2-NEXT: por %xmm5, %xmm0
831 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
832 ; SSE2-NEXT: movdqa %xmm0, %xmm5
833 ; SSE2-NEXT: pand %xmm3, %xmm5
834 ; SSE2-NEXT: psllw $2, %xmm5
835 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
836 ; SSE2-NEXT: pand %xmm8, %xmm0
837 ; SSE2-NEXT: psrlw $2, %xmm0
838 ; SSE2-NEXT: por %xmm5, %xmm0
839 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
840 ; SSE2-NEXT: movdqa %xmm0, %xmm6
841 ; SSE2-NEXT: pand %xmm5, %xmm6
842 ; SSE2-NEXT: paddb %xmm6, %xmm6
843 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
844 ; SSE2-NEXT: pand %xmm7, %xmm0
845 ; SSE2-NEXT: psrlw $1, %xmm0
846 ; SSE2-NEXT: por %xmm6, %xmm0
847 ; SSE2-NEXT: movdqa %xmm2, %xmm6
848 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
849 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
850 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
851 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
852 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
853 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
854 ; SSE2-NEXT: packuswb %xmm6, %xmm2
855 ; SSE2-NEXT: movdqa %xmm2, %xmm4
856 ; SSE2-NEXT: psllw $4, %xmm4
857 ; SSE2-NEXT: psrlw $4, %xmm2
858 ; SSE2-NEXT: pand %xmm1, %xmm2
859 ; SSE2-NEXT: pandn %xmm4, %xmm1
860 ; SSE2-NEXT: por %xmm2, %xmm1
861 ; SSE2-NEXT: pand %xmm1, %xmm3
862 ; SSE2-NEXT: psllw $2, %xmm3
863 ; SSE2-NEXT: pand %xmm8, %xmm1
864 ; SSE2-NEXT: psrlw $2, %xmm1
865 ; SSE2-NEXT: por %xmm3, %xmm1
866 ; SSE2-NEXT: pand %xmm1, %xmm5
867 ; SSE2-NEXT: paddb %xmm5, %xmm5
868 ; SSE2-NEXT: pand %xmm7, %xmm1
869 ; SSE2-NEXT: psrlw $1, %xmm1
870 ; SSE2-NEXT: por %xmm5, %xmm1
873 ; SSSE3-LABEL: test_bitreverse_v8i32:
875 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
876 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
877 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
878 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
879 ; SSSE3-NEXT: pand %xmm5, %xmm2
880 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
881 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
882 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
883 ; SSSE3-NEXT: psrlw $4, %xmm0
884 ; SSSE3-NEXT: pand %xmm5, %xmm0
885 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
886 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
887 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
888 ; SSSE3-NEXT: por %xmm7, %xmm3
889 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
890 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
891 ; SSSE3-NEXT: pand %xmm5, %xmm0
892 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
893 ; SSSE3-NEXT: psrlw $4, %xmm1
894 ; SSSE3-NEXT: pand %xmm5, %xmm1
895 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
896 ; SSSE3-NEXT: por %xmm6, %xmm2
897 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
898 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
901 ; AVX1-LABEL: test_bitreverse_v8i32:
903 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
904 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
905 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
906 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
907 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
908 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
909 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
910 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
911 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
912 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
913 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
914 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
915 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
916 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
917 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
918 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
919 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
920 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
921 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
922 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
925 ; AVX2-LABEL: test_bitreverse_v8i32:
927 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
928 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
929 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
930 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
931 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
932 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
933 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
934 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
935 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
936 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
939 ; AVX512-LABEL: test_bitreverse_v8i32:
941 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
942 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
943 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
944 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
945 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
946 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
947 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
948 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
949 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
950 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
953 ; XOPAVX1-LABEL: test_bitreverse_v8i32:
955 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
956 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
957 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
958 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
959 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
962 ; XOPAVX2-LABEL: test_bitreverse_v8i32:
964 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
965 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
966 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
967 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
968 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
970 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
971 ret <8 x i32> %b
972 }
974 define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
975 ; SSE2-LABEL: test_bitreverse_v4i64:
977 ; SSE2-NEXT: movdqa %xmm1, %xmm2
978 ; SSE2-NEXT: pxor %xmm4, %xmm4
979 ; SSE2-NEXT: movdqa %xmm0, %xmm1
980 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
981 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
982 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
983 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
984 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
985 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
986 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
987 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
988 ; SSE2-NEXT: packuswb %xmm1, %xmm0
989 ; SSE2-NEXT: movdqa %xmm0, %xmm3
990 ; SSE2-NEXT: psllw $4, %xmm3
991 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
992 ; SSE2-NEXT: movdqa %xmm1, %xmm5
993 ; SSE2-NEXT: pandn %xmm3, %xmm5
994 ; SSE2-NEXT: psrlw $4, %xmm0
995 ; SSE2-NEXT: pand %xmm1, %xmm0
996 ; SSE2-NEXT: por %xmm5, %xmm0
997 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
998 ; SSE2-NEXT: movdqa %xmm0, %xmm5
999 ; SSE2-NEXT: pand %xmm3, %xmm5
1000 ; SSE2-NEXT: psllw $2, %xmm5
1001 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1002 ; SSE2-NEXT: pand %xmm8, %xmm0
1003 ; SSE2-NEXT: psrlw $2, %xmm0
1004 ; SSE2-NEXT: por %xmm5, %xmm0
1005 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1006 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1007 ; SSE2-NEXT: pand %xmm5, %xmm6
1008 ; SSE2-NEXT: paddb %xmm6, %xmm6
1009 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1010 ; SSE2-NEXT: pand %xmm7, %xmm0
1011 ; SSE2-NEXT: psrlw $1, %xmm0
1012 ; SSE2-NEXT: por %xmm6, %xmm0
1013 ; SSE2-NEXT: movdqa %xmm2, %xmm6
1014 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
1015 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1016 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1017 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1018 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1019 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1020 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1021 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1022 ; SSE2-NEXT: packuswb %xmm6, %xmm2
1023 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1024 ; SSE2-NEXT: psllw $4, %xmm4
1025 ; SSE2-NEXT: psrlw $4, %xmm2
1026 ; SSE2-NEXT: pand %xmm1, %xmm2
1027 ; SSE2-NEXT: pandn %xmm4, %xmm1
1028 ; SSE2-NEXT: por %xmm2, %xmm1
1029 ; SSE2-NEXT: pand %xmm1, %xmm3
1030 ; SSE2-NEXT: psllw $2, %xmm3
1031 ; SSE2-NEXT: pand %xmm8, %xmm1
1032 ; SSE2-NEXT: psrlw $2, %xmm1
1033 ; SSE2-NEXT: por %xmm3, %xmm1
1034 ; SSE2-NEXT: pand %xmm1, %xmm5
1035 ; SSE2-NEXT: paddb %xmm5, %xmm5
1036 ; SSE2-NEXT: pand %xmm7, %xmm1
1037 ; SSE2-NEXT: psrlw $1, %xmm1
1038 ; SSE2-NEXT: por %xmm5, %xmm1
1041 ; SSSE3-LABEL: test_bitreverse_v4i64:
1043 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1044 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1045 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1046 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1047 ; SSSE3-NEXT: pand %xmm5, %xmm2
1048 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1049 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1050 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1051 ; SSSE3-NEXT: psrlw $4, %xmm0
1052 ; SSSE3-NEXT: pand %xmm5, %xmm0
1053 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1054 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
1055 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1056 ; SSSE3-NEXT: por %xmm7, %xmm3
1057 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
1058 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1059 ; SSSE3-NEXT: pand %xmm5, %xmm0
1060 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1061 ; SSSE3-NEXT: psrlw $4, %xmm1
1062 ; SSSE3-NEXT: pand %xmm5, %xmm1
1063 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
1064 ; SSSE3-NEXT: por %xmm6, %xmm2
1065 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1066 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1069 ; AVX1-LABEL: test_bitreverse_v4i64:
1071 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1072 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1073 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1074 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1075 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1076 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1077 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1078 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1079 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1080 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1081 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1082 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1083 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1084 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
1085 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
1086 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1087 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1088 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1089 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1090 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1093 ; AVX2-LABEL: test_bitreverse_v4i64:
1095 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1096 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1097 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
1098 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1099 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1100 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1101 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1102 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1103 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1104 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
1107 ; AVX512-LABEL: test_bitreverse_v4i64:
1109 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1110 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1111 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
1112 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1113 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1114 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
1115 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1116 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1117 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1118 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
1121 ; XOPAVX1-LABEL: test_bitreverse_v4i64:
1123 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1124 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1125 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1126 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1127 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1128 ; XOPAVX1-NEXT: retq
1130 ; XOPAVX2-LABEL: test_bitreverse_v4i64:
1132 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1133 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1134 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1135 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1136 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1137 ; XOPAVX2-NEXT: retq
1138 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1139 ret <4 x i64> %b
1140 }
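; 512-bit vector tests. Only AVX512BW performs the pshufb lookups on a full zmm register;
; AVX512F splits the value into two ymm halves with vextracti64x4, and the SSE, AVX1,
; AVX2 and XOP configurations work on four xmm or two ymm pieces at a time.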
1142 define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1143 ; SSE2-LABEL: test_bitreverse_v64i8:
1145 ; SSE2-NEXT: movdqa %xmm3, %xmm10
1146 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1147 ; SSE2-NEXT: psllw $4, %xmm5
1148 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1149 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1150 ; SSE2-NEXT: pandn %xmm5, %xmm6
1151 ; SSE2-NEXT: psrlw $4, %xmm0
1152 ; SSE2-NEXT: pand %xmm3, %xmm0
1153 ; SSE2-NEXT: por %xmm6, %xmm0
1154 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1155 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1156 ; SSE2-NEXT: pand %xmm5, %xmm6
1157 ; SSE2-NEXT: psllw $2, %xmm6
1158 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1159 ; SSE2-NEXT: pand %xmm8, %xmm0
1160 ; SSE2-NEXT: psrlw $2, %xmm0
1161 ; SSE2-NEXT: por %xmm6, %xmm0
1162 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1163 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1164 ; SSE2-NEXT: pand %xmm6, %xmm7
1165 ; SSE2-NEXT: paddb %xmm7, %xmm7
1166 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1167 ; SSE2-NEXT: pand %xmm9, %xmm0
1168 ; SSE2-NEXT: psrlw $1, %xmm0
1169 ; SSE2-NEXT: por %xmm7, %xmm0
1170 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1171 ; SSE2-NEXT: psllw $4, %xmm7
1172 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1173 ; SSE2-NEXT: pandn %xmm7, %xmm4
1174 ; SSE2-NEXT: psrlw $4, %xmm1
1175 ; SSE2-NEXT: pand %xmm3, %xmm1
1176 ; SSE2-NEXT: por %xmm4, %xmm1
1177 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1178 ; SSE2-NEXT: pand %xmm5, %xmm4
1179 ; SSE2-NEXT: psllw $2, %xmm4
1180 ; SSE2-NEXT: pand %xmm8, %xmm1
1181 ; SSE2-NEXT: psrlw $2, %xmm1
1182 ; SSE2-NEXT: por %xmm4, %xmm1
1183 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1184 ; SSE2-NEXT: pand %xmm6, %xmm4
1185 ; SSE2-NEXT: paddb %xmm4, %xmm4
1186 ; SSE2-NEXT: pand %xmm9, %xmm1
1187 ; SSE2-NEXT: psrlw $1, %xmm1
1188 ; SSE2-NEXT: por %xmm4, %xmm1
1189 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1190 ; SSE2-NEXT: psllw $4, %xmm4
1191 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1192 ; SSE2-NEXT: pandn %xmm4, %xmm7
1193 ; SSE2-NEXT: psrlw $4, %xmm2
1194 ; SSE2-NEXT: pand %xmm3, %xmm2
1195 ; SSE2-NEXT: por %xmm7, %xmm2
1196 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1197 ; SSE2-NEXT: pand %xmm5, %xmm4
1198 ; SSE2-NEXT: psllw $2, %xmm4
1199 ; SSE2-NEXT: pand %xmm8, %xmm2
1200 ; SSE2-NEXT: psrlw $2, %xmm2
1201 ; SSE2-NEXT: por %xmm4, %xmm2
1202 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1203 ; SSE2-NEXT: pand %xmm6, %xmm4
1204 ; SSE2-NEXT: paddb %xmm4, %xmm4
1205 ; SSE2-NEXT: pand %xmm9, %xmm2
1206 ; SSE2-NEXT: psrlw $1, %xmm2
1207 ; SSE2-NEXT: por %xmm4, %xmm2
1208 ; SSE2-NEXT: movdqa %xmm10, %xmm4
1209 ; SSE2-NEXT: psllw $4, %xmm4
1210 ; SSE2-NEXT: psrlw $4, %xmm10
1211 ; SSE2-NEXT: pand %xmm3, %xmm10
1212 ; SSE2-NEXT: pandn %xmm4, %xmm3
1213 ; SSE2-NEXT: por %xmm10, %xmm3
1214 ; SSE2-NEXT: pand %xmm3, %xmm5
1215 ; SSE2-NEXT: psllw $2, %xmm5
1216 ; SSE2-NEXT: pand %xmm8, %xmm3
1217 ; SSE2-NEXT: psrlw $2, %xmm3
1218 ; SSE2-NEXT: por %xmm5, %xmm3
1219 ; SSE2-NEXT: pand %xmm3, %xmm6
1220 ; SSE2-NEXT: paddb %xmm6, %xmm6
1221 ; SSE2-NEXT: pand %xmm9, %xmm3
1222 ; SSE2-NEXT: psrlw $1, %xmm3
1223 ; SSE2-NEXT: por %xmm6, %xmm3
1226 ; SSSE3-LABEL: test_bitreverse_v64i8:
1228 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
1229 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1230 ; SSSE3-NEXT: pand %xmm8, %xmm0
1231 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1232 ; SSSE3-NEXT: movdqa %xmm9, %xmm6
1233 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1234 ; SSSE3-NEXT: psrlw $4, %xmm5
1235 ; SSSE3-NEXT: pand %xmm8, %xmm5
1236 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1237 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
1238 ; SSSE3-NEXT: pshufb %xmm5, %xmm0
1239 ; SSSE3-NEXT: por %xmm6, %xmm0
1240 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
1241 ; SSSE3-NEXT: pand %xmm8, %xmm5
1242 ; SSSE3-NEXT: movdqa %xmm9, %xmm6
1243 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
1244 ; SSSE3-NEXT: psrlw $4, %xmm1
1245 ; SSSE3-NEXT: pand %xmm8, %xmm1
1246 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
1247 ; SSSE3-NEXT: pshufb %xmm1, %xmm5
1248 ; SSSE3-NEXT: por %xmm6, %xmm5
1249 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1250 ; SSSE3-NEXT: pand %xmm8, %xmm1
1251 ; SSSE3-NEXT: movdqa %xmm9, %xmm7
1252 ; SSSE3-NEXT: pshufb %xmm1, %xmm7
1253 ; SSSE3-NEXT: psrlw $4, %xmm2
1254 ; SSSE3-NEXT: pand %xmm8, %xmm2
1255 ; SSSE3-NEXT: movdqa %xmm4, %xmm6
1256 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1257 ; SSSE3-NEXT: por %xmm7, %xmm6
1258 ; SSSE3-NEXT: movdqa %xmm3, %xmm1
1259 ; SSSE3-NEXT: pand %xmm8, %xmm1
1260 ; SSSE3-NEXT: pshufb %xmm1, %xmm9
1261 ; SSSE3-NEXT: psrlw $4, %xmm3
1262 ; SSSE3-NEXT: pand %xmm8, %xmm3
1263 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
1264 ; SSSE3-NEXT: por %xmm9, %xmm4
1265 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
1266 ; SSSE3-NEXT: movdqa %xmm6, %xmm2
1267 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
1270 ; AVX1-LABEL: test_bitreverse_v64i8:
1272 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1273 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1274 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1275 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1276 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1277 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1278 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1279 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1280 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
1281 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
1282 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4
1283 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1284 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1285 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1286 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1287 ; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
1288 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1289 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1290 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1291 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1292 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1293 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1294 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
1295 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
1296 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1297 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1298 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1299 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1300 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1301 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1302 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1305 ; AVX2-LABEL: test_bitreverse_v64i8:
1307 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1308 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
1309 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1310 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
1311 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1312 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1313 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1314 ; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
1315 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
1316 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
1317 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
1318 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
1319 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1320 ; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
1321 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
1324 ; AVX512F-LABEL: test_bitreverse_v64i8:
1326 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1327 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1328 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
1329 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1330 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
1331 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
1332 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
1333 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1334 ; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1
1335 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
1336 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
1337 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
1338 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1339 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
1340 ; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0
1341 ; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
1342 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1343 ; AVX512F-NEXT: retq
1345 ; AVX512BW-LABEL: test_bitreverse_v64i8:
1346 ; AVX512BW: # %bb.0:
1347 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1348 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
1349 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1350 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
1351 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
1352 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
1353 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1354 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
1355 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
1356 ; AVX512BW-NEXT: retq
1358 ; XOPAVX1-LABEL: test_bitreverse_v64i8:
1360 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1361 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1362 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1363 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1364 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1365 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1366 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1367 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1368 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1369 ; XOPAVX1-NEXT: retq
1371 ; XOPAVX2-LABEL: test_bitreverse_v64i8:
1373 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1374 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1375 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1376 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1377 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1378 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1379 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1380 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1381 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1382 ; XOPAVX2-NEXT: retq
  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
  ret <64 x i8> %b
}
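
; Reversing the bits of <32 x i16> elements is done in two steps: first the
; bytes within each i16 lane are swapped (a punpck/pshuflw/pshufhw/packuswb
; sequence on SSE2, a single byte shuffle on SSSE3/AVX), then the bits within
; each byte are reversed with the same shift-and-mask (SSE2) or nibble-lookup
; (SSSE3/AVX/AVX512) sequence used for the v64i8 test above. On XOP a single
; vpperm per 128-bit lane handles both the byte swap and the bit reversal.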
1387 define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
1388 ; SSE2-LABEL: test_bitreverse_v32i16:
1390 ; SSE2-NEXT: movdqa %xmm3, %xmm11
1391 ; SSE2-NEXT: pxor %xmm10, %xmm10
1392 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1393 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
1394 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
1395 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
1396 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1397 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
1398 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
1399 ; SSE2-NEXT: packuswb %xmm3, %xmm0
1400 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1401 ; SSE2-NEXT: psllw $4, %xmm5
1402 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1403 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1404 ; SSE2-NEXT: pandn %xmm5, %xmm7
1405 ; SSE2-NEXT: psrlw $4, %xmm0
1406 ; SSE2-NEXT: pand %xmm3, %xmm0
1407 ; SSE2-NEXT: por %xmm7, %xmm0
1408 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1409 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1410 ; SSE2-NEXT: pand %xmm5, %xmm7
1411 ; SSE2-NEXT: psllw $2, %xmm7
1412 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1413 ; SSE2-NEXT: pand %xmm8, %xmm0
1414 ; SSE2-NEXT: psrlw $2, %xmm0
1415 ; SSE2-NEXT: por %xmm7, %xmm0
1416 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1417 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1418 ; SSE2-NEXT: pand %xmm7, %xmm6
1419 ; SSE2-NEXT: paddb %xmm6, %xmm6
1420 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1421 ; SSE2-NEXT: pand %xmm9, %xmm0
1422 ; SSE2-NEXT: psrlw $1, %xmm0
1423 ; SSE2-NEXT: por %xmm6, %xmm0
1424 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1425 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
1426 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
1427 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
1428 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1429 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1430 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
1431 ; SSE2-NEXT: packuswb %xmm6, %xmm1
1432 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1433 ; SSE2-NEXT: psllw $4, %xmm6
1434 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1435 ; SSE2-NEXT: pandn %xmm6, %xmm4
1436 ; SSE2-NEXT: psrlw $4, %xmm1
1437 ; SSE2-NEXT: pand %xmm3, %xmm1
1438 ; SSE2-NEXT: por %xmm4, %xmm1
1439 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1440 ; SSE2-NEXT: pand %xmm5, %xmm4
1441 ; SSE2-NEXT: psllw $2, %xmm4
1442 ; SSE2-NEXT: pand %xmm8, %xmm1
1443 ; SSE2-NEXT: psrlw $2, %xmm1
1444 ; SSE2-NEXT: por %xmm4, %xmm1
1445 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1446 ; SSE2-NEXT: pand %xmm7, %xmm4
1447 ; SSE2-NEXT: paddb %xmm4, %xmm4
1448 ; SSE2-NEXT: pand %xmm9, %xmm1
1449 ; SSE2-NEXT: psrlw $1, %xmm1
1450 ; SSE2-NEXT: por %xmm4, %xmm1
1451 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1452 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
1453 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
1454 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
1455 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
1456 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
1457 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
1458 ; SSE2-NEXT: packuswb %xmm4, %xmm2
1459 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1460 ; SSE2-NEXT: psllw $4, %xmm4
1461 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1462 ; SSE2-NEXT: pandn %xmm4, %xmm6
1463 ; SSE2-NEXT: psrlw $4, %xmm2
1464 ; SSE2-NEXT: pand %xmm3, %xmm2
1465 ; SSE2-NEXT: por %xmm6, %xmm2
1466 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1467 ; SSE2-NEXT: pand %xmm5, %xmm4
1468 ; SSE2-NEXT: psllw $2, %xmm4
1469 ; SSE2-NEXT: pand %xmm8, %xmm2
1470 ; SSE2-NEXT: psrlw $2, %xmm2
1471 ; SSE2-NEXT: por %xmm4, %xmm2
1472 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1473 ; SSE2-NEXT: pand %xmm7, %xmm4
1474 ; SSE2-NEXT: paddb %xmm4, %xmm4
1475 ; SSE2-NEXT: pand %xmm9, %xmm2
1476 ; SSE2-NEXT: psrlw $1, %xmm2
1477 ; SSE2-NEXT: por %xmm4, %xmm2
1478 ; SSE2-NEXT: movdqa %xmm11, %xmm4
1479 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
1480 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
1481 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,6]
1482 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
1483 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[1,0,3,2,4,5,6,7]
1484 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,4,7,6]
1485 ; SSE2-NEXT: packuswb %xmm4, %xmm6
1486 ; SSE2-NEXT: movdqa %xmm6, %xmm4
1487 ; SSE2-NEXT: psllw $4, %xmm4
1488 ; SSE2-NEXT: psrlw $4, %xmm6
1489 ; SSE2-NEXT: pand %xmm3, %xmm6
1490 ; SSE2-NEXT: pandn %xmm4, %xmm3
1491 ; SSE2-NEXT: por %xmm6, %xmm3
1492 ; SSE2-NEXT: pand %xmm3, %xmm5
1493 ; SSE2-NEXT: psllw $2, %xmm5
1494 ; SSE2-NEXT: pand %xmm8, %xmm3
1495 ; SSE2-NEXT: psrlw $2, %xmm3
1496 ; SSE2-NEXT: por %xmm5, %xmm3
1497 ; SSE2-NEXT: pand %xmm3, %xmm7
1498 ; SSE2-NEXT: paddb %xmm7, %xmm7
1499 ; SSE2-NEXT: pand %xmm9, %xmm3
1500 ; SSE2-NEXT: psrlw $1, %xmm3
1501 ; SSE2-NEXT: por %xmm7, %xmm3
1504 ; SSSE3-LABEL: test_bitreverse_v32i16:
1506 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
1507 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
1508 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1509 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
1510 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1511 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1512 ; SSSE3-NEXT: pand %xmm9, %xmm0
1513 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1514 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1515 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1516 ; SSSE3-NEXT: psrlw $4, %xmm1
1517 ; SSSE3-NEXT: pand %xmm9, %xmm1
1518 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1519 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
1520 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1521 ; SSSE3-NEXT: por %xmm6, %xmm0
1522 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
1523 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
1524 ; SSSE3-NEXT: pand %xmm9, %xmm1
1525 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1526 ; SSSE3-NEXT: pshufb %xmm1, %xmm6
1527 ; SSSE3-NEXT: psrlw $4, %xmm5
1528 ; SSSE3-NEXT: pand %xmm9, %xmm5
1529 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
1530 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
1531 ; SSSE3-NEXT: por %xmm6, %xmm1
1532 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
1533 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
1534 ; SSSE3-NEXT: pand %xmm9, %xmm5
1535 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1536 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
1537 ; SSSE3-NEXT: psrlw $4, %xmm2
1538 ; SSSE3-NEXT: pand %xmm9, %xmm2
1539 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
1540 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1541 ; SSSE3-NEXT: por %xmm6, %xmm5
1542 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
1543 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
1544 ; SSSE3-NEXT: pand %xmm9, %xmm2
1545 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1546 ; SSSE3-NEXT: psrlw $4, %xmm3
1547 ; SSSE3-NEXT: pand %xmm9, %xmm3
1548 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
1549 ; SSSE3-NEXT: por %xmm7, %xmm4
1550 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
1551 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
1554 ; AVX1-LABEL: test_bitreverse_v32i16:
1556 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1557 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1558 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1559 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1560 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
1561 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1562 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
1563 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1564 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1565 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1566 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
1567 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
1568 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1569 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
1570 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
1571 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1572 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1573 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
1574 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
1575 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1576 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1577 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1578 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
1579 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
1580 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1581 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1582 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
1583 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
1584 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1585 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
1586 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
1587 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1588 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
1589 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
1590 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
1591 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1594 ; AVX2-LABEL: test_bitreverse_v32i16:
1596 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1597 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1598 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1599 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
1600 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1601 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
1602 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1603 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
1604 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1605 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
1606 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
1607 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1608 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
1609 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
1610 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
1611 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
1612 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
1613 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
1616 ; AVX512F-LABEL: test_bitreverse_v32i16:
1618 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1619 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1620 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1621 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1622 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
1623 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1624 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
1625 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
1626 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
1627 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1628 ; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
1629 ; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
1630 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1631 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
1632 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
1633 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1634 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
1635 ; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
1636 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
1637 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1638 ; AVX512F-NEXT: retq
1640 ; AVX512BW-LABEL: test_bitreverse_v32i16:
1641 ; AVX512BW: # %bb.0:
1642 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
1643 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1644 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
1645 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1646 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
1647 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
1648 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
1649 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1650 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
1651 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
1652 ; AVX512BW-NEXT: retq
1654 ; XOPAVX1-LABEL: test_bitreverse_v32i16:
1656 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1657 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1658 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1659 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1660 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1661 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1662 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1663 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1664 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1665 ; XOPAVX1-NEXT: retq
1667 ; XOPAVX2-LABEL: test_bitreverse_v32i16:
1669 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1670 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1671 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1672 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1673 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1674 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1675 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1676 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1677 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1678 ; XOPAVX2-NEXT: retq
  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
  ret <32 x i16> %b
}
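
; Same structure for <16 x i32>: a 4-byte swap within each i32 lane
; (shuffle indices 3,2,1,0,...) followed by the per-byte bit reversal.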
1683 define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
1684 ; SSE2-LABEL: test_bitreverse_v16i32:
1686 ; SSE2-NEXT: movdqa %xmm3, %xmm11
1687 ; SSE2-NEXT: pxor %xmm10, %xmm10
1688 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1689 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
1690 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1691 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1692 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1693 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1694 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1695 ; SSE2-NEXT: packuswb %xmm3, %xmm0
1696 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1697 ; SSE2-NEXT: psllw $4, %xmm5
1698 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1699 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1700 ; SSE2-NEXT: pandn %xmm5, %xmm7
1701 ; SSE2-NEXT: psrlw $4, %xmm0
1702 ; SSE2-NEXT: pand %xmm3, %xmm0
1703 ; SSE2-NEXT: por %xmm7, %xmm0
1704 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1705 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1706 ; SSE2-NEXT: pand %xmm5, %xmm7
1707 ; SSE2-NEXT: psllw $2, %xmm7
1708 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
1709 ; SSE2-NEXT: pand %xmm8, %xmm0
1710 ; SSE2-NEXT: psrlw $2, %xmm0
1711 ; SSE2-NEXT: por %xmm7, %xmm0
1712 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1713 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1714 ; SSE2-NEXT: pand %xmm7, %xmm6
1715 ; SSE2-NEXT: paddb %xmm6, %xmm6
1716 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
1717 ; SSE2-NEXT: pand %xmm9, %xmm0
1718 ; SSE2-NEXT: psrlw $1, %xmm0
1719 ; SSE2-NEXT: por %xmm6, %xmm0
1720 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1721 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
1722 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1723 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1724 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
1725 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1726 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1727 ; SSE2-NEXT: packuswb %xmm6, %xmm1
1728 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1729 ; SSE2-NEXT: psllw $4, %xmm6
1730 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1731 ; SSE2-NEXT: pandn %xmm6, %xmm4
1732 ; SSE2-NEXT: psrlw $4, %xmm1
1733 ; SSE2-NEXT: pand %xmm3, %xmm1
1734 ; SSE2-NEXT: por %xmm4, %xmm1
1735 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1736 ; SSE2-NEXT: pand %xmm5, %xmm4
1737 ; SSE2-NEXT: psllw $2, %xmm4
1738 ; SSE2-NEXT: pand %xmm8, %xmm1
1739 ; SSE2-NEXT: psrlw $2, %xmm1
1740 ; SSE2-NEXT: por %xmm4, %xmm1
1741 ; SSE2-NEXT: movdqa %xmm1, %xmm4
1742 ; SSE2-NEXT: pand %xmm7, %xmm4
1743 ; SSE2-NEXT: paddb %xmm4, %xmm4
1744 ; SSE2-NEXT: pand %xmm9, %xmm1
1745 ; SSE2-NEXT: psrlw $1, %xmm1
1746 ; SSE2-NEXT: por %xmm4, %xmm1
1747 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1748 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
1749 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
1750 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
1751 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
1752 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
1753 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
1754 ; SSE2-NEXT: packuswb %xmm4, %xmm2
1755 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1756 ; SSE2-NEXT: psllw $4, %xmm4
1757 ; SSE2-NEXT: movdqa %xmm3, %xmm6
1758 ; SSE2-NEXT: pandn %xmm4, %xmm6
1759 ; SSE2-NEXT: psrlw $4, %xmm2
1760 ; SSE2-NEXT: pand %xmm3, %xmm2
1761 ; SSE2-NEXT: por %xmm6, %xmm2
1762 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1763 ; SSE2-NEXT: pand %xmm5, %xmm4
1764 ; SSE2-NEXT: psllw $2, %xmm4
1765 ; SSE2-NEXT: pand %xmm8, %xmm2
1766 ; SSE2-NEXT: psrlw $2, %xmm2
1767 ; SSE2-NEXT: por %xmm4, %xmm2
1768 ; SSE2-NEXT: movdqa %xmm2, %xmm4
1769 ; SSE2-NEXT: pand %xmm7, %xmm4
1770 ; SSE2-NEXT: paddb %xmm4, %xmm4
1771 ; SSE2-NEXT: pand %xmm9, %xmm2
1772 ; SSE2-NEXT: psrlw $1, %xmm2
1773 ; SSE2-NEXT: por %xmm4, %xmm2
1774 ; SSE2-NEXT: movdqa %xmm11, %xmm4
1775 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
1776 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
1777 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
1778 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
1779 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[3,2,1,0,4,5,6,7]
1780 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1781 ; SSE2-NEXT: packuswb %xmm4, %xmm6
1782 ; SSE2-NEXT: movdqa %xmm6, %xmm4
1783 ; SSE2-NEXT: psllw $4, %xmm4
1784 ; SSE2-NEXT: psrlw $4, %xmm6
1785 ; SSE2-NEXT: pand %xmm3, %xmm6
1786 ; SSE2-NEXT: pandn %xmm4, %xmm3
1787 ; SSE2-NEXT: por %xmm6, %xmm3
1788 ; SSE2-NEXT: pand %xmm3, %xmm5
1789 ; SSE2-NEXT: psllw $2, %xmm5
1790 ; SSE2-NEXT: pand %xmm8, %xmm3
1791 ; SSE2-NEXT: psrlw $2, %xmm3
1792 ; SSE2-NEXT: por %xmm5, %xmm3
1793 ; SSE2-NEXT: pand %xmm3, %xmm7
1794 ; SSE2-NEXT: paddb %xmm7, %xmm7
1795 ; SSE2-NEXT: pand %xmm9, %xmm3
1796 ; SSE2-NEXT: psrlw $1, %xmm3
1797 ; SSE2-NEXT: por %xmm7, %xmm3
1800 ; SSSE3-LABEL: test_bitreverse_v16i32:
1802 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
1803 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
1804 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1805 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
1806 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1807 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1808 ; SSSE3-NEXT: pand %xmm9, %xmm0
1809 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1810 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1811 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1812 ; SSSE3-NEXT: psrlw $4, %xmm1
1813 ; SSSE3-NEXT: pand %xmm9, %xmm1
1814 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1815 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
1816 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1817 ; SSSE3-NEXT: por %xmm6, %xmm0
1818 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
1819 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
1820 ; SSSE3-NEXT: pand %xmm9, %xmm1
1821 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1822 ; SSSE3-NEXT: pshufb %xmm1, %xmm6
1823 ; SSSE3-NEXT: psrlw $4, %xmm5
1824 ; SSSE3-NEXT: pand %xmm9, %xmm5
1825 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
1826 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
1827 ; SSSE3-NEXT: por %xmm6, %xmm1
1828 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
1829 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
1830 ; SSSE3-NEXT: pand %xmm9, %xmm5
1831 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1832 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
1833 ; SSSE3-NEXT: psrlw $4, %xmm2
1834 ; SSSE3-NEXT: pand %xmm9, %xmm2
1835 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
1836 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
1837 ; SSSE3-NEXT: por %xmm6, %xmm5
1838 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
1839 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
1840 ; SSSE3-NEXT: pand %xmm9, %xmm2
1841 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1842 ; SSSE3-NEXT: psrlw $4, %xmm3
1843 ; SSSE3-NEXT: pand %xmm9, %xmm3
1844 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
1845 ; SSSE3-NEXT: por %xmm7, %xmm4
1846 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
1847 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
1850 ; AVX1-LABEL: test_bitreverse_v16i32:
1852 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1853 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1854 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1855 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1856 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
1857 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1858 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
1859 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1860 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1861 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1862 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
1863 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
1864 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1865 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
1866 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
1867 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1868 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
1869 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
1870 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
1871 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1872 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1873 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1874 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
1875 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
1876 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1877 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
1878 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
1879 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
1880 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1881 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
1882 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
1883 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1884 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
1885 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
1886 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
1887 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1890 ; AVX2-LABEL: test_bitreverse_v16i32:
1892 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1893 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1894 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1895 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
1896 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1897 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
1898 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1899 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
1900 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1901 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
1902 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
1903 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1904 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
1905 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
1906 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
1907 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
1908 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
1909 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
1912 ; AVX512F-LABEL: test_bitreverse_v16i32:
1914 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1915 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1916 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
1917 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1918 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
1919 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1920 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
1921 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
1922 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
1923 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1924 ; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
1925 ; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
1926 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
1927 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
1928 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
1929 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1930 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
1931 ; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
1932 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
1933 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1934 ; AVX512F-NEXT: retq
1936 ; AVX512BW-LABEL: test_bitreverse_v16i32:
1937 ; AVX512BW: # %bb.0:
1938 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
1939 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1940 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
1941 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1942 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
1943 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
1944 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
1945 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1946 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
1947 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
1948 ; AVX512BW-NEXT: retq
1950 ; XOPAVX1-LABEL: test_bitreverse_v16i32:
1952 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1953 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1954 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1955 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1956 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1957 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1958 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1959 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1960 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1961 ; XOPAVX1-NEXT: retq
1963 ; XOPAVX2-LABEL: test_bitreverse_v16i32:
1965 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1966 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1967 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1968 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1969 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1970 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1971 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1972 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1973 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1974 ; XOPAVX2-NEXT: retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
  ret <16 x i32> %b
}
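
; Same structure for <8 x i64>: an 8-byte swap within each i64 lane
; (shuffle indices 7,6,5,4,3,2,1,0,...) followed by the per-byte bit reversal.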
1979 define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
1980 ; SSE2-LABEL: test_bitreverse_v8i64:
1982 ; SSE2-NEXT: movdqa %xmm3, %xmm11
1983 ; SSE2-NEXT: pxor %xmm10, %xmm10
1984 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1985 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
1986 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1987 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1988 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1989 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
1990 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1991 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1992 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1993 ; SSE2-NEXT: packuswb %xmm3, %xmm0
1994 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1995 ; SSE2-NEXT: psllw $4, %xmm5
1996 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1997 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1998 ; SSE2-NEXT: pandn %xmm5, %xmm7
1999 ; SSE2-NEXT: psrlw $4, %xmm0
2000 ; SSE2-NEXT: pand %xmm3, %xmm0
2001 ; SSE2-NEXT: por %xmm7, %xmm0
2002 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2003 ; SSE2-NEXT: movdqa %xmm0, %xmm7
2004 ; SSE2-NEXT: pand %xmm5, %xmm7
2005 ; SSE2-NEXT: psllw $2, %xmm7
2006 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [204,204,204,204,204,204,204,204,204,204,204,204,204,204,204,204]
2007 ; SSE2-NEXT: pand %xmm8, %xmm0
2008 ; SSE2-NEXT: psrlw $2, %xmm0
2009 ; SSE2-NEXT: por %xmm7, %xmm0
2010 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2011 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2012 ; SSE2-NEXT: pand %xmm7, %xmm6
2013 ; SSE2-NEXT: paddb %xmm6, %xmm6
2014 ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [170,170,170,170,170,170,170,170,170,170,170,170,170,170,170,170]
2015 ; SSE2-NEXT: pand %xmm9, %xmm0
2016 ; SSE2-NEXT: psrlw $1, %xmm0
2017 ; SSE2-NEXT: por %xmm6, %xmm0
2018 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2019 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
2020 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
2021 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
2022 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2023 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
2024 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2025 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2026 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2027 ; SSE2-NEXT: packuswb %xmm6, %xmm1
2028 ; SSE2-NEXT: movdqa %xmm1, %xmm6
2029 ; SSE2-NEXT: psllw $4, %xmm6
2030 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2031 ; SSE2-NEXT: pandn %xmm6, %xmm4
2032 ; SSE2-NEXT: psrlw $4, %xmm1
2033 ; SSE2-NEXT: pand %xmm3, %xmm1
2034 ; SSE2-NEXT: por %xmm4, %xmm1
2035 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2036 ; SSE2-NEXT: pand %xmm5, %xmm4
2037 ; SSE2-NEXT: psllw $2, %xmm4
2038 ; SSE2-NEXT: pand %xmm8, %xmm1
2039 ; SSE2-NEXT: psrlw $2, %xmm1
2040 ; SSE2-NEXT: por %xmm4, %xmm1
2041 ; SSE2-NEXT: movdqa %xmm1, %xmm4
2042 ; SSE2-NEXT: pand %xmm7, %xmm4
2043 ; SSE2-NEXT: paddb %xmm4, %xmm4
2044 ; SSE2-NEXT: pand %xmm9, %xmm1
2045 ; SSE2-NEXT: psrlw $1, %xmm1
2046 ; SSE2-NEXT: por %xmm4, %xmm1
2047 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2048 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2049 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2050 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2051 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2052 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2053 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2054 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2055 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2056 ; SSE2-NEXT: packuswb %xmm4, %xmm2
2057 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2058 ; SSE2-NEXT: psllw $4, %xmm4
2059 ; SSE2-NEXT: movdqa %xmm3, %xmm6
2060 ; SSE2-NEXT: pandn %xmm4, %xmm6
2061 ; SSE2-NEXT: psrlw $4, %xmm2
2062 ; SSE2-NEXT: pand %xmm3, %xmm2
2063 ; SSE2-NEXT: por %xmm6, %xmm2
2064 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2065 ; SSE2-NEXT: pand %xmm5, %xmm4
2066 ; SSE2-NEXT: psllw $2, %xmm4
2067 ; SSE2-NEXT: pand %xmm8, %xmm2
2068 ; SSE2-NEXT: psrlw $2, %xmm2
2069 ; SSE2-NEXT: por %xmm4, %xmm2
2070 ; SSE2-NEXT: movdqa %xmm2, %xmm4
2071 ; SSE2-NEXT: pand %xmm7, %xmm4
2072 ; SSE2-NEXT: paddb %xmm4, %xmm4
2073 ; SSE2-NEXT: pand %xmm9, %xmm2
2074 ; SSE2-NEXT: psrlw $1, %xmm2
2075 ; SSE2-NEXT: por %xmm4, %xmm2
2076 ; SSE2-NEXT: movdqa %xmm11, %xmm4
2077 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2078 ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
2079 ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7]
2080 ; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
2081 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2082 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[2,3,0,1]
2083 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
2084 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
2085 ; SSE2-NEXT: packuswb %xmm4, %xmm6
2086 ; SSE2-NEXT: movdqa %xmm6, %xmm4
2087 ; SSE2-NEXT: psllw $4, %xmm4
2088 ; SSE2-NEXT: psrlw $4, %xmm6
2089 ; SSE2-NEXT: pand %xmm3, %xmm6
2090 ; SSE2-NEXT: pandn %xmm4, %xmm3
2091 ; SSE2-NEXT: por %xmm6, %xmm3
2092 ; SSE2-NEXT: pand %xmm3, %xmm5
2093 ; SSE2-NEXT: psllw $2, %xmm5
2094 ; SSE2-NEXT: pand %xmm8, %xmm3
2095 ; SSE2-NEXT: psrlw $2, %xmm3
2096 ; SSE2-NEXT: por %xmm5, %xmm3
2097 ; SSE2-NEXT: pand %xmm3, %xmm7
2098 ; SSE2-NEXT: paddb %xmm7, %xmm7
2099 ; SSE2-NEXT: pand %xmm9, %xmm3
2100 ; SSE2-NEXT: psrlw $1, %xmm3
2101 ; SSE2-NEXT: por %xmm7, %xmm3
2104 ; SSSE3-LABEL: test_bitreverse_v8i64:
2106 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
2107 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
2108 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2109 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
2110 ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2111 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
2112 ; SSSE3-NEXT: pand %xmm9, %xmm0
2113 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2114 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2115 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
2116 ; SSSE3-NEXT: psrlw $4, %xmm1
2117 ; SSSE3-NEXT: pand %xmm9, %xmm1
2118 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2119 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
2120 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2121 ; SSSE3-NEXT: por %xmm6, %xmm0
2122 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
2123 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
2124 ; SSSE3-NEXT: pand %xmm9, %xmm1
2125 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2126 ; SSSE3-NEXT: pshufb %xmm1, %xmm6
2127 ; SSSE3-NEXT: psrlw $4, %xmm5
2128 ; SSSE3-NEXT: pand %xmm9, %xmm5
2129 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
2130 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
2131 ; SSSE3-NEXT: por %xmm6, %xmm1
2132 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
2133 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2134 ; SSSE3-NEXT: pand %xmm9, %xmm5
2135 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
2136 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
2137 ; SSSE3-NEXT: psrlw $4, %xmm2
2138 ; SSSE3-NEXT: pand %xmm9, %xmm2
2139 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
2140 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2141 ; SSSE3-NEXT: por %xmm6, %xmm5
2142 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
2143 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
2144 ; SSSE3-NEXT: pand %xmm9, %xmm2
2145 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
2146 ; SSSE3-NEXT: psrlw $4, %xmm3
2147 ; SSSE3-NEXT: pand %xmm9, %xmm3
2148 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
2149 ; SSSE3-NEXT: por %xmm7, %xmm4
2150 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
2151 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
2154 ; AVX1-LABEL: test_bitreverse_v8i64:
2156 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2157 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2158 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2159 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2160 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2161 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2162 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2163 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2164 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2165 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2166 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2167 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2168 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2169 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
2170 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2171 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
2172 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2173 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
2174 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
2175 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2176 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2177 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2178 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2179 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2180 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2181 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2182 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2183 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2184 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2185 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
2186 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
2187 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2188 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2189 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
2190 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
2191 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2194 ; AVX2-LABEL: test_bitreverse_v8i64:
2196 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2197 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2198 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2199 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
2200 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2201 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2202 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
2203 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
2204 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2205 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
2206 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
2207 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2208 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
2209 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2210 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2211 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2212 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
2213 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
2216 ; AVX512F-LABEL: test_bitreverse_v8i64:
2218 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2219 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2220 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2221 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2222 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
2223 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2224 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2225 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
2226 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
2227 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2228 ; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
2229 ; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
2230 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2231 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
2232 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2233 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
2234 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
2235 ; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
2236 ; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
2237 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2238 ; AVX512F-NEXT: retq
2240 ; AVX512BW-LABEL: test_bitreverse_v8i64:
2241 ; AVX512BW: # %bb.0:
2242 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
2243 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2244 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2245 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2246 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
2247 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
2248 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2249 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2250 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
2251 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
2252 ; AVX512BW-NEXT: retq
2254 ; XOPAVX1-LABEL: test_bitreverse_v8i64:
2256 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2257 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2258 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2259 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2260 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2261 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2262 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2263 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2264 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2265 ; XOPAVX1-NEXT: retq
2267 ; XOPAVX2-LABEL: test_bitreverse_v8i64:
2269 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2270 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2271 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2272 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2273 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2274 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2275 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2276 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2277 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2278 ; XOPAVX2-NEXT: retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}
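
; Constant-folding tests: when the operand is a constant the bitreverse call
; is evaluated at compile time and only the folded constant is materialized.
; Here 0xFF00FF00 bit-reversed is 0x00FF00FF.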
2287 define i32 @fold_bitreverse_i32() nounwind {
2288 ; ALL-LABEL: fold_bitreverse_i32:
2290 ; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF
  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
  ret i32 %b
}
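
; Folded <16 x i8> constant: every byte of <0,-1,2,-3,...> is bit-reversed at
; compile time, leaving a single vector constant load.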
2296 define <16 x i8> @fold_bitreverse_v16i8() nounwind {
2297 ; SSE-LABEL: fold_bitreverse_v16i8:
2299 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2302 ; AVX-LABEL: fold_bitreverse_v16i8:
2304 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2307 ; XOP-LABEL: fold_bitreverse_v16i8:
2309 ; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}
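
; Folded <16 x i16> constant; SSE targets load it as two XMM constants,
; AVX and XOP targets as one YMM constant.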
2315 define <16 x i16> @fold_bitreverse_v16i16() nounwind {
2316 ; SSE-LABEL: fold_bitreverse_v16i16:
2318 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
2319 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
2322 ; AVX-LABEL: fold_bitreverse_v16i16:
2324 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
2327 ; XOP-LABEL: fold_bitreverse_v16i16:
2329 ; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}
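
; Folded <16 x i32> constant; the number of constant loads tracks the widest
; legal vector type (four XMMs for SSE, two YMMs for AVX1/AVX2/XOP, one ZMM
; for AVX512).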
2335 define <16 x i32> @fold_bitreverse_v16i32() nounwind {
2336 ; SSE-LABEL: fold_bitreverse_v16i32:
2338 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
2339 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
2340 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
2341 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
2344 ; AVX1-LABEL: fold_bitreverse_v16i32:
2346 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2347 ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2350 ; AVX2-LABEL: fold_bitreverse_v16i32:
2352 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2353 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2356 ; AVX512-LABEL: fold_bitreverse_v16i32:
2358 ; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
2361 ; XOP-LABEL: fold_bitreverse_v16i32:
2363 ; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
2364 ; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}
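
; Intrinsic declarations referenced by the tests above.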
2370 declare i8 @llvm.bitreverse.i8(i8) readnone
2371 declare i16 @llvm.bitreverse.i16(i16) readnone
2372 declare i32 @llvm.bitreverse.i32(i32) readnone
2373 declare i64 @llvm.bitreverse.i64(i64) readnone
2375 declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
2376 declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
2377 declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
2378 declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
2380 declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
2381 declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
2382 declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
2383 declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone
2385 declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
2386 declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
2387 declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
2388 declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone