1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1
12 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2
13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F
14 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512BW
16 ; Make sure we don't crash with avx512bw and xop
17 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
; Scalar i8 case: the body calls llvm.bitreverse.i8 on %a.
; Per the assertions below, only the XOP group expects a vector instruction
; (vmovd in, vpperm with a constant-pool operand, vmovd out); the SSE, AVX,
; GFNISSE and GFNIAVX groups expect a scalar sequence: rolb $4, then
; mask-and-shift steps using the byte masks 51 and 85.
; The assertions are autogenerated (see the note at the top of the file), so
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
19 define i8 @test_bitreverse_i8(i8 %a) nounwind {
20 ; SSE-LABEL: test_bitreverse_i8:
22 ; SSE-NEXT: rolb $4, %dil
23 ; SSE-NEXT: movl %edi, %eax
24 ; SSE-NEXT: andb $51, %al
25 ; SSE-NEXT: shlb $2, %al
26 ; SSE-NEXT: shrb $2, %dil
27 ; SSE-NEXT: andb $51, %dil
28 ; SSE-NEXT: orb %dil, %al
29 ; SSE-NEXT: movl %eax, %ecx
30 ; SSE-NEXT: andb $85, %cl
31 ; SSE-NEXT: addb %cl, %cl
33 ; SSE-NEXT: andb $85, %al
34 ; SSE-NEXT: orb %cl, %al
37 ; AVX-LABEL: test_bitreverse_i8:
39 ; AVX-NEXT: rolb $4, %dil
40 ; AVX-NEXT: movl %edi, %eax
41 ; AVX-NEXT: andb $51, %al
42 ; AVX-NEXT: shlb $2, %al
43 ; AVX-NEXT: shrb $2, %dil
44 ; AVX-NEXT: andb $51, %dil
45 ; AVX-NEXT: orb %dil, %al
46 ; AVX-NEXT: movl %eax, %ecx
47 ; AVX-NEXT: andb $85, %cl
48 ; AVX-NEXT: addb %cl, %cl
50 ; AVX-NEXT: andb $85, %al
51 ; AVX-NEXT: orb %cl, %al
54 ; XOP-LABEL: test_bitreverse_i8:
56 ; XOP-NEXT: vmovd %edi, %xmm0
57 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
58 ; XOP-NEXT: vmovd %xmm0, %eax
59 ; XOP-NEXT: # kill: def $al killed $al killed $eax
62 ; GFNISSE-LABEL: test_bitreverse_i8:
64 ; GFNISSE-NEXT: rolb $4, %dil
65 ; GFNISSE-NEXT: movl %edi, %eax
66 ; GFNISSE-NEXT: andb $51, %al
67 ; GFNISSE-NEXT: shlb $2, %al
68 ; GFNISSE-NEXT: shrb $2, %dil
69 ; GFNISSE-NEXT: andb $51, %dil
70 ; GFNISSE-NEXT: orb %dil, %al
71 ; GFNISSE-NEXT: movl %eax, %ecx
72 ; GFNISSE-NEXT: andb $85, %cl
73 ; GFNISSE-NEXT: addb %cl, %cl
74 ; GFNISSE-NEXT: shrb %al
75 ; GFNISSE-NEXT: andb $85, %al
76 ; GFNISSE-NEXT: orb %cl, %al
79 ; GFNIAVX-LABEL: test_bitreverse_i8:
81 ; GFNIAVX-NEXT: rolb $4, %dil
82 ; GFNIAVX-NEXT: movl %edi, %eax
83 ; GFNIAVX-NEXT: andb $51, %al
84 ; GFNIAVX-NEXT: shlb $2, %al
85 ; GFNIAVX-NEXT: shrb $2, %dil
86 ; GFNIAVX-NEXT: andb $51, %dil
87 ; GFNIAVX-NEXT: orb %dil, %al
88 ; GFNIAVX-NEXT: movl %eax, %ecx
89 ; GFNIAVX-NEXT: andb $85, %cl
90 ; GFNIAVX-NEXT: addb %cl, %cl
91 ; GFNIAVX-NEXT: shrb %al
92 ; GFNIAVX-NEXT: andb $85, %al
93 ; GFNIAVX-NEXT: orb %cl, %al
95 %b = call i8 @llvm.bitreverse.i8(i8 %a)
; Scalar i16 case: the body calls llvm.bitreverse.i16 on %a.
; The SSE, AVX, GFNISSE and GFNIAVX groups expect a scalar sequence that starts
; with rolw $8 and then applies mask-and-shift steps with the immediates
; 0xF0F, 0x3333 and 0x5555 (the last two steps are combined through leal).
; The XOP group expects vmovd in, one vpperm with a constant-pool operand,
; and vmovd out.
100 define i16 @test_bitreverse_i16(i16 %a) nounwind {
101 ; SSE-LABEL: test_bitreverse_i16:
103 ; SSE-NEXT: # kill: def $edi killed $edi def $rdi
104 ; SSE-NEXT: rolw $8, %di
105 ; SSE-NEXT: movl %edi, %eax
106 ; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
107 ; SSE-NEXT: shll $4, %eax
108 ; SSE-NEXT: shrl $4, %edi
109 ; SSE-NEXT: andl $3855, %edi # imm = 0xF0F
110 ; SSE-NEXT: orl %eax, %edi
111 ; SSE-NEXT: movl %edi, %eax
112 ; SSE-NEXT: andl $13107, %eax # imm = 0x3333
113 ; SSE-NEXT: shrl $2, %edi
114 ; SSE-NEXT: andl $13107, %edi # imm = 0x3333
115 ; SSE-NEXT: leal (%rdi,%rax,4), %eax
116 ; SSE-NEXT: movl %eax, %ecx
117 ; SSE-NEXT: andl $21845, %ecx # imm = 0x5555
118 ; SSE-NEXT: shrl %eax
119 ; SSE-NEXT: andl $21845, %eax # imm = 0x5555
120 ; SSE-NEXT: leal (%rax,%rcx,2), %eax
121 ; SSE-NEXT: # kill: def $ax killed $ax killed $eax
124 ; AVX-LABEL: test_bitreverse_i16:
126 ; AVX-NEXT: # kill: def $edi killed $edi def $rdi
127 ; AVX-NEXT: rolw $8, %di
128 ; AVX-NEXT: movl %edi, %eax
129 ; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
130 ; AVX-NEXT: shll $4, %eax
131 ; AVX-NEXT: shrl $4, %edi
132 ; AVX-NEXT: andl $3855, %edi # imm = 0xF0F
133 ; AVX-NEXT: orl %eax, %edi
134 ; AVX-NEXT: movl %edi, %eax
135 ; AVX-NEXT: andl $13107, %eax # imm = 0x3333
136 ; AVX-NEXT: shrl $2, %edi
137 ; AVX-NEXT: andl $13107, %edi # imm = 0x3333
138 ; AVX-NEXT: leal (%rdi,%rax,4), %eax
139 ; AVX-NEXT: movl %eax, %ecx
140 ; AVX-NEXT: andl $21845, %ecx # imm = 0x5555
141 ; AVX-NEXT: shrl %eax
142 ; AVX-NEXT: andl $21845, %eax # imm = 0x5555
143 ; AVX-NEXT: leal (%rax,%rcx,2), %eax
144 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
147 ; XOP-LABEL: test_bitreverse_i16:
149 ; XOP-NEXT: vmovd %edi, %xmm0
150 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
151 ; XOP-NEXT: vmovd %xmm0, %eax
152 ; XOP-NEXT: # kill: def $ax killed $ax killed $eax
155 ; GFNISSE-LABEL: test_bitreverse_i16:
157 ; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
158 ; GFNISSE-NEXT: rolw $8, %di
159 ; GFNISSE-NEXT: movl %edi, %eax
160 ; GFNISSE-NEXT: andl $3855, %eax # imm = 0xF0F
161 ; GFNISSE-NEXT: shll $4, %eax
162 ; GFNISSE-NEXT: shrl $4, %edi
163 ; GFNISSE-NEXT: andl $3855, %edi # imm = 0xF0F
164 ; GFNISSE-NEXT: orl %eax, %edi
165 ; GFNISSE-NEXT: movl %edi, %eax
166 ; GFNISSE-NEXT: andl $13107, %eax # imm = 0x3333
167 ; GFNISSE-NEXT: shrl $2, %edi
168 ; GFNISSE-NEXT: andl $13107, %edi # imm = 0x3333
169 ; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
170 ; GFNISSE-NEXT: movl %eax, %ecx
171 ; GFNISSE-NEXT: andl $21845, %ecx # imm = 0x5555
172 ; GFNISSE-NEXT: shrl %eax
173 ; GFNISSE-NEXT: andl $21845, %eax # imm = 0x5555
174 ; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
175 ; GFNISSE-NEXT: # kill: def $ax killed $ax killed $eax
178 ; GFNIAVX-LABEL: test_bitreverse_i16:
180 ; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
181 ; GFNIAVX-NEXT: rolw $8, %di
182 ; GFNIAVX-NEXT: movl %edi, %eax
183 ; GFNIAVX-NEXT: andl $3855, %eax # imm = 0xF0F
184 ; GFNIAVX-NEXT: shll $4, %eax
185 ; GFNIAVX-NEXT: shrl $4, %edi
186 ; GFNIAVX-NEXT: andl $3855, %edi # imm = 0xF0F
187 ; GFNIAVX-NEXT: orl %eax, %edi
188 ; GFNIAVX-NEXT: movl %edi, %eax
189 ; GFNIAVX-NEXT: andl $13107, %eax # imm = 0x3333
190 ; GFNIAVX-NEXT: shrl $2, %edi
191 ; GFNIAVX-NEXT: andl $13107, %edi # imm = 0x3333
192 ; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
193 ; GFNIAVX-NEXT: movl %eax, %ecx
194 ; GFNIAVX-NEXT: andl $21845, %ecx # imm = 0x5555
195 ; GFNIAVX-NEXT: shrl %eax
196 ; GFNIAVX-NEXT: andl $21845, %eax # imm = 0x5555
197 ; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
198 ; GFNIAVX-NEXT: # kill: def $ax killed $ax killed $eax
199 %b = call i16 @llvm.bitreverse.i16(i16 %a)
; Scalar i32 case: the body calls llvm.bitreverse.i32 on %a.
; The SSE, AVX, GFNISSE and GFNIAVX groups expect bswapl followed by
; mask-and-shift steps with the immediates 0xF0F0F0F, 0x33333333 and
; 0x55555555 (the last two steps are combined through leal).
; The XOP group expects vmovd in, one vpperm with a constant-pool operand,
; and vmovd out.
203 define i32 @test_bitreverse_i32(i32 %a) nounwind {
204 ; SSE-LABEL: test_bitreverse_i32:
206 ; SSE-NEXT: # kill: def $edi killed $edi def $rdi
207 ; SSE-NEXT: bswapl %edi
208 ; SSE-NEXT: movl %edi, %eax
209 ; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
210 ; SSE-NEXT: shll $4, %eax
211 ; SSE-NEXT: shrl $4, %edi
212 ; SSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
213 ; SSE-NEXT: orl %eax, %edi
214 ; SSE-NEXT: movl %edi, %eax
215 ; SSE-NEXT: andl $858993459, %eax # imm = 0x33333333
216 ; SSE-NEXT: shrl $2, %edi
217 ; SSE-NEXT: andl $858993459, %edi # imm = 0x33333333
218 ; SSE-NEXT: leal (%rdi,%rax,4), %eax
219 ; SSE-NEXT: movl %eax, %ecx
220 ; SSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
221 ; SSE-NEXT: shrl %eax
222 ; SSE-NEXT: andl $1431655765, %eax # imm = 0x55555555
223 ; SSE-NEXT: leal (%rax,%rcx,2), %eax
226 ; AVX-LABEL: test_bitreverse_i32:
228 ; AVX-NEXT: # kill: def $edi killed $edi def $rdi
229 ; AVX-NEXT: bswapl %edi
230 ; AVX-NEXT: movl %edi, %eax
231 ; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
232 ; AVX-NEXT: shll $4, %eax
233 ; AVX-NEXT: shrl $4, %edi
234 ; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
235 ; AVX-NEXT: orl %eax, %edi
236 ; AVX-NEXT: movl %edi, %eax
237 ; AVX-NEXT: andl $858993459, %eax # imm = 0x33333333
238 ; AVX-NEXT: shrl $2, %edi
239 ; AVX-NEXT: andl $858993459, %edi # imm = 0x33333333
240 ; AVX-NEXT: leal (%rdi,%rax,4), %eax
241 ; AVX-NEXT: movl %eax, %ecx
242 ; AVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
243 ; AVX-NEXT: shrl %eax
244 ; AVX-NEXT: andl $1431655765, %eax # imm = 0x55555555
245 ; AVX-NEXT: leal (%rax,%rcx,2), %eax
248 ; XOP-LABEL: test_bitreverse_i32:
250 ; XOP-NEXT: vmovd %edi, %xmm0
251 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
252 ; XOP-NEXT: vmovd %xmm0, %eax
255 ; GFNISSE-LABEL: test_bitreverse_i32:
257 ; GFNISSE-NEXT: # kill: def $edi killed $edi def $rdi
258 ; GFNISSE-NEXT: bswapl %edi
259 ; GFNISSE-NEXT: movl %edi, %eax
260 ; GFNISSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
261 ; GFNISSE-NEXT: shll $4, %eax
262 ; GFNISSE-NEXT: shrl $4, %edi
263 ; GFNISSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
264 ; GFNISSE-NEXT: orl %eax, %edi
265 ; GFNISSE-NEXT: movl %edi, %eax
266 ; GFNISSE-NEXT: andl $858993459, %eax # imm = 0x33333333
267 ; GFNISSE-NEXT: shrl $2, %edi
268 ; GFNISSE-NEXT: andl $858993459, %edi # imm = 0x33333333
269 ; GFNISSE-NEXT: leal (%rdi,%rax,4), %eax
270 ; GFNISSE-NEXT: movl %eax, %ecx
271 ; GFNISSE-NEXT: andl $1431655765, %ecx # imm = 0x55555555
272 ; GFNISSE-NEXT: shrl %eax
273 ; GFNISSE-NEXT: andl $1431655765, %eax # imm = 0x55555555
274 ; GFNISSE-NEXT: leal (%rax,%rcx,2), %eax
277 ; GFNIAVX-LABEL: test_bitreverse_i32:
279 ; GFNIAVX-NEXT: # kill: def $edi killed $edi def $rdi
280 ; GFNIAVX-NEXT: bswapl %edi
281 ; GFNIAVX-NEXT: movl %edi, %eax
282 ; GFNIAVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
283 ; GFNIAVX-NEXT: shll $4, %eax
284 ; GFNIAVX-NEXT: shrl $4, %edi
285 ; GFNIAVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
286 ; GFNIAVX-NEXT: orl %eax, %edi
287 ; GFNIAVX-NEXT: movl %edi, %eax
288 ; GFNIAVX-NEXT: andl $858993459, %eax # imm = 0x33333333
289 ; GFNIAVX-NEXT: shrl $2, %edi
290 ; GFNIAVX-NEXT: andl $858993459, %edi # imm = 0x33333333
291 ; GFNIAVX-NEXT: leal (%rdi,%rax,4), %eax
292 ; GFNIAVX-NEXT: movl %eax, %ecx
293 ; GFNIAVX-NEXT: andl $1431655765, %ecx # imm = 0x55555555
294 ; GFNIAVX-NEXT: shrl %eax
295 ; GFNIAVX-NEXT: andl $1431655765, %eax # imm = 0x55555555
296 ; GFNIAVX-NEXT: leal (%rax,%rcx,2), %eax
298 %b = call i32 @llvm.bitreverse.i32(i32 %a)
; Scalar i64 case: the body calls llvm.bitreverse.i64 on %a.
; The SSE, AVX, GFNISSE and GFNIAVX groups expect bswapq followed by
; mask-and-shift steps whose 64-bit masks (0x0F0F..., 0x3333..., 0x5555...)
; are materialized with movabsq; the last two steps are combined through leaq.
; The XOP group expects vmovq in, one vpperm with a constant-pool operand,
; and vmovq out.
302 define i64 @test_bitreverse_i64(i64 %a) nounwind {
303 ; SSE-LABEL: test_bitreverse_i64:
305 ; SSE-NEXT: bswapq %rdi
306 ; SSE-NEXT: movq %rdi, %rax
307 ; SSE-NEXT: shrq $4, %rax
308 ; SSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
309 ; SSE-NEXT: andq %rcx, %rax
310 ; SSE-NEXT: andq %rcx, %rdi
311 ; SSE-NEXT: shlq $4, %rdi
312 ; SSE-NEXT: orq %rax, %rdi
313 ; SSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
314 ; SSE-NEXT: movq %rdi, %rcx
315 ; SSE-NEXT: andq %rax, %rcx
316 ; SSE-NEXT: shrq $2, %rdi
317 ; SSE-NEXT: andq %rax, %rdi
318 ; SSE-NEXT: leaq (%rdi,%rcx,4), %rax
319 ; SSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
320 ; SSE-NEXT: movq %rax, %rdx
321 ; SSE-NEXT: andq %rcx, %rdx
322 ; SSE-NEXT: shrq %rax
323 ; SSE-NEXT: andq %rcx, %rax
324 ; SSE-NEXT: leaq (%rax,%rdx,2), %rax
327 ; AVX-LABEL: test_bitreverse_i64:
329 ; AVX-NEXT: bswapq %rdi
330 ; AVX-NEXT: movq %rdi, %rax
331 ; AVX-NEXT: shrq $4, %rax
332 ; AVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
333 ; AVX-NEXT: andq %rcx, %rax
334 ; AVX-NEXT: andq %rcx, %rdi
335 ; AVX-NEXT: shlq $4, %rdi
336 ; AVX-NEXT: orq %rax, %rdi
337 ; AVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
338 ; AVX-NEXT: movq %rdi, %rcx
339 ; AVX-NEXT: andq %rax, %rcx
340 ; AVX-NEXT: shrq $2, %rdi
341 ; AVX-NEXT: andq %rax, %rdi
342 ; AVX-NEXT: leaq (%rdi,%rcx,4), %rax
343 ; AVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
344 ; AVX-NEXT: movq %rax, %rdx
345 ; AVX-NEXT: andq %rcx, %rdx
346 ; AVX-NEXT: shrq %rax
347 ; AVX-NEXT: andq %rcx, %rax
348 ; AVX-NEXT: leaq (%rax,%rdx,2), %rax
351 ; XOP-LABEL: test_bitreverse_i64:
353 ; XOP-NEXT: vmovq %rdi, %xmm0
354 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
355 ; XOP-NEXT: vmovq %xmm0, %rax
358 ; GFNISSE-LABEL: test_bitreverse_i64:
360 ; GFNISSE-NEXT: bswapq %rdi
361 ; GFNISSE-NEXT: movq %rdi, %rax
362 ; GFNISSE-NEXT: shrq $4, %rax
363 ; GFNISSE-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
364 ; GFNISSE-NEXT: andq %rcx, %rax
365 ; GFNISSE-NEXT: andq %rcx, %rdi
366 ; GFNISSE-NEXT: shlq $4, %rdi
367 ; GFNISSE-NEXT: orq %rax, %rdi
368 ; GFNISSE-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
369 ; GFNISSE-NEXT: movq %rdi, %rcx
370 ; GFNISSE-NEXT: andq %rax, %rcx
371 ; GFNISSE-NEXT: shrq $2, %rdi
372 ; GFNISSE-NEXT: andq %rax, %rdi
373 ; GFNISSE-NEXT: leaq (%rdi,%rcx,4), %rax
374 ; GFNISSE-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
375 ; GFNISSE-NEXT: movq %rax, %rdx
376 ; GFNISSE-NEXT: andq %rcx, %rdx
377 ; GFNISSE-NEXT: shrq %rax
378 ; GFNISSE-NEXT: andq %rcx, %rax
379 ; GFNISSE-NEXT: leaq (%rax,%rdx,2), %rax
382 ; GFNIAVX-LABEL: test_bitreverse_i64:
384 ; GFNIAVX-NEXT: bswapq %rdi
385 ; GFNIAVX-NEXT: movq %rdi, %rax
386 ; GFNIAVX-NEXT: shrq $4, %rax
387 ; GFNIAVX-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
388 ; GFNIAVX-NEXT: andq %rcx, %rax
389 ; GFNIAVX-NEXT: andq %rcx, %rdi
390 ; GFNIAVX-NEXT: shlq $4, %rdi
391 ; GFNIAVX-NEXT: orq %rax, %rdi
392 ; GFNIAVX-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
393 ; GFNIAVX-NEXT: movq %rdi, %rcx
394 ; GFNIAVX-NEXT: andq %rax, %rcx
395 ; GFNIAVX-NEXT: shrq $2, %rdi
396 ; GFNIAVX-NEXT: andq %rax, %rdi
397 ; GFNIAVX-NEXT: leaq (%rdi,%rcx,4), %rax
398 ; GFNIAVX-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
399 ; GFNIAVX-NEXT: movq %rax, %rdx
400 ; GFNIAVX-NEXT: andq %rcx, %rdx
401 ; GFNIAVX-NEXT: shrq %rax
402 ; GFNIAVX-NEXT: andq %rcx, %rax
403 ; GFNIAVX-NEXT: leaq (%rax,%rdx,2), %rax
405 %b = call i64 @llvm.bitreverse.i64(i64 %a)
; 128-bit vector case with byte elements: the body calls
; llvm.bitreverse.v16i8 on %a. Expected output differs per prefix group:
;  - SSE2 expects three shift/mask/or rounds (psrlw, pand, psllw or paddb,
;    por) using the splat byte constants 15, 51 and 85.
;  - SSSE3, AVX1, AVX2 and AVX512 expect the input split with a splat-15 mask
;    and psrlw $4, two pshufb/vpshufb lookups into 16-entry constant tables,
;    and a final por/vpor. They differ in how the splat-15 mask is loaded
;    (movdqa, vbroadcastss or vpbroadcastb).
;  - XOP expects a single vpperm with a constant-pool operand.
;  - GFNISSE and GFNIAVX expect a single gf2p8affineqb/vgf2p8affineqb with a
;    constant-pool matrix operand and immediate 0.
409 define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
410 ; SSE2-LABEL: test_bitreverse_v16i8:
412 ; SSE2-NEXT: movdqa %xmm0, %xmm1
413 ; SSE2-NEXT: psrlw $4, %xmm1
414 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
415 ; SSE2-NEXT: pand %xmm2, %xmm1
416 ; SSE2-NEXT: pand %xmm2, %xmm0
417 ; SSE2-NEXT: psllw $4, %xmm0
418 ; SSE2-NEXT: por %xmm1, %xmm0
419 ; SSE2-NEXT: movdqa %xmm0, %xmm1
420 ; SSE2-NEXT: psrlw $2, %xmm1
421 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
422 ; SSE2-NEXT: pand %xmm2, %xmm1
423 ; SSE2-NEXT: pand %xmm2, %xmm0
424 ; SSE2-NEXT: psllw $2, %xmm0
425 ; SSE2-NEXT: por %xmm1, %xmm0
426 ; SSE2-NEXT: movdqa %xmm0, %xmm1
427 ; SSE2-NEXT: psrlw $1, %xmm1
428 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
429 ; SSE2-NEXT: pand %xmm2, %xmm1
430 ; SSE2-NEXT: pand %xmm2, %xmm0
431 ; SSE2-NEXT: paddb %xmm0, %xmm0
432 ; SSE2-NEXT: por %xmm1, %xmm0
435 ; SSSE3-LABEL: test_bitreverse_v16i8:
437 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
438 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
439 ; SSSE3-NEXT: pand %xmm1, %xmm2
440 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
441 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
442 ; SSSE3-NEXT: psrlw $4, %xmm0
443 ; SSSE3-NEXT: pand %xmm1, %xmm0
444 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
445 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
446 ; SSSE3-NEXT: por %xmm3, %xmm1
447 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
450 ; AVX1-LABEL: test_bitreverse_v16i8:
452 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
453 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
454 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
455 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
456 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
457 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
458 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
459 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
460 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
463 ; AVX2-LABEL: test_bitreverse_v16i8:
465 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
466 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
467 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
468 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
469 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
470 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
471 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
472 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
473 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
476 ; AVX512-LABEL: test_bitreverse_v16i8:
478 ; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
479 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
480 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
481 ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
482 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
483 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
484 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
485 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
486 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
489 ; XOP-LABEL: test_bitreverse_v16i8:
491 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
494 ; GFNISSE-LABEL: test_bitreverse_v16i8:
496 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
499 ; GFNIAVX-LABEL: test_bitreverse_v16i8:
501 ; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
503 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
; 128-bit vector case with i16 elements: the body calls
; llvm.bitreverse.v8i16 on %a. Expected output differs per prefix group:
;  - SSE2 expects the two bytes of each element swapped with psrlw $8,
;    psllw $8 and por, then three shift/mask/or rounds using the splat byte
;    constants 15, 51 and 85.
;  - SSSE3, AVX1, AVX2 and AVX512 expect a pshufb/vpshufb with the pattern
;    [1,0,3,2,...], then the splat-15 mask, two table lookups and a por/vpor.
;  - XOP expects a single vpperm with a constant-pool operand.
;  - GFNISSE and GFNIAVX expect the same [1,0,3,2,...] shuffle followed by one
;    gf2p8affineqb/vgf2p8affineqb.
507 define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
508 ; SSE2-LABEL: test_bitreverse_v8i16:
510 ; SSE2-NEXT: movdqa %xmm0, %xmm1
511 ; SSE2-NEXT: psrlw $8, %xmm1
512 ; SSE2-NEXT: psllw $8, %xmm0
513 ; SSE2-NEXT: por %xmm1, %xmm0
514 ; SSE2-NEXT: movdqa %xmm0, %xmm1
515 ; SSE2-NEXT: psrlw $4, %xmm1
516 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
517 ; SSE2-NEXT: pand %xmm2, %xmm1
518 ; SSE2-NEXT: pand %xmm2, %xmm0
519 ; SSE2-NEXT: psllw $4, %xmm0
520 ; SSE2-NEXT: por %xmm1, %xmm0
521 ; SSE2-NEXT: movdqa %xmm0, %xmm1
522 ; SSE2-NEXT: psrlw $2, %xmm1
523 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
524 ; SSE2-NEXT: pand %xmm2, %xmm1
525 ; SSE2-NEXT: pand %xmm2, %xmm0
526 ; SSE2-NEXT: psllw $2, %xmm0
527 ; SSE2-NEXT: por %xmm1, %xmm0
528 ; SSE2-NEXT: movdqa %xmm0, %xmm1
529 ; SSE2-NEXT: psrlw $1, %xmm1
530 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
531 ; SSE2-NEXT: pand %xmm2, %xmm1
532 ; SSE2-NEXT: pand %xmm2, %xmm0
533 ; SSE2-NEXT: paddb %xmm0, %xmm0
534 ; SSE2-NEXT: por %xmm1, %xmm0
537 ; SSSE3-LABEL: test_bitreverse_v8i16:
539 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
540 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
541 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
542 ; SSSE3-NEXT: pand %xmm1, %xmm2
543 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
544 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
545 ; SSSE3-NEXT: psrlw $4, %xmm0
546 ; SSSE3-NEXT: pand %xmm1, %xmm0
547 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
548 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
549 ; SSSE3-NEXT: por %xmm3, %xmm1
550 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
553 ; AVX1-LABEL: test_bitreverse_v8i16:
555 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
556 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
557 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
558 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
559 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
560 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
561 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
562 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
563 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
564 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
567 ; AVX2-LABEL: test_bitreverse_v8i16:
569 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
570 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
571 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
572 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
573 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
574 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
575 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
576 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
577 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
578 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
581 ; AVX512-LABEL: test_bitreverse_v8i16:
583 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
584 ; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
585 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
586 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
587 ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
588 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
589 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
590 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
591 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
592 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
595 ; XOP-LABEL: test_bitreverse_v8i16:
597 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
600 ; GFNISSE-LABEL: test_bitreverse_v8i16:
602 ; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
603 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
606 ; GFNIAVX-LABEL: test_bitreverse_v8i16:
608 ; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
609 ; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
611 %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
; 128-bit vector case with i32 elements: the body calls
; llvm.bitreverse.v4i32 on %a. Expected output differs per prefix group:
;  - SSE2 expects the bytes reordered with an unpack against zero
;    (punpckhbw/punpcklbw), pshuflw/pshufhw with [3,2,1,0]-style patterns and
;    packuswb, then three shift/mask/or rounds using the splat byte
;    constants 15, 51 and 85.
;  - SSSE3, AVX1, AVX2 and AVX512 expect a pshufb/vpshufb with the pattern
;    [3,2,1,0,7,6,5,4,...], then the splat-15 mask, two table lookups and a
;    por/vpor.
;  - XOP expects a single vpperm with a constant-pool operand.
;  - GFNISSE and GFNIAVX expect the same [3,2,1,0,...] shuffle followed by one
;    gf2p8affineqb/vgf2p8affineqb.
615 define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
616 ; SSE2-LABEL: test_bitreverse_v4i32:
618 ; SSE2-NEXT: pxor %xmm1, %xmm1
619 ; SSE2-NEXT: movdqa %xmm0, %xmm2
620 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
621 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
622 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
623 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
624 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
625 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
626 ; SSE2-NEXT: packuswb %xmm2, %xmm0
627 ; SSE2-NEXT: movdqa %xmm0, %xmm1
628 ; SSE2-NEXT: psrlw $4, %xmm1
629 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
630 ; SSE2-NEXT: pand %xmm2, %xmm1
631 ; SSE2-NEXT: pand %xmm2, %xmm0
632 ; SSE2-NEXT: psllw $4, %xmm0
633 ; SSE2-NEXT: por %xmm1, %xmm0
634 ; SSE2-NEXT: movdqa %xmm0, %xmm1
635 ; SSE2-NEXT: psrlw $2, %xmm1
636 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
637 ; SSE2-NEXT: pand %xmm2, %xmm1
638 ; SSE2-NEXT: pand %xmm2, %xmm0
639 ; SSE2-NEXT: psllw $2, %xmm0
640 ; SSE2-NEXT: por %xmm1, %xmm0
641 ; SSE2-NEXT: movdqa %xmm0, %xmm1
642 ; SSE2-NEXT: psrlw $1, %xmm1
643 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
644 ; SSE2-NEXT: pand %xmm2, %xmm1
645 ; SSE2-NEXT: pand %xmm2, %xmm0
646 ; SSE2-NEXT: paddb %xmm0, %xmm0
647 ; SSE2-NEXT: por %xmm1, %xmm0
650 ; SSSE3-LABEL: test_bitreverse_v4i32:
652 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
653 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
654 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
655 ; SSSE3-NEXT: pand %xmm1, %xmm2
656 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
657 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
658 ; SSSE3-NEXT: psrlw $4, %xmm0
659 ; SSSE3-NEXT: pand %xmm1, %xmm0
660 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
661 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
662 ; SSSE3-NEXT: por %xmm3, %xmm1
663 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
666 ; AVX1-LABEL: test_bitreverse_v4i32:
668 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
669 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
670 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
671 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
672 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
673 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
674 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
675 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
676 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
677 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
680 ; AVX2-LABEL: test_bitreverse_v4i32:
682 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
683 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
684 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
685 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
686 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
687 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
688 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
689 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
690 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
691 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
694 ; AVX512-LABEL: test_bitreverse_v4i32:
696 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
697 ; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
698 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
699 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
700 ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
701 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
702 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
703 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
704 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
705 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
708 ; XOP-LABEL: test_bitreverse_v4i32:
710 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
713 ; GFNISSE-LABEL: test_bitreverse_v4i32:
715 ; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
716 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
719 ; GFNIAVX-LABEL: test_bitreverse_v4i32:
721 ; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
722 ; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
724 %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
; 128-bit vector case with i64 elements: the body calls
; llvm.bitreverse.v2i64 on %a. Expected output differs per prefix group:
;  - SSE2 expects the bytes reordered with an unpack against zero
;    (punpckhbw/punpcklbw), pshufd [2,3,0,1], pshuflw/pshufhw and packuswb,
;    then three shift/mask/or rounds using the splat byte constants 15, 51
;    and 85.
;  - SSSE3, AVX1, AVX2 and AVX512 expect a pshufb/vpshufb with the pattern
;    [7,6,5,4,3,2,1,0,15,...], then the splat-15 mask, two table lookups and a
;    por/vpor.
;  - XOP expects a single vpperm with a constant-pool operand.
;  - GFNISSE and GFNIAVX expect the same [7,6,5,...] shuffle followed by one
;    gf2p8affineqb/vgf2p8affineqb.
728 define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
729 ; SSE2-LABEL: test_bitreverse_v2i64:
731 ; SSE2-NEXT: pxor %xmm1, %xmm1
732 ; SSE2-NEXT: movdqa %xmm0, %xmm2
733 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
734 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
735 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
736 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
737 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
738 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
739 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
740 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
741 ; SSE2-NEXT: packuswb %xmm2, %xmm0
742 ; SSE2-NEXT: movdqa %xmm0, %xmm1
743 ; SSE2-NEXT: psrlw $4, %xmm1
744 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
745 ; SSE2-NEXT: pand %xmm2, %xmm1
746 ; SSE2-NEXT: pand %xmm2, %xmm0
747 ; SSE2-NEXT: psllw $4, %xmm0
748 ; SSE2-NEXT: por %xmm1, %xmm0
749 ; SSE2-NEXT: movdqa %xmm0, %xmm1
750 ; SSE2-NEXT: psrlw $2, %xmm1
751 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
752 ; SSE2-NEXT: pand %xmm2, %xmm1
753 ; SSE2-NEXT: pand %xmm2, %xmm0
754 ; SSE2-NEXT: psllw $2, %xmm0
755 ; SSE2-NEXT: por %xmm1, %xmm0
756 ; SSE2-NEXT: movdqa %xmm0, %xmm1
757 ; SSE2-NEXT: psrlw $1, %xmm1
758 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
759 ; SSE2-NEXT: pand %xmm2, %xmm1
760 ; SSE2-NEXT: pand %xmm2, %xmm0
761 ; SSE2-NEXT: paddb %xmm0, %xmm0
762 ; SSE2-NEXT: por %xmm1, %xmm0
765 ; SSSE3-LABEL: test_bitreverse_v2i64:
767 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
768 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
769 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
770 ; SSSE3-NEXT: pand %xmm1, %xmm2
771 ; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
772 ; SSSE3-NEXT: pshufb %xmm2, %xmm3
773 ; SSSE3-NEXT: psrlw $4, %xmm0
774 ; SSSE3-NEXT: pand %xmm1, %xmm0
775 ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
776 ; SSSE3-NEXT: pshufb %xmm0, %xmm1
777 ; SSSE3-NEXT: por %xmm3, %xmm1
778 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
781 ; AVX1-LABEL: test_bitreverse_v2i64:
783 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
784 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
785 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
786 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
787 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
788 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
789 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
790 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
791 ; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
792 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
795 ; AVX2-LABEL: test_bitreverse_v2i64:
797 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
798 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
799 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
800 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
801 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
802 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
803 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
804 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
805 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
806 ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
809 ; AVX512-LABEL: test_bitreverse_v2i64:
811 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
812 ; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
813 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
814 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
815 ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
816 ; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
817 ; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
818 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
819 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
820 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
823 ; XOP-LABEL: test_bitreverse_v2i64:
825 ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
828 ; GFNISSE-LABEL: test_bitreverse_v2i64:
830 ; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
831 ; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
834 ; GFNIAVX-LABEL: test_bitreverse_v2i64:
836 ; GFNIAVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
837 ; GFNIAVX-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
839 %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
; Exercises @llvm.bitreverse.v32i8 on a <32 x i8> argument.
; In the expectations below, the SSE2 run uses shift-and-mask steps of 4, 2 and
; 1 bits, the SSSE3 and AVX runs use pshufb lookups on the low and high nibbles,
; the XOP runs use vpperm, and the GFNI runs use gf2p8affineqb.
; These expectations are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
843 define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
844 ; SSE2-LABEL: test_bitreverse_v32i8:
846 ; SSE2-NEXT: movdqa %xmm0, %xmm3
847 ; SSE2-NEXT: psrlw $4, %xmm3
848 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
849 ; SSE2-NEXT: pand %xmm2, %xmm3
850 ; SSE2-NEXT: pand %xmm2, %xmm0
851 ; SSE2-NEXT: psllw $4, %xmm0
852 ; SSE2-NEXT: por %xmm3, %xmm0
853 ; SSE2-NEXT: movdqa %xmm0, %xmm4
854 ; SSE2-NEXT: psrlw $2, %xmm4
855 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
856 ; SSE2-NEXT: pand %xmm3, %xmm4
857 ; SSE2-NEXT: pand %xmm3, %xmm0
858 ; SSE2-NEXT: psllw $2, %xmm0
859 ; SSE2-NEXT: por %xmm4, %xmm0
860 ; SSE2-NEXT: movdqa %xmm0, %xmm5
861 ; SSE2-NEXT: psrlw $1, %xmm5
862 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
863 ; SSE2-NEXT: pand %xmm4, %xmm5
864 ; SSE2-NEXT: pand %xmm4, %xmm0
865 ; SSE2-NEXT: paddb %xmm0, %xmm0
866 ; SSE2-NEXT: por %xmm5, %xmm0
867 ; SSE2-NEXT: movdqa %xmm1, %xmm5
868 ; SSE2-NEXT: psrlw $4, %xmm5
869 ; SSE2-NEXT: pand %xmm2, %xmm5
870 ; SSE2-NEXT: pand %xmm2, %xmm1
871 ; SSE2-NEXT: psllw $4, %xmm1
872 ; SSE2-NEXT: por %xmm5, %xmm1
873 ; SSE2-NEXT: movdqa %xmm1, %xmm2
874 ; SSE2-NEXT: psrlw $2, %xmm2
875 ; SSE2-NEXT: pand %xmm3, %xmm2
876 ; SSE2-NEXT: pand %xmm3, %xmm1
877 ; SSE2-NEXT: psllw $2, %xmm1
878 ; SSE2-NEXT: por %xmm2, %xmm1
879 ; SSE2-NEXT: movdqa %xmm1, %xmm2
880 ; SSE2-NEXT: psrlw $1, %xmm2
881 ; SSE2-NEXT: pand %xmm4, %xmm2
882 ; SSE2-NEXT: pand %xmm4, %xmm1
883 ; SSE2-NEXT: paddb %xmm1, %xmm1
884 ; SSE2-NEXT: por %xmm2, %xmm1
887 ; SSSE3-LABEL: test_bitreverse_v32i8:
889 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
890 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
891 ; SSSE3-NEXT: pand %xmm4, %xmm2
892 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
893 ; SSSE3-NEXT: movdqa %xmm5, %xmm6
894 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
895 ; SSSE3-NEXT: psrlw $4, %xmm0
896 ; SSSE3-NEXT: pand %xmm4, %xmm0
897 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
898 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
899 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
900 ; SSSE3-NEXT: por %xmm6, %xmm3
901 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
902 ; SSSE3-NEXT: pand %xmm4, %xmm0
903 ; SSSE3-NEXT: pshufb %xmm0, %xmm5
904 ; SSSE3-NEXT: psrlw $4, %xmm1
905 ; SSSE3-NEXT: pand %xmm4, %xmm1
906 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
907 ; SSSE3-NEXT: por %xmm5, %xmm2
908 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
909 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
912 ; AVX1-LABEL: test_bitreverse_v32i8:
914 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
915 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
916 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
917 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
918 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
919 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
920 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
921 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
922 ; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1
923 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
924 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm3
925 ; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
926 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
927 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
928 ; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0
929 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
930 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
933 ; AVX2-LABEL: test_bitreverse_v32i8:
935 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
936 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
937 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
938 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
939 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
940 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
941 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
942 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
943 ; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
944 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
945 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
948 ; AVX512-LABEL: test_bitreverse_v32i8:
950 ; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
951 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
952 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
953 ; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
954 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
955 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
956 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
957 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
958 ; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
959 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
960 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
963 ; XOPAVX1-LABEL: test_bitreverse_v32i8:
965 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
966 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
967 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
968 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
969 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
972 ; XOPAVX2-LABEL: test_bitreverse_v32i8:
974 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
975 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
976 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
977 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
978 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
981 ; GFNISSE-LABEL: test_bitreverse_v32i8:
983 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745]
984 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm0
985 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm2, %xmm1
988 ; GFNIAVX1-LABEL: test_bitreverse_v32i8:
990 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
991 ; GFNIAVX1-NEXT: retq
993 ; GFNIAVX2-LABEL: test_bitreverse_v32i8:
995 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
996 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
997 ; GFNIAVX2-NEXT: retq
999 ; GFNIAVX512-LABEL: test_bitreverse_v32i8:
1000 ; GFNIAVX512: # %bb.0:
1001 ; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1002 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1003 ; GFNIAVX512-NEXT: retq
1004 %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
; Exercises @llvm.bitreverse.v16i16 on a <16 x i16> argument.
; In the expectations below, each run first swaps the two bytes of every 16-bit
; element (psrlw/psllw by 8 on SSE2, a pshufb with indices 1,0,3,2,... on the
; SSSE3, AVX and GFNI runs) and then applies per-byte steps. The XOP runs use a
; single vpperm per 128-bit half.
; These expectations are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
1008 define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
1009 ; SSE2-LABEL: test_bitreverse_v16i16:
1011 ; SSE2-NEXT: movdqa %xmm0, %xmm2
1012 ; SSE2-NEXT: psrlw $8, %xmm2
1013 ; SSE2-NEXT: psllw $8, %xmm0
1014 ; SSE2-NEXT: por %xmm2, %xmm0
1015 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1016 ; SSE2-NEXT: psrlw $4, %xmm3
1017 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1018 ; SSE2-NEXT: pand %xmm2, %xmm3
1019 ; SSE2-NEXT: pand %xmm2, %xmm0
1020 ; SSE2-NEXT: psllw $4, %xmm0
1021 ; SSE2-NEXT: por %xmm3, %xmm0
1022 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1023 ; SSE2-NEXT: psrlw $2, %xmm4
1024 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1025 ; SSE2-NEXT: pand %xmm3, %xmm4
1026 ; SSE2-NEXT: pand %xmm3, %xmm0
1027 ; SSE2-NEXT: psllw $2, %xmm0
1028 ; SSE2-NEXT: por %xmm4, %xmm0
1029 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1030 ; SSE2-NEXT: psrlw $1, %xmm5
1031 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1032 ; SSE2-NEXT: pand %xmm4, %xmm5
1033 ; SSE2-NEXT: pand %xmm4, %xmm0
1034 ; SSE2-NEXT: paddb %xmm0, %xmm0
1035 ; SSE2-NEXT: por %xmm5, %xmm0
1036 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1037 ; SSE2-NEXT: psrlw $8, %xmm5
1038 ; SSE2-NEXT: psllw $8, %xmm1
1039 ; SSE2-NEXT: por %xmm5, %xmm1
1040 ; SSE2-NEXT: movdqa %xmm1, %xmm5
1041 ; SSE2-NEXT: psrlw $4, %xmm5
1042 ; SSE2-NEXT: pand %xmm2, %xmm5
1043 ; SSE2-NEXT: pand %xmm2, %xmm1
1044 ; SSE2-NEXT: psllw $4, %xmm1
1045 ; SSE2-NEXT: por %xmm5, %xmm1
1046 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1047 ; SSE2-NEXT: psrlw $2, %xmm2
1048 ; SSE2-NEXT: pand %xmm3, %xmm2
1049 ; SSE2-NEXT: pand %xmm3, %xmm1
1050 ; SSE2-NEXT: psllw $2, %xmm1
1051 ; SSE2-NEXT: por %xmm2, %xmm1
1052 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1053 ; SSE2-NEXT: psrlw $1, %xmm2
1054 ; SSE2-NEXT: pand %xmm4, %xmm2
1055 ; SSE2-NEXT: pand %xmm4, %xmm1
1056 ; SSE2-NEXT: paddb %xmm1, %xmm1
1057 ; SSE2-NEXT: por %xmm2, %xmm1
1060 ; SSSE3-LABEL: test_bitreverse_v16i16:
1062 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1063 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1064 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1065 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1066 ; SSSE3-NEXT: pand %xmm5, %xmm2
1067 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1068 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1069 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1070 ; SSSE3-NEXT: psrlw $4, %xmm0
1071 ; SSSE3-NEXT: pand %xmm5, %xmm0
1072 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1073 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
1074 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1075 ; SSSE3-NEXT: por %xmm7, %xmm3
1076 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
1077 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1078 ; SSSE3-NEXT: pand %xmm5, %xmm0
1079 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1080 ; SSSE3-NEXT: psrlw $4, %xmm1
1081 ; SSSE3-NEXT: pand %xmm5, %xmm1
1082 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
1083 ; SSSE3-NEXT: por %xmm6, %xmm2
1084 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1085 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1088 ; AVX1-LABEL: test_bitreverse_v16i16:
1090 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1091 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1092 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1093 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1094 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1095 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1096 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1097 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1098 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1099 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1100 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1101 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1102 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1103 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
1104 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
1105 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1106 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1107 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1108 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1109 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1112 ; AVX2-LABEL: test_bitreverse_v16i16:
1114 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1115 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1116 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
1117 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1118 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1119 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1120 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1121 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1122 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1123 ; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
1124 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1125 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
1128 ; AVX512-LABEL: test_bitreverse_v16i16:
1130 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1131 ; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1132 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
1133 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1134 ; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
1135 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1136 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
1137 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1138 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1139 ; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
1140 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1141 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
1144 ; XOPAVX1-LABEL: test_bitreverse_v16i16:
1146 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1147 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1148 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1149 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1150 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1151 ; XOPAVX1-NEXT: retq
1153 ; XOPAVX2-LABEL: test_bitreverse_v16i16:
1155 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1156 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
1157 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1158 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1159 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1160 ; XOPAVX2-NEXT: retq
1162 ; GFNISSE-LABEL: test_bitreverse_v16i16:
1164 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1165 ; GFNISSE-NEXT: pshufb %xmm2, %xmm0
1166 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1167 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
1168 ; GFNISSE-NEXT: pshufb %xmm2, %xmm1
1169 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
1170 ; GFNISSE-NEXT: retq
1172 ; GFNIAVX1-LABEL: test_bitreverse_v16i16:
1173 ; GFNIAVX1: # %bb.0:
1174 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1175 ; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1176 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1177 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1178 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1179 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1180 ; GFNIAVX1-NEXT: retq
1182 ; GFNIAVX2-LABEL: test_bitreverse_v16i16:
1183 ; GFNIAVX2: # %bb.0:
1184 ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1185 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1186 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1187 ; GFNIAVX2-NEXT: retq
1189 ; GFNIAVX512-LABEL: test_bitreverse_v16i16:
1190 ; GFNIAVX512: # %bb.0:
1191 ; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
1192 ; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1193 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1194 ; GFNIAVX512-NEXT: retq
1195 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
; Exercises @llvm.bitreverse.v8i32 on a <8 x i32> argument.
; In the expectations below, each run first reverses the byte order inside
; every 32-bit element (an unpack / pshuflw / pshufhw / packuswb sequence on
; SSE2, a pshufb with indices 3,2,1,0,... on the SSSE3, AVX and GFNI runs) and
; then applies per-byte steps. The XOP runs use a single vpperm per 128-bit half.
; These expectations are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
1199 define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
1200 ; SSE2-LABEL: test_bitreverse_v8i32:
1202 ; SSE2-NEXT: pxor %xmm2, %xmm2
1203 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1204 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1205 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1206 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1207 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1208 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1209 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1210 ; SSE2-NEXT: packuswb %xmm3, %xmm0
1211 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1212 ; SSE2-NEXT: psrlw $4, %xmm4
1213 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1214 ; SSE2-NEXT: pand %xmm3, %xmm4
1215 ; SSE2-NEXT: pand %xmm3, %xmm0
1216 ; SSE2-NEXT: psllw $4, %xmm0
1217 ; SSE2-NEXT: por %xmm4, %xmm0
1218 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1219 ; SSE2-NEXT: psrlw $2, %xmm5
1220 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1221 ; SSE2-NEXT: pand %xmm4, %xmm5
1222 ; SSE2-NEXT: pand %xmm4, %xmm0
1223 ; SSE2-NEXT: psllw $2, %xmm0
1224 ; SSE2-NEXT: por %xmm5, %xmm0
1225 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1226 ; SSE2-NEXT: psrlw $1, %xmm6
1227 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1228 ; SSE2-NEXT: pand %xmm5, %xmm6
1229 ; SSE2-NEXT: pand %xmm5, %xmm0
1230 ; SSE2-NEXT: paddb %xmm0, %xmm0
1231 ; SSE2-NEXT: por %xmm6, %xmm0
1232 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1233 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
1234 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1235 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1236 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1237 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1238 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1239 ; SSE2-NEXT: packuswb %xmm6, %xmm1
1240 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1241 ; SSE2-NEXT: psrlw $4, %xmm2
1242 ; SSE2-NEXT: pand %xmm3, %xmm2
1243 ; SSE2-NEXT: pand %xmm3, %xmm1
1244 ; SSE2-NEXT: psllw $4, %xmm1
1245 ; SSE2-NEXT: por %xmm2, %xmm1
1246 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1247 ; SSE2-NEXT: psrlw $2, %xmm2
1248 ; SSE2-NEXT: pand %xmm4, %xmm2
1249 ; SSE2-NEXT: pand %xmm4, %xmm1
1250 ; SSE2-NEXT: psllw $2, %xmm1
1251 ; SSE2-NEXT: por %xmm2, %xmm1
1252 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1253 ; SSE2-NEXT: psrlw $1, %xmm2
1254 ; SSE2-NEXT: pand %xmm5, %xmm2
1255 ; SSE2-NEXT: pand %xmm5, %xmm1
1256 ; SSE2-NEXT: paddb %xmm1, %xmm1
1257 ; SSE2-NEXT: por %xmm2, %xmm1
1260 ; SSSE3-LABEL: test_bitreverse_v8i32:
1262 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1263 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1264 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1265 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1266 ; SSSE3-NEXT: pand %xmm5, %xmm2
1267 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1268 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1269 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1270 ; SSSE3-NEXT: psrlw $4, %xmm0
1271 ; SSSE3-NEXT: pand %xmm5, %xmm0
1272 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1273 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
1274 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1275 ; SSSE3-NEXT: por %xmm7, %xmm3
1276 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
1277 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1278 ; SSSE3-NEXT: pand %xmm5, %xmm0
1279 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1280 ; SSSE3-NEXT: psrlw $4, %xmm1
1281 ; SSSE3-NEXT: pand %xmm5, %xmm1
1282 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
1283 ; SSSE3-NEXT: por %xmm6, %xmm2
1284 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1285 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1288 ; AVX1-LABEL: test_bitreverse_v8i32:
1290 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1291 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1292 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1293 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1294 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1295 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1296 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1297 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1298 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1299 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1300 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1301 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1302 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1303 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
1304 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
1305 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1306 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1307 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1308 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1309 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1312 ; AVX2-LABEL: test_bitreverse_v8i32:
1314 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1315 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1316 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
1317 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1318 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1319 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1320 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1321 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1322 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1323 ; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
1324 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1325 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
1328 ; AVX512-LABEL: test_bitreverse_v8i32:
1330 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1331 ; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1332 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
1333 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1334 ; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
1335 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1336 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
1337 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1338 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1339 ; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
1340 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1341 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
1344 ; XOPAVX1-LABEL: test_bitreverse_v8i32:
1346 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1347 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1348 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1349 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1350 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1351 ; XOPAVX1-NEXT: retq
1353 ; XOPAVX2-LABEL: test_bitreverse_v8i32:
1355 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1356 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
1357 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1358 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1359 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1360 ; XOPAVX2-NEXT: retq
1362 ; GFNISSE-LABEL: test_bitreverse_v8i32:
1364 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1365 ; GFNISSE-NEXT: pshufb %xmm2, %xmm0
1366 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1367 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
1368 ; GFNISSE-NEXT: pshufb %xmm2, %xmm1
1369 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
1370 ; GFNISSE-NEXT: retq
1372 ; GFNIAVX1-LABEL: test_bitreverse_v8i32:
1373 ; GFNIAVX1: # %bb.0:
1374 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1375 ; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
1376 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1377 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1378 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1379 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1380 ; GFNIAVX1-NEXT: retq
1382 ; GFNIAVX2-LABEL: test_bitreverse_v8i32:
1383 ; GFNIAVX2: # %bb.0:
1384 ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1385 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1386 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1387 ; GFNIAVX2-NEXT: retq
1389 ; GFNIAVX512-LABEL: test_bitreverse_v8i32:
1390 ; GFNIAVX512: # %bb.0:
1391 ; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
1392 ; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1393 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1394 ; GFNIAVX512-NEXT: retq
1395 %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
1399 define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1400 ; SSE2-LABEL: test_bitreverse_v4i64:
1402 ; SSE2-NEXT: pxor %xmm2, %xmm2
1403 ; SSE2-NEXT: movdqa %xmm0, %xmm3
1404 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1405 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1406 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1407 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1408 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1409 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1410 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1411 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1412 ; SSE2-NEXT: packuswb %xmm3, %xmm0
1413 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1414 ; SSE2-NEXT: psrlw $4, %xmm4
1415 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1416 ; SSE2-NEXT: pand %xmm3, %xmm4
1417 ; SSE2-NEXT: pand %xmm3, %xmm0
1418 ; SSE2-NEXT: psllw $4, %xmm0
1419 ; SSE2-NEXT: por %xmm4, %xmm0
1420 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1421 ; SSE2-NEXT: psrlw $2, %xmm5
1422 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1423 ; SSE2-NEXT: pand %xmm4, %xmm5
1424 ; SSE2-NEXT: pand %xmm4, %xmm0
1425 ; SSE2-NEXT: psllw $2, %xmm0
1426 ; SSE2-NEXT: por %xmm5, %xmm0
1427 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1428 ; SSE2-NEXT: psrlw $1, %xmm6
1429 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1430 ; SSE2-NEXT: pand %xmm5, %xmm6
1431 ; SSE2-NEXT: pand %xmm5, %xmm0
1432 ; SSE2-NEXT: paddb %xmm0, %xmm0
1433 ; SSE2-NEXT: por %xmm6, %xmm0
1434 ; SSE2-NEXT: movdqa %xmm1, %xmm6
1435 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
1436 ; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1437 ; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1438 ; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1439 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1440 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1441 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1442 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1443 ; SSE2-NEXT: packuswb %xmm6, %xmm1
1444 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1445 ; SSE2-NEXT: psrlw $4, %xmm2
1446 ; SSE2-NEXT: pand %xmm3, %xmm2
1447 ; SSE2-NEXT: pand %xmm3, %xmm1
1448 ; SSE2-NEXT: psllw $4, %xmm1
1449 ; SSE2-NEXT: por %xmm2, %xmm1
1450 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1451 ; SSE2-NEXT: psrlw $2, %xmm2
1452 ; SSE2-NEXT: pand %xmm4, %xmm2
1453 ; SSE2-NEXT: pand %xmm4, %xmm1
1454 ; SSE2-NEXT: psllw $2, %xmm1
1455 ; SSE2-NEXT: por %xmm2, %xmm1
1456 ; SSE2-NEXT: movdqa %xmm1, %xmm2
1457 ; SSE2-NEXT: psrlw $1, %xmm2
1458 ; SSE2-NEXT: pand %xmm5, %xmm2
1459 ; SSE2-NEXT: pand %xmm5, %xmm1
1460 ; SSE2-NEXT: paddb %xmm1, %xmm1
1461 ; SSE2-NEXT: por %xmm2, %xmm1
1464 ; SSSE3-LABEL: test_bitreverse_v4i64:
1466 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1467 ; SSSE3-NEXT: pshufb %xmm4, %xmm0
1468 ; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1469 ; SSSE3-NEXT: movdqa %xmm0, %xmm2
1470 ; SSSE3-NEXT: pand %xmm5, %xmm2
1471 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1472 ; SSSE3-NEXT: movdqa %xmm6, %xmm7
1473 ; SSSE3-NEXT: pshufb %xmm2, %xmm7
1474 ; SSSE3-NEXT: psrlw $4, %xmm0
1475 ; SSSE3-NEXT: pand %xmm5, %xmm0
1476 ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1477 ; SSSE3-NEXT: movdqa %xmm2, %xmm3
1478 ; SSSE3-NEXT: pshufb %xmm0, %xmm3
1479 ; SSSE3-NEXT: por %xmm7, %xmm3
1480 ; SSSE3-NEXT: pshufb %xmm4, %xmm1
1481 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1482 ; SSSE3-NEXT: pand %xmm5, %xmm0
1483 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1484 ; SSSE3-NEXT: psrlw $4, %xmm1
1485 ; SSSE3-NEXT: pand %xmm5, %xmm1
1486 ; SSSE3-NEXT: pshufb %xmm1, %xmm2
1487 ; SSSE3-NEXT: por %xmm6, %xmm2
1488 ; SSSE3-NEXT: movdqa %xmm3, %xmm0
1489 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1492 ; AVX1-LABEL: test_bitreverse_v4i64:
1494 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1495 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1496 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1497 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1498 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1499 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1500 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1501 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1502 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1503 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1504 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1505 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1506 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1507 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2
1508 ; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2
1509 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1510 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1511 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1512 ; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
1513 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1516 ; AVX2-LABEL: test_bitreverse_v4i64:
1518 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1519 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1520 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
1521 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1522 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1523 ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1524 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1525 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
1526 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1527 ; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
1528 ; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1529 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
1532 ; AVX512-LABEL: test_bitreverse_v4i64:
1534 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1535 ; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1536 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
1537 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1538 ; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
1539 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
1540 ; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
1541 ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
1542 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1543 ; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
1544 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
1545 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
1548 ; XOPAVX1-LABEL: test_bitreverse_v4i64:
1550 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1551 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1552 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1553 ; XOPAVX1-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1554 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1555 ; XOPAVX1-NEXT: retq
1557 ; XOPAVX2-LABEL: test_bitreverse_v4i64:
1559 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1560 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1561 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
1562 ; XOPAVX2-NEXT: vpperm %xmm2, %xmm0, %xmm0, %xmm0
1563 ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
1564 ; XOPAVX2-NEXT: retq
1566 ; GFNISSE-LABEL: test_bitreverse_v4i64:
1568 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1569 ; GFNISSE-NEXT: pshufb %xmm2, %xmm0
1570 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [9241421688590303745,9241421688590303745]
1571 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0
1572 ; GFNISSE-NEXT: pshufb %xmm2, %xmm1
1573 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1
1574 ; GFNISSE-NEXT: retq
1576 ; GFNIAVX1-LABEL: test_bitreverse_v4i64:
1577 ; GFNIAVX1: # %bb.0:
1578 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1579 ; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1580 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
1581 ; GFNIAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
1582 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1583 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1584 ; GFNIAVX1-NEXT: retq
1586 ; GFNIAVX2-LABEL: test_bitreverse_v4i64:
1587 ; GFNIAVX2: # %bb.0:
1588 ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1589 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1590 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1591 ; GFNIAVX2-NEXT: retq
1593 ; GFNIAVX512-LABEL: test_bitreverse_v4i64:
1594 ; GFNIAVX512: # %bb.0:
1595 ; GFNIAVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1596 ; GFNIAVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1597 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1598 ; GFNIAVX512-NEXT: retq
1599 %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
; Bit-reverse a <64 x i8> vector with @llvm.bitreverse.v64i8 and pin the code
; llc emits for every check prefix declared by the RUN lines at the top of the
; file. The check lines are autogenerated (see the NOTE on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; Shape of the expectations below, per prefix family -
;   SSE2 expectations use three shift/mask/or stages per xmm register, with
;     the masks 15 (shift by 4), 51 (shift by 2) and 85 (shift by 1, the left
;     shift being done with paddb).
;   SSSE3, AVX1, AVX2, AVX512F and AVX512BW expectations mask the low nibble
;     and the shifted-down high nibble with 15 and use each as a pshufb or
;     vpshufb index into a 16-entry table, then OR the two lookups.
;   XOPAVX1 and XOPAVX2 expectations use one vpperm per 128-bit half with the
;     selector [80..95].
;   GFNI expectations use one gf2p8affineqb per register. Where the matrix
;     operand is printed it is 9241421688590303745 per qword, which is the
;     byte sequence 1,2,4,8,16,32,64,128 shown for the GFNIAVX1 prefix.
1603 define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1604 ; SSE2-LABEL: test_bitreverse_v64i8:
1606 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1607 ; SSE2-NEXT: psrlw $4, %xmm5
1608 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1609 ; SSE2-NEXT: pand %xmm4, %xmm5
1610 ; SSE2-NEXT: pand %xmm4, %xmm0
1611 ; SSE2-NEXT: psllw $4, %xmm0
1612 ; SSE2-NEXT: por %xmm5, %xmm0
1613 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1614 ; SSE2-NEXT: psrlw $2, %xmm6
1615 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1616 ; SSE2-NEXT: pand %xmm5, %xmm6
1617 ; SSE2-NEXT: pand %xmm5, %xmm0
1618 ; SSE2-NEXT: psllw $2, %xmm0
1619 ; SSE2-NEXT: por %xmm6, %xmm0
1620 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1621 ; SSE2-NEXT: psrlw $1, %xmm7
1622 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1623 ; SSE2-NEXT: pand %xmm6, %xmm7
1624 ; SSE2-NEXT: pand %xmm6, %xmm0
1625 ; SSE2-NEXT: paddb %xmm0, %xmm0
1626 ; SSE2-NEXT: por %xmm7, %xmm0
1627 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1628 ; SSE2-NEXT: psrlw $4, %xmm7
1629 ; SSE2-NEXT: pand %xmm4, %xmm7
1630 ; SSE2-NEXT: pand %xmm4, %xmm1
1631 ; SSE2-NEXT: psllw $4, %xmm1
1632 ; SSE2-NEXT: por %xmm7, %xmm1
1633 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1634 ; SSE2-NEXT: psrlw $2, %xmm7
1635 ; SSE2-NEXT: pand %xmm5, %xmm7
1636 ; SSE2-NEXT: pand %xmm5, %xmm1
1637 ; SSE2-NEXT: psllw $2, %xmm1
1638 ; SSE2-NEXT: por %xmm7, %xmm1
1639 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1640 ; SSE2-NEXT: psrlw $1, %xmm7
1641 ; SSE2-NEXT: pand %xmm6, %xmm7
1642 ; SSE2-NEXT: pand %xmm6, %xmm1
1643 ; SSE2-NEXT: paddb %xmm1, %xmm1
1644 ; SSE2-NEXT: por %xmm7, %xmm1
1645 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1646 ; SSE2-NEXT: psrlw $4, %xmm7
1647 ; SSE2-NEXT: pand %xmm4, %xmm7
1648 ; SSE2-NEXT: pand %xmm4, %xmm2
1649 ; SSE2-NEXT: psllw $4, %xmm2
1650 ; SSE2-NEXT: por %xmm7, %xmm2
1651 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1652 ; SSE2-NEXT: psrlw $2, %xmm7
1653 ; SSE2-NEXT: pand %xmm5, %xmm7
1654 ; SSE2-NEXT: pand %xmm5, %xmm2
1655 ; SSE2-NEXT: psllw $2, %xmm2
1656 ; SSE2-NEXT: por %xmm7, %xmm2
1657 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1658 ; SSE2-NEXT: psrlw $1, %xmm7
1659 ; SSE2-NEXT: pand %xmm6, %xmm7
1660 ; SSE2-NEXT: pand %xmm6, %xmm2
1661 ; SSE2-NEXT: paddb %xmm2, %xmm2
1662 ; SSE2-NEXT: por %xmm7, %xmm2
1663 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1664 ; SSE2-NEXT: psrlw $4, %xmm7
1665 ; SSE2-NEXT: pand %xmm4, %xmm7
1666 ; SSE2-NEXT: pand %xmm4, %xmm3
1667 ; SSE2-NEXT: psllw $4, %xmm3
1668 ; SSE2-NEXT: por %xmm7, %xmm3
1669 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1670 ; SSE2-NEXT: psrlw $2, %xmm4
1671 ; SSE2-NEXT: pand %xmm5, %xmm4
1672 ; SSE2-NEXT: pand %xmm5, %xmm3
1673 ; SSE2-NEXT: psllw $2, %xmm3
1674 ; SSE2-NEXT: por %xmm4, %xmm3
1675 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1676 ; SSE2-NEXT: psrlw $1, %xmm4
1677 ; SSE2-NEXT: pand %xmm6, %xmm4
1678 ; SSE2-NEXT: pand %xmm6, %xmm3
1679 ; SSE2-NEXT: paddb %xmm3, %xmm3
1680 ; SSE2-NEXT: por %xmm4, %xmm3
1683 ; SSSE3-LABEL: test_bitreverse_v64i8:
1685 ; SSSE3-NEXT: movdqa %xmm0, %xmm5
1686 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1687 ; SSSE3-NEXT: pand %xmm8, %xmm0
1688 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1689 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1690 ; SSSE3-NEXT: pshufb %xmm0, %xmm6
1691 ; SSSE3-NEXT: psrlw $4, %xmm5
1692 ; SSSE3-NEXT: pand %xmm8, %xmm5
1693 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1694 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
1695 ; SSSE3-NEXT: pshufb %xmm5, %xmm0
1696 ; SSSE3-NEXT: por %xmm6, %xmm0
1697 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
1698 ; SSSE3-NEXT: pand %xmm8, %xmm5
1699 ; SSSE3-NEXT: movdqa %xmm7, %xmm6
1700 ; SSSE3-NEXT: pshufb %xmm5, %xmm6
1701 ; SSSE3-NEXT: psrlw $4, %xmm1
1702 ; SSSE3-NEXT: pand %xmm8, %xmm1
1703 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
1704 ; SSSE3-NEXT: pshufb %xmm1, %xmm5
1705 ; SSSE3-NEXT: por %xmm6, %xmm5
1706 ; SSSE3-NEXT: movdqa %xmm2, %xmm1
1707 ; SSSE3-NEXT: pand %xmm8, %xmm1
1708 ; SSSE3-NEXT: movdqa %xmm7, %xmm9
1709 ; SSSE3-NEXT: pshufb %xmm1, %xmm9
1710 ; SSSE3-NEXT: psrlw $4, %xmm2
1711 ; SSSE3-NEXT: pand %xmm8, %xmm2
1712 ; SSSE3-NEXT: movdqa %xmm4, %xmm6
1713 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
1714 ; SSSE3-NEXT: por %xmm9, %xmm6
1715 ; SSSE3-NEXT: movdqa %xmm3, %xmm1
1716 ; SSSE3-NEXT: pand %xmm8, %xmm1
1717 ; SSSE3-NEXT: pshufb %xmm1, %xmm7
1718 ; SSSE3-NEXT: psrlw $4, %xmm3
1719 ; SSSE3-NEXT: pand %xmm8, %xmm3
1720 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
1721 ; SSSE3-NEXT: por %xmm7, %xmm4
1722 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
1723 ; SSSE3-NEXT: movdqa %xmm6, %xmm2
1724 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
1727 ; AVX1-LABEL: test_bitreverse_v64i8:
1729 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1730 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1731 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1732 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1733 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1734 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1735 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1736 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1737 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
1738 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
1739 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm4
1740 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1741 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
1742 ; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
1743 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0
1744 ; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
1745 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1746 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1747 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
1748 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1749 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
1750 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
1751 ; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2
1752 ; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
1753 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4
1754 ; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4
1755 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
1756 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
1757 ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1
1758 ; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
1759 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1762 ; AVX2-LABEL: test_bitreverse_v64i8:
1764 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1765 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
1766 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1767 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1768 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
1769 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
1770 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
1771 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1772 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
1773 ; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0
1774 ; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
1775 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
1776 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
1777 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
1778 ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
1779 ; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
1780 ; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
1783 ; AVX512F-LABEL: test_bitreverse_v64i8:
1785 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
1786 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1787 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
1788 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1789 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
1790 ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
1791 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
1792 ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4
1793 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1794 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
1795 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
1796 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1797 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
1798 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
1799 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
1800 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
1801 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
1802 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1803 ; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
1804 ; AVX512F-NEXT: retq
1806 ; AVX512BW-LABEL: test_bitreverse_v64i8:
1807 ; AVX512BW: # %bb.0:
1808 ; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1809 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
1810 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1811 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1812 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
1813 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
1814 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
1815 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1816 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1817 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
1818 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
1819 ; AVX512BW-NEXT: retq
1821 ; XOPAVX1-LABEL: test_bitreverse_v64i8:
1823 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
1824 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1825 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1826 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1827 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1828 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1829 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1830 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1831 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1832 ; XOPAVX1-NEXT: retq
1834 ; XOPAVX2-LABEL: test_bitreverse_v64i8:
1836 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
1837 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1838 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1839 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
1840 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1841 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1842 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
1843 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
1844 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
1845 ; XOPAVX2-NEXT: retq
1847 ; GFNISSE-LABEL: test_bitreverse_v64i8:
1849 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [9241421688590303745,9241421688590303745]
1850 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm0
1851 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm1
1852 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm2
1853 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm4, %xmm3
1854 ; GFNISSE-NEXT: retq
1856 ; GFNIAVX1-LABEL: test_bitreverse_v64i8:
1857 ; GFNIAVX1: # %bb.0:
1858 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1859 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
1860 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
1861 ; GFNIAVX1-NEXT: retq
1863 ; GFNIAVX2-LABEL: test_bitreverse_v64i8:
1864 ; GFNIAVX2: # %bb.0:
1865 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
1866 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
1867 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
1868 ; GFNIAVX2-NEXT: retq
1870 ; GFNIAVX512-LABEL: test_bitreverse_v64i8:
1871 ; GFNIAVX512: # %bb.0:
1872 ; GFNIAVX512-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1873 ; GFNIAVX512-NEXT: retq
1874 %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
; Bit-reverse a <32 x i16> vector with @llvm.bitreverse.v32i16 and pin the code
; llc emits for every check prefix declared by the RUN lines at the top of the
; file. The check lines are autogenerated (see the NOTE on the first line of
; the file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
;
; Every expectation below starts by exchanging the two bytes of each 16-bit
; element and then processes the bytes individually -
;   SSE2 expectations swap bytes with psrlw $8 / psllw $8 / por, then run
;     three shift/mask/or stages with the masks 15, 51 and 85.
;   SSSE3, AVX1, AVX2, AVX512F and AVX512BW expectations swap bytes with a
;     pshufb or vpshufb using the pattern [1,0,3,2,...], then look up the low
;     and high nibbles in two 16-entry tables and OR the results.
;   XOPAVX1 and XOPAVX2 expectations use one vpperm per 128-bit half with the
;     selector [81,80,83,82,...].
;   GFNI expectations apply the same [1,0,3,2,...] byte shuffle followed by
;     gf2p8affineqb. Where the matrix operand is printed it is
;     9241421688590303745 per qword (bytes 1,2,4,8,16,32,64,128).
1878 define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
1879 ; SSE2-LABEL: test_bitreverse_v32i16:
1881 ; SSE2-NEXT: movdqa %xmm0, %xmm4
1882 ; SSE2-NEXT: psrlw $8, %xmm4
1883 ; SSE2-NEXT: psllw $8, %xmm0
1884 ; SSE2-NEXT: por %xmm4, %xmm0
1885 ; SSE2-NEXT: movdqa %xmm0, %xmm5
1886 ; SSE2-NEXT: psrlw $4, %xmm5
1887 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1888 ; SSE2-NEXT: pand %xmm4, %xmm5
1889 ; SSE2-NEXT: pand %xmm4, %xmm0
1890 ; SSE2-NEXT: psllw $4, %xmm0
1891 ; SSE2-NEXT: por %xmm5, %xmm0
1892 ; SSE2-NEXT: movdqa %xmm0, %xmm6
1893 ; SSE2-NEXT: psrlw $2, %xmm6
1894 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1895 ; SSE2-NEXT: pand %xmm5, %xmm6
1896 ; SSE2-NEXT: pand %xmm5, %xmm0
1897 ; SSE2-NEXT: psllw $2, %xmm0
1898 ; SSE2-NEXT: por %xmm6, %xmm0
1899 ; SSE2-NEXT: movdqa %xmm0, %xmm7
1900 ; SSE2-NEXT: psrlw $1, %xmm7
1901 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1902 ; SSE2-NEXT: pand %xmm6, %xmm7
1903 ; SSE2-NEXT: pand %xmm6, %xmm0
1904 ; SSE2-NEXT: paddb %xmm0, %xmm0
1905 ; SSE2-NEXT: por %xmm7, %xmm0
1906 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1907 ; SSE2-NEXT: psrlw $8, %xmm7
1908 ; SSE2-NEXT: psllw $8, %xmm1
1909 ; SSE2-NEXT: por %xmm7, %xmm1
1910 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1911 ; SSE2-NEXT: psrlw $4, %xmm7
1912 ; SSE2-NEXT: pand %xmm4, %xmm7
1913 ; SSE2-NEXT: pand %xmm4, %xmm1
1914 ; SSE2-NEXT: psllw $4, %xmm1
1915 ; SSE2-NEXT: por %xmm7, %xmm1
1916 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1917 ; SSE2-NEXT: psrlw $2, %xmm7
1918 ; SSE2-NEXT: pand %xmm5, %xmm7
1919 ; SSE2-NEXT: pand %xmm5, %xmm1
1920 ; SSE2-NEXT: psllw $2, %xmm1
1921 ; SSE2-NEXT: por %xmm7, %xmm1
1922 ; SSE2-NEXT: movdqa %xmm1, %xmm7
1923 ; SSE2-NEXT: psrlw $1, %xmm7
1924 ; SSE2-NEXT: pand %xmm6, %xmm7
1925 ; SSE2-NEXT: pand %xmm6, %xmm1
1926 ; SSE2-NEXT: paddb %xmm1, %xmm1
1927 ; SSE2-NEXT: por %xmm7, %xmm1
1928 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1929 ; SSE2-NEXT: psrlw $8, %xmm7
1930 ; SSE2-NEXT: psllw $8, %xmm2
1931 ; SSE2-NEXT: por %xmm7, %xmm2
1932 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1933 ; SSE2-NEXT: psrlw $4, %xmm7
1934 ; SSE2-NEXT: pand %xmm4, %xmm7
1935 ; SSE2-NEXT: pand %xmm4, %xmm2
1936 ; SSE2-NEXT: psllw $4, %xmm2
1937 ; SSE2-NEXT: por %xmm7, %xmm2
1938 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1939 ; SSE2-NEXT: psrlw $2, %xmm7
1940 ; SSE2-NEXT: pand %xmm5, %xmm7
1941 ; SSE2-NEXT: pand %xmm5, %xmm2
1942 ; SSE2-NEXT: psllw $2, %xmm2
1943 ; SSE2-NEXT: por %xmm7, %xmm2
1944 ; SSE2-NEXT: movdqa %xmm2, %xmm7
1945 ; SSE2-NEXT: psrlw $1, %xmm7
1946 ; SSE2-NEXT: pand %xmm6, %xmm7
1947 ; SSE2-NEXT: pand %xmm6, %xmm2
1948 ; SSE2-NEXT: paddb %xmm2, %xmm2
1949 ; SSE2-NEXT: por %xmm7, %xmm2
1950 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1951 ; SSE2-NEXT: psrlw $8, %xmm7
1952 ; SSE2-NEXT: psllw $8, %xmm3
1953 ; SSE2-NEXT: por %xmm7, %xmm3
1954 ; SSE2-NEXT: movdqa %xmm3, %xmm7
1955 ; SSE2-NEXT: psrlw $4, %xmm7
1956 ; SSE2-NEXT: pand %xmm4, %xmm7
1957 ; SSE2-NEXT: pand %xmm4, %xmm3
1958 ; SSE2-NEXT: psllw $4, %xmm3
1959 ; SSE2-NEXT: por %xmm7, %xmm3
1960 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1961 ; SSE2-NEXT: psrlw $2, %xmm4
1962 ; SSE2-NEXT: pand %xmm5, %xmm4
1963 ; SSE2-NEXT: pand %xmm5, %xmm3
1964 ; SSE2-NEXT: psllw $2, %xmm3
1965 ; SSE2-NEXT: por %xmm4, %xmm3
1966 ; SSE2-NEXT: movdqa %xmm3, %xmm4
1967 ; SSE2-NEXT: psrlw $1, %xmm4
1968 ; SSE2-NEXT: pand %xmm6, %xmm4
1969 ; SSE2-NEXT: pand %xmm6, %xmm3
1970 ; SSE2-NEXT: paddb %xmm3, %xmm3
1971 ; SSE2-NEXT: por %xmm4, %xmm3
1974 ; SSSE3-LABEL: test_bitreverse_v32i16:
1976 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
1977 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
1978 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1979 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
1980 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1981 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
1982 ; SSSE3-NEXT: pand %xmm7, %xmm0
1983 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1984 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
1985 ; SSSE3-NEXT: pshufb %xmm0, %xmm9
1986 ; SSSE3-NEXT: psrlw $4, %xmm1
1987 ; SSSE3-NEXT: pand %xmm7, %xmm1
1988 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1989 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
1990 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
1991 ; SSSE3-NEXT: por %xmm9, %xmm0
1992 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
1993 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
1994 ; SSSE3-NEXT: pand %xmm7, %xmm1
1995 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
1996 ; SSSE3-NEXT: pshufb %xmm1, %xmm9
1997 ; SSSE3-NEXT: psrlw $4, %xmm5
1998 ; SSSE3-NEXT: pand %xmm7, %xmm5
1999 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
2000 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
2001 ; SSSE3-NEXT: por %xmm9, %xmm1
2002 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
2003 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2004 ; SSSE3-NEXT: pand %xmm7, %xmm5
2005 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
2006 ; SSSE3-NEXT: pshufb %xmm5, %xmm9
2007 ; SSSE3-NEXT: psrlw $4, %xmm2
2008 ; SSSE3-NEXT: pand %xmm7, %xmm2
2009 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
2010 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2011 ; SSSE3-NEXT: por %xmm9, %xmm5
2012 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
2013 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
2014 ; SSSE3-NEXT: pand %xmm7, %xmm2
2015 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
2016 ; SSSE3-NEXT: psrlw $4, %xmm3
2017 ; SSSE3-NEXT: pand %xmm7, %xmm3
2018 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
2019 ; SSSE3-NEXT: por %xmm6, %xmm4
2020 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
2021 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
2024 ; AVX1-LABEL: test_bitreverse_v32i16:
2026 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2027 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2028 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2029 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2030 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2031 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2032 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2033 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2034 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2035 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2036 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2037 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2038 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2039 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
2040 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2041 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
2042 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2043 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
2044 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
2045 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2046 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2047 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2048 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2049 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2050 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2051 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2052 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2053 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2054 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2055 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
2056 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
2057 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2058 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2059 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
2060 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
2061 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2064 ; AVX2-LABEL: test_bitreverse_v32i16:
2066 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2067 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
2068 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2069 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2070 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
2071 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2072 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
2073 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2074 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
2075 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
2076 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2077 ; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
2078 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
2079 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
2080 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2081 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
2082 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2083 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2084 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2085 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
2086 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
2089 ; AVX512F-LABEL: test_bitreverse_v32i16:
2091 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2092 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2093 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
2094 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2095 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2096 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
2097 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2098 ; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
2099 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2100 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2101 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
2102 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2103 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2104 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
2105 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
2106 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2107 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
2108 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
2109 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
2110 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
2111 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
2112 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2113 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
2114 ; AVX512F-NEXT: retq
2116 ; AVX512BW-LABEL: test_bitreverse_v32i16:
2117 ; AVX512BW: # %bb.0:
2118 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2119 ; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2120 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2121 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2122 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2123 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
2124 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
2125 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2126 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2127 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2128 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
2129 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
2130 ; AVX512BW-NEXT: retq
2132 ; XOPAVX1-LABEL: test_bitreverse_v32i16:
2134 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2135 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2136 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2137 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2138 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2139 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2140 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2141 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2142 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2143 ; XOPAVX1-NEXT: retq
2145 ; XOPAVX2-LABEL: test_bitreverse_v32i16:
2147 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2148 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2149 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2150 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2151 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2152 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2153 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2154 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2155 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2156 ; XOPAVX2-NEXT: retq
2158 ; GFNISSE-LABEL: test_bitreverse_v32i16:
2160 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2161 ; GFNISSE-NEXT: pshufb %xmm4, %xmm0
2162 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2163 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
2164 ; GFNISSE-NEXT: pshufb %xmm4, %xmm1
2165 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
2166 ; GFNISSE-NEXT: pshufb %xmm4, %xmm2
2167 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
2168 ; GFNISSE-NEXT: pshufb %xmm4, %xmm3
2169 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
2170 ; GFNISSE-NEXT: retq
2172 ; GFNIAVX1-LABEL: test_bitreverse_v32i16:
2173 ; GFNIAVX1: # %bb.0:
2174 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2175 ; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2176 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2177 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2178 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2179 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2180 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2181 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
2182 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
2183 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2184 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
2185 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2186 ; GFNIAVX1-NEXT: retq
2188 ; GFNIAVX2-LABEL: test_bitreverse_v32i16:
2189 ; GFNIAVX2: # %bb.0:
2190 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2191 ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
2192 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2193 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2194 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2195 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2196 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2197 ; GFNIAVX2-NEXT: retq
2199 ; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
2200 ; GFNIAVX512F: # %bb.0:
2201 ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2202 ; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2203 ; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1]
2204 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2205 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2206 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2207 ; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2208 ; GFNIAVX512F-NEXT: retq
2210 ; GFNIAVX512BW-LABEL: test_bitreverse_v32i16:
2211 ; GFNIAVX512BW: # %bb.0:
2212 ; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2213 ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2214 ; GFNIAVX512BW-NEXT: retq
2215 %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
; test_bitreverse_v16i32 - codegen checks for @llvm.bitreverse.v16i32 applied
; to the <16 x i32> argument %a.
;
; The assertions below are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file); regenerate them with that script
; rather than editing them by hand. There is one group of assertions per
; check prefix used on the llc command lines at the top of the file.
;
; Shape of the expected code per group, as visible in the assertions:
;  * SSE2 widens bytes to words against a zeroed register (punpcklbw /
;    punpckhbw), reverses each group of four words with pshuflw/pshufhw,
;    repacks with packuswb, then runs three mask/shift/or rounds (masks 15,
;    51 and 85 with shifts of 4, 2 and 1; the final doubling uses paddb).
;  * SSSE3, AVX1, AVX2, AVX512F and AVX512BW shuffle bytes with the mask
;    [3,2,1,0,7,6,5,4,...] and then combine two 16-entry pshufb table lookups,
;    one indexed by the low nibble and one by the high nibble of each byte.
;  * XOPAVX1 and XOPAVX2 use a single vpperm per 128-bit half.
;  * The GFNI groups use the same byte shuffle followed by gf2p8affineqb $0
;    with a repeated 64-bit constant (printed as 9241421688590303745 or as the
;    bytes [1,2,4,8,16,32,64,128]; the AVX-512 variants take it from a {1to8}
;    broadcast memory operand whose value is not shown in the assertions).
2219 define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
2220 ; SSE2-LABEL: test_bitreverse_v16i32:
2222 ; SSE2-NEXT: pxor %xmm4, %xmm4
2223 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2224 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
2225 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2226 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2227 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2228 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2229 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2230 ; SSE2-NEXT: packuswb %xmm5, %xmm0
2231 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2232 ; SSE2-NEXT: psrlw $4, %xmm6
2233 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2234 ; SSE2-NEXT: pand %xmm5, %xmm6
2235 ; SSE2-NEXT: pand %xmm5, %xmm0
2236 ; SSE2-NEXT: psllw $4, %xmm0
2237 ; SSE2-NEXT: por %xmm6, %xmm0
2238 ; SSE2-NEXT: movdqa %xmm0, %xmm7
2239 ; SSE2-NEXT: psrlw $2, %xmm7
2240 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2241 ; SSE2-NEXT: pand %xmm6, %xmm7
2242 ; SSE2-NEXT: pand %xmm6, %xmm0
2243 ; SSE2-NEXT: psllw $2, %xmm0
2244 ; SSE2-NEXT: por %xmm7, %xmm0
2245 ; SSE2-NEXT: movdqa %xmm0, %xmm8
2246 ; SSE2-NEXT: psrlw $1, %xmm8
2247 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2248 ; SSE2-NEXT: pand %xmm7, %xmm8
2249 ; SSE2-NEXT: pand %xmm7, %xmm0
2250 ; SSE2-NEXT: paddb %xmm0, %xmm0
2251 ; SSE2-NEXT: por %xmm8, %xmm0
2252 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2253 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2254 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2255 ; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2256 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2257 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2258 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2259 ; SSE2-NEXT: packuswb %xmm8, %xmm1
2260 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2261 ; SSE2-NEXT: psrlw $4, %xmm8
2262 ; SSE2-NEXT: pand %xmm5, %xmm8
2263 ; SSE2-NEXT: pand %xmm5, %xmm1
2264 ; SSE2-NEXT: psllw $4, %xmm1
2265 ; SSE2-NEXT: por %xmm8, %xmm1
2266 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2267 ; SSE2-NEXT: psrlw $2, %xmm8
2268 ; SSE2-NEXT: pand %xmm6, %xmm8
2269 ; SSE2-NEXT: pand %xmm6, %xmm1
2270 ; SSE2-NEXT: psllw $2, %xmm1
2271 ; SSE2-NEXT: por %xmm8, %xmm1
2272 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2273 ; SSE2-NEXT: psrlw $1, %xmm8
2274 ; SSE2-NEXT: pand %xmm7, %xmm8
2275 ; SSE2-NEXT: pand %xmm7, %xmm1
2276 ; SSE2-NEXT: paddb %xmm1, %xmm1
2277 ; SSE2-NEXT: por %xmm8, %xmm1
2278 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2279 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2280 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2281 ; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2282 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2283 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2284 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2285 ; SSE2-NEXT: packuswb %xmm8, %xmm2
2286 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2287 ; SSE2-NEXT: psrlw $4, %xmm8
2288 ; SSE2-NEXT: pand %xmm5, %xmm8
2289 ; SSE2-NEXT: pand %xmm5, %xmm2
2290 ; SSE2-NEXT: psllw $4, %xmm2
2291 ; SSE2-NEXT: por %xmm8, %xmm2
2292 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2293 ; SSE2-NEXT: psrlw $2, %xmm8
2294 ; SSE2-NEXT: pand %xmm6, %xmm8
2295 ; SSE2-NEXT: pand %xmm6, %xmm2
2296 ; SSE2-NEXT: psllw $2, %xmm2
2297 ; SSE2-NEXT: por %xmm8, %xmm2
2298 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2299 ; SSE2-NEXT: psrlw $1, %xmm8
2300 ; SSE2-NEXT: pand %xmm7, %xmm8
2301 ; SSE2-NEXT: pand %xmm7, %xmm2
2302 ; SSE2-NEXT: paddb %xmm2, %xmm2
2303 ; SSE2-NEXT: por %xmm8, %xmm2
2304 ; SSE2-NEXT: movdqa %xmm3, %xmm8
2305 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2306 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2307 ; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2308 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2309 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2310 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2311 ; SSE2-NEXT: packuswb %xmm8, %xmm3
2312 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2313 ; SSE2-NEXT: psrlw $4, %xmm4
2314 ; SSE2-NEXT: pand %xmm5, %xmm4
2315 ; SSE2-NEXT: pand %xmm5, %xmm3
2316 ; SSE2-NEXT: psllw $4, %xmm3
2317 ; SSE2-NEXT: por %xmm4, %xmm3
2318 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2319 ; SSE2-NEXT: psrlw $2, %xmm4
2320 ; SSE2-NEXT: pand %xmm6, %xmm4
2321 ; SSE2-NEXT: pand %xmm6, %xmm3
2322 ; SSE2-NEXT: psllw $2, %xmm3
2323 ; SSE2-NEXT: por %xmm4, %xmm3
2324 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2325 ; SSE2-NEXT: psrlw $1, %xmm4
2326 ; SSE2-NEXT: pand %xmm7, %xmm4
2327 ; SSE2-NEXT: pand %xmm7, %xmm3
2328 ; SSE2-NEXT: paddb %xmm3, %xmm3
2329 ; SSE2-NEXT: por %xmm4, %xmm3
2332 ; SSSE3-LABEL: test_bitreverse_v16i32:
2334 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
2335 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
2336 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2337 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
2338 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2339 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
2340 ; SSSE3-NEXT: pand %xmm7, %xmm0
2341 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2342 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
2343 ; SSSE3-NEXT: pshufb %xmm0, %xmm9
2344 ; SSSE3-NEXT: psrlw $4, %xmm1
2345 ; SSSE3-NEXT: pand %xmm7, %xmm1
2346 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2347 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
2348 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2349 ; SSSE3-NEXT: por %xmm9, %xmm0
2350 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
2351 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
2352 ; SSSE3-NEXT: pand %xmm7, %xmm1
2353 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
2354 ; SSSE3-NEXT: pshufb %xmm1, %xmm9
2355 ; SSSE3-NEXT: psrlw $4, %xmm5
2356 ; SSSE3-NEXT: pand %xmm7, %xmm5
2357 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
2358 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
2359 ; SSSE3-NEXT: por %xmm9, %xmm1
2360 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
2361 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2362 ; SSSE3-NEXT: pand %xmm7, %xmm5
2363 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
2364 ; SSSE3-NEXT: pshufb %xmm5, %xmm9
2365 ; SSSE3-NEXT: psrlw $4, %xmm2
2366 ; SSSE3-NEXT: pand %xmm7, %xmm2
2367 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
2368 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2369 ; SSSE3-NEXT: por %xmm9, %xmm5
2370 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
2371 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
2372 ; SSSE3-NEXT: pand %xmm7, %xmm2
2373 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
2374 ; SSSE3-NEXT: psrlw $4, %xmm3
2375 ; SSSE3-NEXT: pand %xmm7, %xmm3
2376 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
2377 ; SSSE3-NEXT: por %xmm6, %xmm4
2378 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
2379 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
2382 ; AVX1-LABEL: test_bitreverse_v16i32:
2384 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2385 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2386 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2387 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2388 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2389 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2390 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2391 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2392 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2393 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2394 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2395 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2396 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2397 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
2398 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2399 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
2400 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2401 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
2402 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
2403 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2404 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2405 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2406 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2407 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2408 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2409 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2410 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2411 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2412 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2413 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
2414 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
2415 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2416 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2417 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
2418 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
2419 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2422 ; AVX2-LABEL: test_bitreverse_v16i32:
2424 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2425 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
2426 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2427 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2428 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
2429 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2430 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
2431 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2432 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
2433 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
2434 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2435 ; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
2436 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
2437 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
2438 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2439 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
2440 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2441 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2442 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2443 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
2444 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
2447 ; AVX512F-LABEL: test_bitreverse_v16i32:
2449 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2450 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2451 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
2452 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2453 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2454 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
2455 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2456 ; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
2457 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2458 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2459 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
2460 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2461 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2462 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
2463 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
2464 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2465 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
2466 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
2467 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
2468 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
2469 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
2470 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2471 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
2472 ; AVX512F-NEXT: retq
2474 ; AVX512BW-LABEL: test_bitreverse_v16i32:
2475 ; AVX512BW: # %bb.0:
2476 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2477 ; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2478 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2479 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2480 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2481 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
2482 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
2483 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2484 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2485 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2486 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
2487 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
2488 ; AVX512BW-NEXT: retq
2490 ; XOPAVX1-LABEL: test_bitreverse_v16i32:
2492 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2493 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2494 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2495 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2496 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2497 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2498 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2499 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2500 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2501 ; XOPAVX1-NEXT: retq
2503 ; XOPAVX2-LABEL: test_bitreverse_v16i32:
2505 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2506 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
2507 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2508 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2509 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2510 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2511 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2512 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2513 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2514 ; XOPAVX2-NEXT: retq
2516 ; GFNISSE-LABEL: test_bitreverse_v16i32:
2518 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2519 ; GFNISSE-NEXT: pshufb %xmm4, %xmm0
2520 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2521 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
2522 ; GFNISSE-NEXT: pshufb %xmm4, %xmm1
2523 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
2524 ; GFNISSE-NEXT: pshufb %xmm4, %xmm2
2525 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
2526 ; GFNISSE-NEXT: pshufb %xmm4, %xmm3
2527 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
2528 ; GFNISSE-NEXT: retq
2530 ; GFNIAVX1-LABEL: test_bitreverse_v16i32:
2531 ; GFNIAVX1: # %bb.0:
2532 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2533 ; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2534 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2535 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2536 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2537 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2538 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2539 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
2540 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
2541 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2542 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
2543 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2544 ; GFNIAVX1-NEXT: retq
2546 ; GFNIAVX2-LABEL: test_bitreverse_v16i32:
2547 ; GFNIAVX2: # %bb.0:
2548 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2549 ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
2550 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2551 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2552 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2553 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2554 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2555 ; GFNIAVX2-NEXT: retq
2557 ; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
2558 ; GFNIAVX512F: # %bb.0:
2559 ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2560 ; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2561 ; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1]
2562 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2563 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2564 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2565 ; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2566 ; GFNIAVX512F-NEXT: retq
2568 ; GFNIAVX512BW-LABEL: test_bitreverse_v16i32:
2569 ; GFNIAVX512BW: # %bb.0:
2570 ; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
2571 ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2572 ; GFNIAVX512BW-NEXT: retq
2573 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
2577 define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
2578 ; SSE2-LABEL: test_bitreverse_v8i64:
2580 ; SSE2-NEXT: pxor %xmm4, %xmm4
2581 ; SSE2-NEXT: movdqa %xmm0, %xmm5
2582 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
2583 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
2584 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2585 ; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2586 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2587 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2588 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2589 ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2590 ; SSE2-NEXT: packuswb %xmm5, %xmm0
2591 ; SSE2-NEXT: movdqa %xmm0, %xmm6
2592 ; SSE2-NEXT: psrlw $4, %xmm6
2593 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2594 ; SSE2-NEXT: pand %xmm5, %xmm6
2595 ; SSE2-NEXT: pand %xmm5, %xmm0
2596 ; SSE2-NEXT: psllw $4, %xmm0
2597 ; SSE2-NEXT: por %xmm6, %xmm0
2598 ; SSE2-NEXT: movdqa %xmm0, %xmm7
2599 ; SSE2-NEXT: psrlw $2, %xmm7
2600 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2601 ; SSE2-NEXT: pand %xmm6, %xmm7
2602 ; SSE2-NEXT: pand %xmm6, %xmm0
2603 ; SSE2-NEXT: psllw $2, %xmm0
2604 ; SSE2-NEXT: por %xmm7, %xmm0
2605 ; SSE2-NEXT: movdqa %xmm0, %xmm8
2606 ; SSE2-NEXT: psrlw $1, %xmm8
2607 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2608 ; SSE2-NEXT: pand %xmm7, %xmm8
2609 ; SSE2-NEXT: pand %xmm7, %xmm0
2610 ; SSE2-NEXT: paddb %xmm0, %xmm0
2611 ; SSE2-NEXT: por %xmm8, %xmm0
2612 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2613 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2614 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
2615 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2616 ; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2617 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2618 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2619 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2620 ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2621 ; SSE2-NEXT: packuswb %xmm8, %xmm1
2622 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2623 ; SSE2-NEXT: psrlw $4, %xmm8
2624 ; SSE2-NEXT: pand %xmm5, %xmm8
2625 ; SSE2-NEXT: pand %xmm5, %xmm1
2626 ; SSE2-NEXT: psllw $4, %xmm1
2627 ; SSE2-NEXT: por %xmm8, %xmm1
2628 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2629 ; SSE2-NEXT: psrlw $2, %xmm8
2630 ; SSE2-NEXT: pand %xmm6, %xmm8
2631 ; SSE2-NEXT: pand %xmm6, %xmm1
2632 ; SSE2-NEXT: psllw $2, %xmm1
2633 ; SSE2-NEXT: por %xmm8, %xmm1
2634 ; SSE2-NEXT: movdqa %xmm1, %xmm8
2635 ; SSE2-NEXT: psrlw $1, %xmm8
2636 ; SSE2-NEXT: pand %xmm7, %xmm8
2637 ; SSE2-NEXT: pand %xmm7, %xmm1
2638 ; SSE2-NEXT: paddb %xmm1, %xmm1
2639 ; SSE2-NEXT: por %xmm8, %xmm1
2640 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2641 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2642 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
2643 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2644 ; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2645 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2646 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2647 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2648 ; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2649 ; SSE2-NEXT: packuswb %xmm8, %xmm2
2650 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2651 ; SSE2-NEXT: psrlw $4, %xmm8
2652 ; SSE2-NEXT: pand %xmm5, %xmm8
2653 ; SSE2-NEXT: pand %xmm5, %xmm2
2654 ; SSE2-NEXT: psllw $4, %xmm2
2655 ; SSE2-NEXT: por %xmm8, %xmm2
2656 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2657 ; SSE2-NEXT: psrlw $2, %xmm8
2658 ; SSE2-NEXT: pand %xmm6, %xmm8
2659 ; SSE2-NEXT: pand %xmm6, %xmm2
2660 ; SSE2-NEXT: psllw $2, %xmm2
2661 ; SSE2-NEXT: por %xmm8, %xmm2
2662 ; SSE2-NEXT: movdqa %xmm2, %xmm8
2663 ; SSE2-NEXT: psrlw $1, %xmm8
2664 ; SSE2-NEXT: pand %xmm7, %xmm8
2665 ; SSE2-NEXT: pand %xmm7, %xmm2
2666 ; SSE2-NEXT: paddb %xmm2, %xmm2
2667 ; SSE2-NEXT: por %xmm8, %xmm2
2668 ; SSE2-NEXT: movdqa %xmm3, %xmm8
2669 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2670 ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
2671 ; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2672 ; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2673 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2674 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2675 ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2676 ; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2677 ; SSE2-NEXT: packuswb %xmm8, %xmm3
2678 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2679 ; SSE2-NEXT: psrlw $4, %xmm4
2680 ; SSE2-NEXT: pand %xmm5, %xmm4
2681 ; SSE2-NEXT: pand %xmm5, %xmm3
2682 ; SSE2-NEXT: psllw $4, %xmm3
2683 ; SSE2-NEXT: por %xmm4, %xmm3
2684 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2685 ; SSE2-NEXT: psrlw $2, %xmm4
2686 ; SSE2-NEXT: pand %xmm6, %xmm4
2687 ; SSE2-NEXT: pand %xmm6, %xmm3
2688 ; SSE2-NEXT: psllw $2, %xmm3
2689 ; SSE2-NEXT: por %xmm4, %xmm3
2690 ; SSE2-NEXT: movdqa %xmm3, %xmm4
2691 ; SSE2-NEXT: psrlw $1, %xmm4
2692 ; SSE2-NEXT: pand %xmm7, %xmm4
2693 ; SSE2-NEXT: pand %xmm7, %xmm3
2694 ; SSE2-NEXT: paddb %xmm3, %xmm3
2695 ; SSE2-NEXT: por %xmm4, %xmm3
2698 ; SSSE3-LABEL: test_bitreverse_v8i64:
2700 ; SSSE3-NEXT: movdqa %xmm1, %xmm5
2701 ; SSSE3-NEXT: movdqa %xmm0, %xmm1
2702 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2703 ; SSSE3-NEXT: pshufb %xmm8, %xmm1
2704 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2705 ; SSSE3-NEXT: movdqa %xmm1, %xmm0
2706 ; SSSE3-NEXT: pand %xmm7, %xmm0
2707 ; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2708 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
2709 ; SSSE3-NEXT: pshufb %xmm0, %xmm9
2710 ; SSSE3-NEXT: psrlw $4, %xmm1
2711 ; SSSE3-NEXT: pand %xmm7, %xmm1
2712 ; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2713 ; SSSE3-NEXT: movdqa %xmm4, %xmm0
2714 ; SSSE3-NEXT: pshufb %xmm1, %xmm0
2715 ; SSSE3-NEXT: por %xmm9, %xmm0
2716 ; SSSE3-NEXT: pshufb %xmm8, %xmm5
2717 ; SSSE3-NEXT: movdqa %xmm5, %xmm1
2718 ; SSSE3-NEXT: pand %xmm7, %xmm1
2719 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
2720 ; SSSE3-NEXT: pshufb %xmm1, %xmm9
2721 ; SSSE3-NEXT: psrlw $4, %xmm5
2722 ; SSSE3-NEXT: pand %xmm7, %xmm5
2723 ; SSSE3-NEXT: movdqa %xmm4, %xmm1
2724 ; SSSE3-NEXT: pshufb %xmm5, %xmm1
2725 ; SSSE3-NEXT: por %xmm9, %xmm1
2726 ; SSSE3-NEXT: pshufb %xmm8, %xmm2
2727 ; SSSE3-NEXT: movdqa %xmm2, %xmm5
2728 ; SSSE3-NEXT: pand %xmm7, %xmm5
2729 ; SSSE3-NEXT: movdqa %xmm6, %xmm9
2730 ; SSSE3-NEXT: pshufb %xmm5, %xmm9
2731 ; SSSE3-NEXT: psrlw $4, %xmm2
2732 ; SSSE3-NEXT: pand %xmm7, %xmm2
2733 ; SSSE3-NEXT: movdqa %xmm4, %xmm5
2734 ; SSSE3-NEXT: pshufb %xmm2, %xmm5
2735 ; SSSE3-NEXT: por %xmm9, %xmm5
2736 ; SSSE3-NEXT: pshufb %xmm8, %xmm3
2737 ; SSSE3-NEXT: movdqa %xmm3, %xmm2
2738 ; SSSE3-NEXT: pand %xmm7, %xmm2
2739 ; SSSE3-NEXT: pshufb %xmm2, %xmm6
2740 ; SSSE3-NEXT: psrlw $4, %xmm3
2741 ; SSSE3-NEXT: pand %xmm7, %xmm3
2742 ; SSSE3-NEXT: pshufb %xmm3, %xmm4
2743 ; SSSE3-NEXT: por %xmm6, %xmm4
2744 ; SSSE3-NEXT: movdqa %xmm5, %xmm2
2745 ; SSSE3-NEXT: movdqa %xmm4, %xmm3
2748 ; AVX1-LABEL: test_bitreverse_v8i64:
2750 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2751 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2752 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2753 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2754 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2755 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2756 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2757 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2758 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2759 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2760 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2761 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2762 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2763 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm5
2764 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2765 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
2766 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
2767 ; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm0
2768 ; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0
2769 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2770 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2771 ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2772 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5
2773 ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5
2774 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2
2775 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
2776 ; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2
2777 ; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
2778 ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2779 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm3
2780 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3
2781 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
2782 ; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
2783 ; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
2784 ; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
2785 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2788 ; AVX2-LABEL: test_bitreverse_v8i64:
2790 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2791 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
2792 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2793 ; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2794 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4
2795 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2796 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
2797 ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2798 ; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
2799 ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
2800 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2801 ; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
2802 ; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0
2803 ; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
2804 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2805 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2
2806 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2807 ; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1
2808 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
2809 ; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1
2810 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
2813 ; AVX512F-LABEL: test_bitreverse_v8i64:
2815 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2816 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2817 ; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
2818 ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2819 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2820 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
2821 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2822 ; AVX512F-NEXT: # ymm5 = mem[0,1,0,1]
2823 ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
2824 ; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2825 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
2826 ; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
2827 ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
2828 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
2829 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
2830 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2831 ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
2832 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
2833 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
2834 ; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
2835 ; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
2836 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2837 ; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
2838 ; AVX512F-NEXT: retq
2840 ; AVX512BW-LABEL: test_bitreverse_v8i64:
2841 ; AVX512BW: # %bb.0:
2842 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
2843 ; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2844 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
2845 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2846 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2847 ; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
2848 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
2849 ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
2850 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2851 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2852 ; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
2853 ; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
2854 ; AVX512BW-NEXT: retq
2856 ; XOPAVX1-LABEL: test_bitreverse_v8i64:
2858 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2859 ; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2860 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2861 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2862 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2863 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
2864 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2865 ; XOPAVX1-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2866 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2867 ; XOPAVX1-NEXT: retq
2869 ; XOPAVX2-LABEL: test_bitreverse_v8i64:
2871 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
2872 ; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
2873 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2874 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm0, %xmm0, %xmm0
2875 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
2876 ; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2877 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
2878 ; XOPAVX2-NEXT: vpperm %xmm3, %xmm1, %xmm0, %xmm1
2879 ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
2880 ; XOPAVX2-NEXT: retq
2882 ; GFNISSE-LABEL: test_bitreverse_v8i64:
2884 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2885 ; GFNISSE-NEXT: pshufb %xmm4, %xmm0
2886 ; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [9241421688590303745,9241421688590303745]
2887 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0
2888 ; GFNISSE-NEXT: pshufb %xmm4, %xmm1
2889 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1
2890 ; GFNISSE-NEXT: pshufb %xmm4, %xmm2
2891 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2
2892 ; GFNISSE-NEXT: pshufb %xmm4, %xmm3
2893 ; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3
2894 ; GFNISSE-NEXT: retq
2896 ; GFNIAVX1-LABEL: test_bitreverse_v8i64:
2897 ; GFNIAVX1: # %bb.0:
2898 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
2899 ; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2900 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2901 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
2902 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2903 ; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2904 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2905 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
2906 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
2907 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
2908 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
2909 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2910 ; GFNIAVX1-NEXT: retq
2912 ; GFNIAVX2-LABEL: test_bitreverse_v8i64:
2913 ; GFNIAVX2: # %bb.0:
2914 ; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2915 ; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
2916 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2917 ; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
2918 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2919 ; GFNIAVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2920 ; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2921 ; GFNIAVX2-NEXT: retq
2923 ; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
2924 ; GFNIAVX512F: # %bb.0:
2925 ; GFNIAVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
2926 ; GFNIAVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
2927 ; GFNIAVX512F-NEXT: # ymm2 = mem[0,1,0,1]
2928 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
2929 ; GFNIAVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
2930 ; GFNIAVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2931 ; GFNIAVX512F-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2932 ; GFNIAVX512F-NEXT: retq
2934 ; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
2935 ; GFNIAVX512BW: # %bb.0:
2936 ; GFNIAVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
2937 ; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2938 ; GFNIAVX512BW-NEXT: retq
2939 %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
; Constant folding, scalar case: llvm.bitreverse.i32 is called on the constant
; 4278255360 (0xFF00FF00). The assertions use the ALL prefix, which is shared by
; every FileCheck RUN line, and expect the already-reversed immediate 16711935
; (0xFF00FF) to be moved directly into %eax.
2947 define i32 @fold_bitreverse_i32() nounwind {
2948 ; ALL-LABEL: fold_bitreverse_i32:
2950 ; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF
2952 %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
; Constant folding, <16 x i8> case: the operand is a constant vector that
; alternates the even values 0, 2, ..., 14 with the negative odd values
; -1, -3, ..., -15. Every configuration checked below expects a single constant
; load into xmm0 holding the per-byte reversed values (for example 2 becomes 64
; and -3, i.e. 0xFD, becomes 191).
2956 define <16 x i8> @fold_bitreverse_v16i8() nounwind {
2957 ; SSE-LABEL: fold_bitreverse_v16i8:
2959 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2962 ; AVX-LABEL: fold_bitreverse_v16i8:
2964 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2967 ; XOP-LABEL: fold_bitreverse_v16i8:
2969 ; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2972 ; GFNISSE-LABEL: fold_bitreverse_v16i8:
2974 ; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2975 ; GFNISSE-NEXT: retq
2977 ; GFNIAVX-LABEL: fold_bitreverse_v16i8:
2979 ; GFNIAVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
2980 ; GFNIAVX-NEXT: retq
2981 %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
; Constant folding, <16 x i16> case: same alternating operand pattern
; (0, -1, 2, -3, ..., 14, -15), now as 16-bit lanes. The expected output is only
; constant loads of the reversed values (for example 2 becomes 16384 and -3
; becomes 49151). The two configurations that check xmm registers expect the
; result split across xmm0 and xmm1; the remaining ones expect a single load
; into ymm0.
2985 define <16 x i16> @fold_bitreverse_v16i16() nounwind {
2986 ; SSE-LABEL: fold_bitreverse_v16i16:
2988 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
2989 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
2992 ; AVX-LABEL: fold_bitreverse_v16i16:
2994 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
2997 ; XOP-LABEL: fold_bitreverse_v16i16:
2999 ; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3002 ; GFNISSE-LABEL: fold_bitreverse_v16i16:
3004 ; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
3005 ; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
3006 ; GFNISSE-NEXT: retq
3008 ; GFNIAVX-LABEL: fold_bitreverse_v16i16:
3010 ; GFNIAVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
3011 ; GFNIAVX-NEXT: retq
3012 %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
; Constant folding, <16 x i32> case: same alternating operand pattern
; (0, -1, 2, -3, ..., 14, -15), now as 32-bit lanes. The expected output is only
; constant loads of the reversed values (for example 2 becomes 1073741824 and
; -3 becomes 3221225471). How many loads are expected depends on the register
; width each configuration checks: four xmm loads (xmm0..xmm3), two ymm loads
; (ymm0, ymm1), or one zmm0 load.
3016 define <16 x i32> @fold_bitreverse_v16i32() nounwind {
3017 ; SSE-LABEL: fold_bitreverse_v16i32:
3019 ; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
3020 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
3021 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
3022 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
3025 ; AVX1-LABEL: fold_bitreverse_v16i32:
3027 ; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3028 ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3031 ; AVX2-LABEL: fold_bitreverse_v16i32:
3033 ; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3034 ; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3037 ; AVX512-LABEL: fold_bitreverse_v16i32:
3039 ; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3042 ; XOP-LABEL: fold_bitreverse_v16i32:
3044 ; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3045 ; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3048 ; GFNISSE-LABEL: fold_bitreverse_v16i32:
3050 ; GFNISSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
3051 ; GFNISSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
3052 ; GFNISSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
3053 ; GFNISSE-NEXT: movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
3054 ; GFNISSE-NEXT: retq
3056 ; GFNIAVX1-LABEL: fold_bitreverse_v16i32:
3057 ; GFNIAVX1: # %bb.0:
3058 ; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3059 ; GFNIAVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3060 ; GFNIAVX1-NEXT: retq
3062 ; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
3063 ; GFNIAVX2: # %bb.0:
3064 ; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
3065 ; GFNIAVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3066 ; GFNIAVX2-NEXT: retq
3068 ; GFNIAVX512-LABEL: fold_bitreverse_v16i32:
3069 ; GFNIAVX512: # %bb.0:
3070 ; GFNIAVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
3071 ; GFNIAVX512-NEXT: retq
3072 %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
; Declarations of the llvm.bitreverse intrinsic overloads, grouped by total
; width. Each overload takes and returns the same type and is marked readnone.
; Scalar overloads (i8 through i64).
3076 declare i8 @llvm.bitreverse.i8(i8) readnone
3077 declare i16 @llvm.bitreverse.i16(i16) readnone
3078 declare i32 @llvm.bitreverse.i32(i32) readnone
3079 declare i64 @llvm.bitreverse.i64(i64) readnone
; Vector overloads totalling 128 bits.
3081 declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
3082 declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
3083 declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
3084 declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone
; Vector overloads totalling 256 bits.
3086 declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) readnone
3087 declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
3088 declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) readnone
3089 declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) readnone
; Vector overloads totalling 512 bits.
3091 declare <64 x i8> @llvm.bitreverse.v64i8(<64 x i8>) readnone
3092 declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
3093 declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
3094 declare <8 x i64> @llvm.bitreverse.v8i64(<8 x i64>) readnone