; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK13
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK14
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK15
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK16
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK17
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK18
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK19
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK20
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK21
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK22
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK23
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK24
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK25
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK26
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK27
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK28
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK29
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK30
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK31

define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_4bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shrl %cl, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: lshr_4bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shrxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT: movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-BMI2-LABEL: lshr_4bytes:
; X86-NO-BMI2: # %bb.0:
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movl (%edx), %edx
; X86-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT: shlb $3, %cl
; X86-NO-BMI2-NEXT: shrl %cl, %edx
; X86-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-HAVE-BMI2-LABEL: lshr_4bytes:
; X86-HAVE-BMI2: # %bb.0:
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-HAVE-BMI2-NEXT: shrxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT: retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  %bitOff = shl i32 %byteOff, 3
  %res = lshr i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}
define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: shl_4bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shll %cl, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: shl_4bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shlxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT: movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-BMI2-LABEL: shl_4bytes:
; X86-NO-BMI2: # %bb.0:
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movl (%edx), %edx
; X86-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT: shlb $3, %cl
; X86-NO-BMI2-NEXT: shll %cl, %edx
; X86-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-HAVE-BMI2-LABEL: shl_4bytes:
; X86-HAVE-BMI2: # %bb.0:
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-HAVE-BMI2-NEXT: shlxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT: retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  %bitOff = shl i32 %byteOff, 3
  %res = shl i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}
define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: ashr_4bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: sarl %cl, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: ashr_4bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: sarxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT: movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-BMI2-LABEL: ashr_4bytes:
; X86-NO-BMI2: # %bb.0:
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movl (%edx), %edx
; X86-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT: shlb $3, %cl
; X86-NO-BMI2-NEXT: sarl %cl, %edx
; X86-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-HAVE-BMI2-LABEL: ashr_4bytes:
; X86-HAVE-BMI2: # %bb.0:
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-HAVE-BMI2-NEXT: sarxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT: retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  %bitOff = shl i32 %byteOff, 3
  %res = ashr i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}

define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_8bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: lshr_8bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shrxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: lshr_8bytes:
; X86-NO-SHLD-NO-BMI2: # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edi
; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %esi, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: lshr_8bytes:
; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shrl %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: xorl %esi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ebx, %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %edx, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %edx, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %edx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  %bitOff = shl i64 %byteOff, 3
  %res = lshr i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}
define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: shl_8bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: shl_8bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shlxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: shl_8bytes:
; X86-NO-SHLD-NO-BMI2: # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %esi, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: shl_8bytes:
; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shll %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shldl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: xorl %esi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, 4(%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ebx, %esi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %edx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: xorl %edx, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shldl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, %esi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  %bitOff = shl i64 %byteOff, 3
  %res = shl i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}
define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: ashr_8bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: sarq %cl, %rax
; X64-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: ashr_8bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: sarxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: ashr_8bytes:
; X86-NO-SHLD-NO-BMI2: # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edi
; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %ebx, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_8bytes:
; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %edx, (%esi), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ebx, %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %edx, %ecx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %dl
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  %bitOff = shl i64 %byteOff, 3
  %res = ashr i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}

define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %rdi, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %rdi, %rcx
; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes:
; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %edi, %edi
; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes:
; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes:
; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rdi, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
; X86-SSE2-LABEL: lshr_16bytes:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
; X86-SSE2-NEXT: subl $32, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl (%edx), %esi
; X86-SSE2-NEXT: movl 4(%edx), %edi
; X86-SSE2-NEXT: movl 8(%edx), %ebx
; X86-SSE2-NEXT: movl 12(%edx), %edx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, (%esp)
; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: andl $15, %ecx
; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
; X86-SSE2-NEXT: movl %ecx, 8(%eax)
; X86-SSE2-NEXT: movl %edi, 12(%eax)
; X86-SSE2-NEXT: movl %edx, (%eax)
; X86-SSE2-NEXT: movl %esi, 4(%eax)
; X86-SSE2-NEXT: addl $32, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: lshr_16bytes:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: subl $32, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm1, %xmm1
; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movups %xmm0, (%esp)
; X86-SSE42-NEXT: andl $15, %ecx
; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $32, %esp
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: lshr_16bytes:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: subl $32, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %xmm0
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovups %xmm0, (%esp)
; X86-AVX-NEXT: andl $15, %ecx
; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $32, %esp
; X86-AVX-NEXT: retl
  %src = load i128, ptr %src.ptr, align 1
  %byteOff = load i128, ptr %byteOff.ptr, align 1
  %bitOff = shl i128 %byteOff, 3
  %res = lshr i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}
define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: movq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: shrq %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: orq %rdi, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rcx
; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes:
; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT: shldq %cl, %rax, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %eax, %eax
; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes:
; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, 8(%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT: shrq %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rdi, %rax, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %r8, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %r8, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes:
; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shldq %cl, %rax, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
; X86-SSE2-LABEL: shl_16bytes:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
; X86-SSE2-NEXT: subl $32, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl (%edx), %esi
; X86-SSE2-NEXT: movl 4(%edx), %edi
; X86-SSE2-NEXT: movl 8(%edx), %ebx
; X86-SSE2-NEXT: movl 12(%edx), %edx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl $0, (%esp)
; X86-SSE2-NEXT: andb $15, %cl
; X86-SSE2-NEXT: negb %cl
; X86-SSE2-NEXT: movsbl %cl, %ecx
; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx
; X86-SSE2-NEXT: movl 20(%esp,%ecx), %esi
; X86-SSE2-NEXT: movl 28(%esp,%ecx), %edi
; X86-SSE2-NEXT: movl 24(%esp,%ecx), %ecx
; X86-SSE2-NEXT: movl %ecx, 8(%eax)
; X86-SSE2-NEXT: movl %edi, 12(%eax)
; X86-SSE2-NEXT: movl %edx, (%eax)
; X86-SSE2-NEXT: movl %esi, 4(%eax)
; X86-SSE2-NEXT: addl $32, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: shl_16bytes:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: subl $32, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm1, %xmm1
; X86-SSE42-NEXT: movups %xmm1, (%esp)
; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: andb $15, %cl
; X86-SSE42-NEXT: negb %cl
; X86-SSE42-NEXT: movsbl %cl, %ecx
; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $32, %esp
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: shl_16bytes:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: subl $32, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %xmm0
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vmovups %xmm1, (%esp)
; X86-AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: andb $15, %cl
; X86-AVX-NEXT: negb %cl
; X86-AVX-NEXT: movsbl %cl, %ecx
; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $32, %esp
; X86-AVX-NEXT: retl
  %src = load i128, ptr %src.ptr, align 1
  %byteOff = load i128, ptr %byteOff.ptr, align 1
  %bitOff = shl i128 %byteOff, 3
  %res = shl i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8
; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al
; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx)
; X64-NO-SHLD-NO-BMI2-NEXT: retq
;
; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes:
; X64-HAVE-SHLD-NO-BMI2: # %bb.0:
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq %cl, %rsi
; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq $63, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl
; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax
; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-NO-BMI2-NEXT: retq
;
; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
; X64-NO-SHLD-HAVE-BMI2: # %bb.0:
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi
; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil
; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rax, %rsi
; X64-NO-SHLD-HAVE-BMI2-NEXT: sarq $63, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rdi
; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rax
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx)
; X64-NO-SHLD-HAVE-BMI2-NEXT: retq
;
; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes:
; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rdi, %rsi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarq $63, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
; X86-SSE2-LABEL: ashr_16bytes:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
; X86-SSE2-NEXT: subl $32, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE2-NEXT: movl (%edx), %esi
; X86-SSE2-NEXT: movl 4(%edx), %edi
; X86-SSE2-NEXT: movl 8(%edx), %ebx
; X86-SSE2-NEXT: movl 12(%edx), %edx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, (%esp)
; X86-SSE2-NEXT: sarl $31, %edx
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: andl $15, %ecx
; X86-SSE2-NEXT: movl (%esp,%ecx), %edx
; X86-SSE2-NEXT: movl 4(%esp,%ecx), %esi
; X86-SSE2-NEXT: movl 12(%esp,%ecx), %edi
; X86-SSE2-NEXT: movl 8(%esp,%ecx), %ecx
; X86-SSE2-NEXT: movl %ecx, 8(%eax)
; X86-SSE2-NEXT: movl %edi, 12(%eax)
; X86-SSE2-NEXT: movl %edx, (%eax)
; X86-SSE2-NEXT: movl %esi, 4(%eax)
; X86-SSE2-NEXT: addl $32, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: ashr_16bytes:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: pushl %ebx
; X86-SSE42-NEXT: pushl %edi
; X86-SSE42-NEXT: pushl %esi
; X86-SSE42-NEXT: subl $32, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movl (%edx), %esi
; X86-SSE42-NEXT: movl 4(%edx), %edi
; X86-SSE42-NEXT: movl 8(%edx), %ebx
; X86-SSE42-NEXT: movl 12(%edx), %edx
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %esi, (%esp)
; X86-SSE42-NEXT: sarl $31, %edx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: andl $15, %ecx
; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $32, %esp
; X86-SSE42-NEXT: popl %esi
; X86-SSE42-NEXT: popl %edi
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: ashr_16bytes:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: subl $32, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: movl (%edx), %esi
; X86-AVX-NEXT: movl 4(%edx), %edi
; X86-AVX-NEXT: movl 8(%edx), %ebx
; X86-AVX-NEXT: movl 12(%edx), %edx
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %esi, (%esp)
; X86-AVX-NEXT: sarl $31, %edx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: andl $15, %ecx
; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $32, %esp
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: popl %edi
; X86-AVX-NEXT: popl %ebx
; X86-AVX-NEXT: retl
  %src = load i128, ptr %src.ptr, align 1
  %byteOff = load i128, ptr %byteOff.ptr, align 1
  %bitOff = shl i128 %byteOff, 3
  %res = ashr i128 %src, %bitOff
  store i128 %res, ptr %dst, align 1
  ret void
}

1072 define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
1073 ; X64-SSE2-LABEL: lshr_32bytes:
1074 ; X64-SSE2: # %bb.0:
1075 ; X64-SSE2-NEXT: movq (%rdi), %rax
1076 ; X64-SSE2-NEXT: movq 8(%rdi), %rcx
1077 ; X64-SSE2-NEXT: movq 16(%rdi), %r8
1078 ; X64-SSE2-NEXT: movq 24(%rdi), %rdi
1079 ; X64-SSE2-NEXT: movzbl (%rsi), %esi
1080 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1081 ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
1082 ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1083 ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
1084 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1085 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1086 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1087 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1088 ; X64-SSE2-NEXT: andl $31, %esi
1089 ; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
1090 ; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
1091 ; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
1092 ; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
1093 ; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
1094 ; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
1095 ; X64-SSE2-NEXT: movq %rax, (%rdx)
1096 ; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
1097 ; X64-SSE2-NEXT: retq
1099 ; X64-SSE42-LABEL: lshr_32bytes:
1100 ; X64-SSE42: # %bb.0:
1101 ; X64-SSE42-NEXT: movups (%rdi), %xmm0
1102 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
1103 ; X64-SSE42-NEXT: movzbl (%rsi), %eax
1104 ; X64-SSE42-NEXT: xorps %xmm2, %xmm2
1105 ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
1106 ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
1107 ; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
1108 ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
1109 ; X64-SSE42-NEXT: andl $31, %eax
1110 ; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
1111 ; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1
1112 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
1113 ; X64-SSE42-NEXT: movups %xmm0, (%rdx)
1114 ; X64-SSE42-NEXT: retq
1116 ; X64-AVX-LABEL: lshr_32bytes:
1118 ; X64-AVX-NEXT: vmovups (%rdi), %ymm0
1119 ; X64-AVX-NEXT: movzbl (%rsi), %eax
1120 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1121 ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
1122 ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1123 ; X64-AVX-NEXT: andl $31, %eax
1124 ; X64-AVX-NEXT: vmovups -64(%rsp,%rax), %xmm0
1125 ; X64-AVX-NEXT: vmovups -48(%rsp,%rax), %xmm1
1126 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
1127 ; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
1128 ; X64-AVX-NEXT: vzeroupper
1129 ; X64-AVX-NEXT: retq
1131 ; X86-SSE2-LABEL: lshr_32bytes:
1132 ; X86-SSE2: # %bb.0:
1133 ; X86-SSE2-NEXT: pushl %ebp
1134 ; X86-SSE2-NEXT: pushl %ebx
1135 ; X86-SSE2-NEXT: pushl %edi
1136 ; X86-SSE2-NEXT: pushl %esi
1137 ; X86-SSE2-NEXT: subl $72, %esp
1138 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1139 ; X86-SSE2-NEXT: movl (%eax), %ecx
1140 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1141 ; X86-SSE2-NEXT: movl 4(%eax), %ecx
1142 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
1143 ; X86-SSE2-NEXT: movl 8(%eax), %esi
1144 ; X86-SSE2-NEXT: movl 12(%eax), %edi
1145 ; X86-SSE2-NEXT: movl 16(%eax), %ebx
1146 ; X86-SSE2-NEXT: movl 20(%eax), %ebp
1147 ; X86-SSE2-NEXT: movl 24(%eax), %edx
1148 ; X86-SSE2-NEXT: movl 28(%eax), %ecx
1149 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1150 ; X86-SSE2-NEXT: movzbl (%eax), %eax
1151 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1152 ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
1153 ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
1154 ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
1155 ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
1156 ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
1157 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
1158 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1159 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1160 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1161 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1162 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1163 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1164 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1165 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1166 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1167 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1168 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1169 ; X86-SSE2-NEXT: andl $31, %eax
1170 ; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
1171 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1172 ; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
1173 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
1174 ; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
1175 ; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
1176 ; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
1177 ; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
1178 ; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
1179 ; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
1180 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1181 ; X86-SSE2-NEXT: movl %ecx, 24(%eax)
1182 ; X86-SSE2-NEXT: movl %edx, 28(%eax)
1183 ; X86-SSE2-NEXT: movl %ebp, 16(%eax)
1184 ; X86-SSE2-NEXT: movl %ebx, 20(%eax)
1185 ; X86-SSE2-NEXT: movl %edi, 8(%eax)
1186 ; X86-SSE2-NEXT: movl %esi, 12(%eax)
1187 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1188 ; X86-SSE2-NEXT: movl %ecx, (%eax)
1189 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
1190 ; X86-SSE2-NEXT: movl %ecx, 4(%eax)
1191 ; X86-SSE2-NEXT: addl $72, %esp
1192 ; X86-SSE2-NEXT: popl %esi
1193 ; X86-SSE2-NEXT: popl %edi
1194 ; X86-SSE2-NEXT: popl %ebx
1195 ; X86-SSE2-NEXT: popl %ebp
1196 ; X86-SSE2-NEXT: retl
1198 ; X86-SSE42-LABEL: lshr_32bytes:
1199 ; X86-SSE42: # %bb.0:
1200 ; X86-SSE42-NEXT: subl $64, %esp
1201 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
1202 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
1203 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
1204 ; X86-SSE42-NEXT: movups (%edx), %xmm0
1205 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1
1206 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx
1207 ; X86-SSE42-NEXT: xorps %xmm2, %xmm2
1208 ; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
1209 ; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
1210 ; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
1211 ; X86-SSE42-NEXT: movups %xmm0, (%esp)
1212 ; X86-SSE42-NEXT: andl $31, %ecx
1213 ; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
1214 ; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
1215 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
1216 ; X86-SSE42-NEXT: movups %xmm0, (%eax)
1217 ; X86-SSE42-NEXT: addl $64, %esp
1218 ; X86-SSE42-NEXT: retl
1220 ; X86-AVX-LABEL: lshr_32bytes:
1222 ; X86-AVX-NEXT: subl $64, %esp
1223 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1224 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1225 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1226 ; X86-AVX-NEXT: vmovups (%edx), %ymm0
1227 ; X86-AVX-NEXT: movzbl (%ecx), %ecx
1228 ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1229 ; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
1230 ; X86-AVX-NEXT: vmovups %ymm0, (%esp)
1231 ; X86-AVX-NEXT: andl $31, %ecx
1232 ; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
1233 ; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
1234 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
1235 ; X86-AVX-NEXT: vmovups %xmm0, (%eax)
1236 ; X86-AVX-NEXT: addl $64, %esp
1237 ; X86-AVX-NEXT: vzeroupper
1238 ; X86-AVX-NEXT: retl
1239 %src = load i256, ptr %src.ptr, align 1
1240 %byteOff = load i256, ptr %byteOff.ptr, align 1
1241 %bitOff = shl i256 %byteOff, 3
1242 %res = lshr i256 %src, %bitOff
1243 store i256 %res, ptr %dst, align 1
1244 ret void
1245 }
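; shl_32bytes: loads a 256-bit value and a byte offset, scales the offset to bits (shl by 3), left-shifts the value, and stores the 32-byte result.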
1246 define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
1247 ; X64-SSE2-LABEL: shl_32bytes:
1248 ; X64-SSE2: # %bb.0:
1249 ; X64-SSE2-NEXT: movq (%rdi), %rax
1250 ; X64-SSE2-NEXT: movq 8(%rdi), %rcx
1251 ; X64-SSE2-NEXT: movq 16(%rdi), %r8
1252 ; X64-SSE2-NEXT: movq 24(%rdi), %rdi
1253 ; X64-SSE2-NEXT: movzbl (%rsi), %esi
1254 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1255 ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
1256 ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1257 ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
1258 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1259 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1260 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1261 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1262 ; X64-SSE2-NEXT: andb $31, %sil
1263 ; X64-SSE2-NEXT: negb %sil
1264 ; X64-SSE2-NEXT: movsbq %sil, %rax
1265 ; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rcx
1266 ; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
1267 ; X64-SSE2-NEXT: movq -8(%rsp,%rax), %rdi
1268 ; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax
1269 ; X64-SSE2-NEXT: movq %rax, 16(%rdx)
1270 ; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
1271 ; X64-SSE2-NEXT: movq %rcx, (%rdx)
1272 ; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
1273 ; X64-SSE2-NEXT: retq
1275 ; X64-SSE42-LABEL: shl_32bytes:
1276 ; X64-SSE42: # %bb.0:
1277 ; X64-SSE42-NEXT: movups (%rdi), %xmm0
1278 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
1279 ; X64-SSE42-NEXT: movzbl (%rsi), %eax
1280 ; X64-SSE42-NEXT: xorps %xmm2, %xmm2
1281 ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
1282 ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
1283 ; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
1284 ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
1285 ; X64-SSE42-NEXT: andb $31, %al
1286 ; X64-SSE42-NEXT: negb %al
1287 ; X64-SSE42-NEXT: movsbq %al, %rax
1288 ; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm0
1289 ; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm1
1290 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
1291 ; X64-SSE42-NEXT: movups %xmm0, (%rdx)
1292 ; X64-SSE42-NEXT: retq
1294 ; X64-AVX-LABEL: shl_32bytes:
1295 ; X64-AVX: # %bb.0:
1296 ; X64-AVX-NEXT: vmovups (%rdi), %ymm0
1297 ; X64-AVX-NEXT: movzbl (%rsi), %eax
1298 ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1299 ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
1300 ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1301 ; X64-AVX-NEXT: andb $31, %al
1302 ; X64-AVX-NEXT: negb %al
1303 ; X64-AVX-NEXT: movsbq %al, %rax
1304 ; X64-AVX-NEXT: vmovups -32(%rsp,%rax), %xmm0
1305 ; X64-AVX-NEXT: vmovups -16(%rsp,%rax), %xmm1
1306 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
1307 ; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
1308 ; X64-AVX-NEXT: vzeroupper
1309 ; X64-AVX-NEXT: retq
1311 ; X86-SSE2-LABEL: shl_32bytes:
1312 ; X86-SSE2: # %bb.0:
1313 ; X86-SSE2-NEXT: pushl %ebp
1314 ; X86-SSE2-NEXT: pushl %ebx
1315 ; X86-SSE2-NEXT: pushl %edi
1316 ; X86-SSE2-NEXT: pushl %esi
1317 ; X86-SSE2-NEXT: subl $72, %esp
1318 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1319 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
1320 ; X86-SSE2-NEXT: movl (%edi), %ecx
1321 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1322 ; X86-SSE2-NEXT: movl 4(%edi), %ecx
1323 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
1324 ; X86-SSE2-NEXT: movl 8(%edi), %esi
1325 ; X86-SSE2-NEXT: movl 12(%edi), %ebx
1326 ; X86-SSE2-NEXT: movl 16(%edi), %ebp
1327 ; X86-SSE2-NEXT: movzbl (%eax), %eax
1328 ; X86-SSE2-NEXT: movl 20(%edi), %edx
1329 ; X86-SSE2-NEXT: movl 24(%edi), %ecx
1330 ; X86-SSE2-NEXT: movl 28(%edi), %edi
1331 ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
1332 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1333 ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
1334 ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
1335 ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
1336 ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
1337 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
1338 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1339 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1340 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1341 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1342 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1343 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1344 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1345 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1346 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1347 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1348 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1349 ; X86-SSE2-NEXT: andb $31, %al
1350 ; X86-SSE2-NEXT: negb %al
1351 ; X86-SSE2-NEXT: movsbl %al, %edx
1352 ; X86-SSE2-NEXT: movl 40(%esp,%edx), %eax
1353 ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1354 ; X86-SSE2-NEXT: movl 44(%esp,%edx), %eax
1355 ; X86-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
1356 ; X86-SSE2-NEXT: movl 52(%esp,%edx), %esi
1357 ; X86-SSE2-NEXT: movl 48(%esp,%edx), %edi
1358 ; X86-SSE2-NEXT: movl 60(%esp,%edx), %ebx
1359 ; X86-SSE2-NEXT: movl 56(%esp,%edx), %ebp
1360 ; X86-SSE2-NEXT: movl 68(%esp,%edx), %ecx
1361 ; X86-SSE2-NEXT: movl 64(%esp,%edx), %edx
1362 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1363 ; X86-SSE2-NEXT: movl %edx, 24(%eax)
1364 ; X86-SSE2-NEXT: movl %ecx, 28(%eax)
1365 ; X86-SSE2-NEXT: movl %ebp, 16(%eax)
1366 ; X86-SSE2-NEXT: movl %ebx, 20(%eax)
1367 ; X86-SSE2-NEXT: movl %edi, 8(%eax)
1368 ; X86-SSE2-NEXT: movl %esi, 12(%eax)
1369 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1370 ; X86-SSE2-NEXT: movl %ecx, (%eax)
1371 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
1372 ; X86-SSE2-NEXT: movl %ecx, 4(%eax)
1373 ; X86-SSE2-NEXT: addl $72, %esp
1374 ; X86-SSE2-NEXT: popl %esi
1375 ; X86-SSE2-NEXT: popl %edi
1376 ; X86-SSE2-NEXT: popl %ebx
1377 ; X86-SSE2-NEXT: popl %ebp
1378 ; X86-SSE2-NEXT: retl
1380 ; X86-SSE42-LABEL: shl_32bytes:
1381 ; X86-SSE42: # %bb.0:
1382 ; X86-SSE42-NEXT: subl $64, %esp
1383 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
1384 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
1385 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
1386 ; X86-SSE42-NEXT: movups (%edx), %xmm0
1387 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1
1388 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx
1389 ; X86-SSE42-NEXT: xorps %xmm2, %xmm2
1390 ; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
1391 ; X86-SSE42-NEXT: movups %xmm2, (%esp)
1392 ; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
1393 ; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
1394 ; X86-SSE42-NEXT: andb $31, %cl
1395 ; X86-SSE42-NEXT: negb %cl
1396 ; X86-SSE42-NEXT: movsbl %cl, %ecx
1397 ; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0
1398 ; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1
1399 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
1400 ; X86-SSE42-NEXT: movups %xmm0, (%eax)
1401 ; X86-SSE42-NEXT: addl $64, %esp
1402 ; X86-SSE42-NEXT: retl
1404 ; X86-AVX-LABEL: shl_32bytes:
1405 ; X86-AVX: # %bb.0:
1406 ; X86-AVX-NEXT: subl $64, %esp
1407 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1408 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1409 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1410 ; X86-AVX-NEXT: vmovups (%edx), %ymm0
1411 ; X86-AVX-NEXT: movzbl (%ecx), %ecx
1412 ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1413 ; X86-AVX-NEXT: vmovups %ymm1, (%esp)
1414 ; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
1415 ; X86-AVX-NEXT: andb $31, %cl
1416 ; X86-AVX-NEXT: negb %cl
1417 ; X86-AVX-NEXT: movsbl %cl, %ecx
1418 ; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
1419 ; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
1420 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
1421 ; X86-AVX-NEXT: vmovups %xmm0, (%eax)
1422 ; X86-AVX-NEXT: addl $64, %esp
1423 ; X86-AVX-NEXT: vzeroupper
1424 ; X86-AVX-NEXT: retl
1425 %src = load i256, ptr %src.ptr, align 1
1426 %byteOff = load i256, ptr %byteOff.ptr, align 1
1427 %bitOff = shl i256 %byteOff, 3
1428 %res = shl i256 %src, %bitOff
1429 store i256 %res, ptr %dst, align 1
1430 ret void
1431 }
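; ashr_32bytes: same pattern as above, but performs an arithmetic (sign-extending) right shift of the 256-bit value.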
1432 define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
1433 ; X64-SSE2-LABEL: ashr_32bytes:
1434 ; X64-SSE2: # %bb.0:
1435 ; X64-SSE2-NEXT: movq (%rdi), %rax
1436 ; X64-SSE2-NEXT: movq 8(%rdi), %rcx
1437 ; X64-SSE2-NEXT: movq 16(%rdi), %r8
1438 ; X64-SSE2-NEXT: movq 24(%rdi), %rdi
1439 ; X64-SSE2-NEXT: movzbl (%rsi), %esi
1440 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1441 ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
1442 ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1443 ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
1444 ; X64-SSE2-NEXT: sarq $63, %rdi
1445 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1446 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1447 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1448 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1449 ; X64-SSE2-NEXT: andl $31, %esi
1450 ; X64-SSE2-NEXT: movq -64(%rsp,%rsi), %rax
1451 ; X64-SSE2-NEXT: movq -56(%rsp,%rsi), %rcx
1452 ; X64-SSE2-NEXT: movq -40(%rsp,%rsi), %rdi
1453 ; X64-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
1454 ; X64-SSE2-NEXT: movq %rsi, 16(%rdx)
1455 ; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
1456 ; X64-SSE2-NEXT: movq %rax, (%rdx)
1457 ; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
1458 ; X64-SSE2-NEXT: retq
1460 ; X64-SSE42-LABEL: ashr_32bytes:
1461 ; X64-SSE42: # %bb.0:
1462 ; X64-SSE42-NEXT: movups (%rdi), %xmm0
1463 ; X64-SSE42-NEXT: movq 16(%rdi), %rax
1464 ; X64-SSE42-NEXT: movq 24(%rdi), %rcx
1465 ; X64-SSE42-NEXT: movzbl (%rsi), %esi
1466 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1467 ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
1468 ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
1469 ; X64-SSE42-NEXT: sarq $63, %rcx
1470 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1471 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1472 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1473 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1474 ; X64-SSE42-NEXT: andl $31, %esi
1475 ; X64-SSE42-NEXT: movups -64(%rsp,%rsi), %xmm0
1476 ; X64-SSE42-NEXT: movups -48(%rsp,%rsi), %xmm1
1477 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
1478 ; X64-SSE42-NEXT: movups %xmm0, (%rdx)
1479 ; X64-SSE42-NEXT: retq
1481 ; X64-AVX-LABEL: ashr_32bytes:
1482 ; X64-AVX: # %bb.0:
1483 ; X64-AVX-NEXT: vmovups (%rdi), %xmm0
1484 ; X64-AVX-NEXT: movq 16(%rdi), %rax
1485 ; X64-AVX-NEXT: movq 24(%rdi), %rcx
1486 ; X64-AVX-NEXT: movzbl (%rsi), %esi
1487 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1488 ; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
1489 ; X64-AVX-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
1490 ; X64-AVX-NEXT: sarq $63, %rcx
1491 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1492 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1493 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1494 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1495 ; X64-AVX-NEXT: andl $31, %esi
1496 ; X64-AVX-NEXT: vmovups -64(%rsp,%rsi), %xmm0
1497 ; X64-AVX-NEXT: vmovups -48(%rsp,%rsi), %xmm1
1498 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
1499 ; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
1500 ; X64-AVX-NEXT: retq
1502 ; X86-SSE2-LABEL: ashr_32bytes:
1503 ; X86-SSE2: # %bb.0:
1504 ; X86-SSE2-NEXT: pushl %ebp
1505 ; X86-SSE2-NEXT: pushl %ebx
1506 ; X86-SSE2-NEXT: pushl %edi
1507 ; X86-SSE2-NEXT: pushl %esi
1508 ; X86-SSE2-NEXT: subl $72, %esp
1509 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1510 ; X86-SSE2-NEXT: movl (%eax), %ecx
1511 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1512 ; X86-SSE2-NEXT: movl 4(%eax), %ecx
1513 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
1514 ; X86-SSE2-NEXT: movl 8(%eax), %edi
1515 ; X86-SSE2-NEXT: movl 12(%eax), %ebx
1516 ; X86-SSE2-NEXT: movl 16(%eax), %ebp
1517 ; X86-SSE2-NEXT: movl 20(%eax), %esi
1518 ; X86-SSE2-NEXT: movl 24(%eax), %edx
1519 ; X86-SSE2-NEXT: movl 28(%eax), %ecx
1520 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1521 ; X86-SSE2-NEXT: movzbl (%eax), %eax
1522 ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
1523 ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
1524 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1525 ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
1526 ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
1527 ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
1528 ; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
1529 ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
1530 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
1531 ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
1532 ; X86-SSE2-NEXT: sarl $31, %ecx
1533 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1534 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1535 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1536 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1537 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1538 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1539 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1540 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1541 ; X86-SSE2-NEXT: andl $31, %eax
1542 ; X86-SSE2-NEXT: movl 8(%esp,%eax), %ecx
1543 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1544 ; X86-SSE2-NEXT: movl 12(%esp,%eax), %ecx
1545 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
1546 ; X86-SSE2-NEXT: movl 20(%esp,%eax), %esi
1547 ; X86-SSE2-NEXT: movl 16(%esp,%eax), %edi
1548 ; X86-SSE2-NEXT: movl 28(%esp,%eax), %ebx
1549 ; X86-SSE2-NEXT: movl 24(%esp,%eax), %ebp
1550 ; X86-SSE2-NEXT: movl 36(%esp,%eax), %edx
1551 ; X86-SSE2-NEXT: movl 32(%esp,%eax), %ecx
1552 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1553 ; X86-SSE2-NEXT: movl %ecx, 24(%eax)
1554 ; X86-SSE2-NEXT: movl %edx, 28(%eax)
1555 ; X86-SSE2-NEXT: movl %ebp, 16(%eax)
1556 ; X86-SSE2-NEXT: movl %ebx, 20(%eax)
1557 ; X86-SSE2-NEXT: movl %edi, 8(%eax)
1558 ; X86-SSE2-NEXT: movl %esi, 12(%eax)
1559 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1560 ; X86-SSE2-NEXT: movl %ecx, (%eax)
1561 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
1562 ; X86-SSE2-NEXT: movl %ecx, 4(%eax)
1563 ; X86-SSE2-NEXT: addl $72, %esp
1564 ; X86-SSE2-NEXT: popl %esi
1565 ; X86-SSE2-NEXT: popl %edi
1566 ; X86-SSE2-NEXT: popl %ebx
1567 ; X86-SSE2-NEXT: popl %ebp
1568 ; X86-SSE2-NEXT: retl
1570 ; X86-SSE42-LABEL: ashr_32bytes:
1571 ; X86-SSE42: # %bb.0:
1572 ; X86-SSE42-NEXT: pushl %ebx
1573 ; X86-SSE42-NEXT: pushl %edi
1574 ; X86-SSE42-NEXT: pushl %esi
1575 ; X86-SSE42-NEXT: subl $64, %esp
1576 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
1577 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
1578 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
1579 ; X86-SSE42-NEXT: movups (%edx), %xmm0
1580 ; X86-SSE42-NEXT: movl 16(%edx), %esi
1581 ; X86-SSE42-NEXT: movl 20(%edx), %edi
1582 ; X86-SSE42-NEXT: movl 24(%edx), %ebx
1583 ; X86-SSE42-NEXT: movl 28(%edx), %edx
1584 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx
1585 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1586 ; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
1587 ; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
1588 ; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
1589 ; X86-SSE42-NEXT: movups %xmm0, (%esp)
1590 ; X86-SSE42-NEXT: sarl $31, %edx
1591 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1592 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1593 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1594 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1595 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1596 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1597 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1598 ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
1599 ; X86-SSE42-NEXT: andl $31, %ecx
1600 ; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
1601 ; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
1602 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
1603 ; X86-SSE42-NEXT: movups %xmm0, (%eax)
1604 ; X86-SSE42-NEXT: addl $64, %esp
1605 ; X86-SSE42-NEXT: popl %esi
1606 ; X86-SSE42-NEXT: popl %edi
1607 ; X86-SSE42-NEXT: popl %ebx
1608 ; X86-SSE42-NEXT: retl
1610 ; X86-AVX-LABEL: ashr_32bytes:
1611 ; X86-AVX: # %bb.0:
1612 ; X86-AVX-NEXT: pushl %ebx
1613 ; X86-AVX-NEXT: pushl %edi
1614 ; X86-AVX-NEXT: pushl %esi
1615 ; X86-AVX-NEXT: subl $64, %esp
1616 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
1617 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
1618 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
1619 ; X86-AVX-NEXT: vmovups (%edx), %xmm0
1620 ; X86-AVX-NEXT: movl 16(%edx), %esi
1621 ; X86-AVX-NEXT: movl 20(%edx), %edi
1622 ; X86-AVX-NEXT: movl 24(%edx), %ebx
1623 ; X86-AVX-NEXT: movl 28(%edx), %edx
1624 ; X86-AVX-NEXT: movzbl (%ecx), %ecx
1625 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1626 ; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
1627 ; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
1628 ; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
1629 ; X86-AVX-NEXT: vmovups %xmm0, (%esp)
1630 ; X86-AVX-NEXT: sarl $31, %edx
1631 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1632 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1633 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1634 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1635 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1636 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1637 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1638 ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
1639 ; X86-AVX-NEXT: andl $31, %ecx
1640 ; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
1641 ; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
1642 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
1643 ; X86-AVX-NEXT: vmovups %xmm0, (%eax)
1644 ; X86-AVX-NEXT: addl $64, %esp
1645 ; X86-AVX-NEXT: popl %esi
1646 ; X86-AVX-NEXT: popl %edi
1647 ; X86-AVX-NEXT: popl %ebx
1648 ; X86-AVX-NEXT: retl
1649 %src = load i256, ptr %src.ptr, align 1
1650 %byteOff = load i256, ptr %byteOff.ptr, align 1
1651 %bitOff = shl i256 %byteOff, 3
1652 %res = ashr i256 %src, %bitOff
1653 store i256 %res, ptr %dst, align 1
1654 ret void
1655 }
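; lshr_64bytes: logical right shift of a 512-bit (64-byte) value by a byte offset scaled to bits.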
1657 define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
1658 ; X64-SSE2-LABEL: lshr_64bytes:
1659 ; X64-SSE2: # %bb.0:
1660 ; X64-SSE2-NEXT: pushq %rbx
1661 ; X64-SSE2-NEXT: movq (%rdi), %rax
1662 ; X64-SSE2-NEXT: movq 8(%rdi), %rcx
1663 ; X64-SSE2-NEXT: movq 16(%rdi), %r8
1664 ; X64-SSE2-NEXT: movq 24(%rdi), %r9
1665 ; X64-SSE2-NEXT: movq 32(%rdi), %r10
1666 ; X64-SSE2-NEXT: movq 40(%rdi), %r11
1667 ; X64-SSE2-NEXT: movq 48(%rdi), %rbx
1668 ; X64-SSE2-NEXT: movq 56(%rdi), %rdi
1669 ; X64-SSE2-NEXT: movl (%rsi), %esi
1670 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
1671 ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
1672 ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
1673 ; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
1674 ; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
1675 ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
1676 ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
1677 ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
1678 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1679 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1680 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1681 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1682 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1683 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1684 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1685 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1686 ; X64-SSE2-NEXT: andl $63, %esi
1687 ; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
1688 ; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
1689 ; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
1690 ; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
1691 ; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
1692 ; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
1693 ; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
1694 ; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
1695 ; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
1696 ; X64-SSE2-NEXT: movq %r11, 56(%rdx)
1697 ; X64-SSE2-NEXT: movq %r10, 32(%rdx)
1698 ; X64-SSE2-NEXT: movq %r9, 40(%rdx)
1699 ; X64-SSE2-NEXT: movq %r8, 16(%rdx)
1700 ; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
1701 ; X64-SSE2-NEXT: movq %rax, (%rdx)
1702 ; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
1703 ; X64-SSE2-NEXT: popq %rbx
1704 ; X64-SSE2-NEXT: retq
1706 ; X64-SSE42-LABEL: lshr_64bytes:
1707 ; X64-SSE42: # %bb.0:
1708 ; X64-SSE42-NEXT: movups (%rdi), %xmm0
1709 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
1710 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
1711 ; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
1712 ; X64-SSE42-NEXT: movl (%rsi), %eax
1713 ; X64-SSE42-NEXT: xorps %xmm4, %xmm4
1714 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
1715 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
1716 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
1717 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
1718 ; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
1719 ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
1720 ; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
1721 ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
1722 ; X64-SSE42-NEXT: andl $63, %eax
1723 ; X64-SSE42-NEXT: movups -128(%rsp,%rax), %xmm0
1724 ; X64-SSE42-NEXT: movups -112(%rsp,%rax), %xmm1
1725 ; X64-SSE42-NEXT: movups -96(%rsp,%rax), %xmm2
1726 ; X64-SSE42-NEXT: movups -80(%rsp,%rax), %xmm3
1727 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
1728 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
1729 ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
1730 ; X64-SSE42-NEXT: movups %xmm0, (%rdx)
1731 ; X64-SSE42-NEXT: retq
1733 ; X64-AVX1-LABEL: lshr_64bytes:
1734 ; X64-AVX1: # %bb.0:
1735 ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
1736 ; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
1737 ; X64-AVX1-NEXT: movl (%rsi), %eax
1738 ; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
1739 ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
1740 ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
1741 ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
1742 ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1743 ; X64-AVX1-NEXT: andl $63, %eax
1744 ; X64-AVX1-NEXT: vmovups -128(%rsp,%rax), %xmm0
1745 ; X64-AVX1-NEXT: vmovups -112(%rsp,%rax), %xmm1
1746 ; X64-AVX1-NEXT: vmovups -96(%rsp,%rax), %xmm2
1747 ; X64-AVX1-NEXT: vmovups -80(%rsp,%rax), %xmm3
1748 ; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
1749 ; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
1750 ; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
1751 ; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
1752 ; X64-AVX1-NEXT: vzeroupper
1753 ; X64-AVX1-NEXT: retq
1755 ; X64-AVX512-LABEL: lshr_64bytes:
1756 ; X64-AVX512: # %bb.0:
1757 ; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
1758 ; X64-AVX512-NEXT: movl (%rsi), %eax
1759 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
1760 ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
1761 ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
1762 ; X64-AVX512-NEXT: andl $63, %eax
1763 ; X64-AVX512-NEXT: vmovups -128(%rsp,%rax), %xmm0
1764 ; X64-AVX512-NEXT: vmovups -112(%rsp,%rax), %xmm1
1765 ; X64-AVX512-NEXT: vmovups -96(%rsp,%rax), %xmm2
1766 ; X64-AVX512-NEXT: vmovups -80(%rsp,%rax), %xmm3
1767 ; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
1768 ; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
1769 ; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
1770 ; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
1771 ; X64-AVX512-NEXT: vzeroupper
1772 ; X64-AVX512-NEXT: retq
1774 ; X86-SSE2-LABEL: lshr_64bytes:
1775 ; X86-SSE2: # %bb.0:
1776 ; X86-SSE2-NEXT: pushl %ebp
1777 ; X86-SSE2-NEXT: pushl %ebx
1778 ; X86-SSE2-NEXT: pushl %edi
1779 ; X86-SSE2-NEXT: pushl %esi
1780 ; X86-SSE2-NEXT: subl $168, %esp
1781 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1782 ; X86-SSE2-NEXT: movl (%eax), %ecx
1783 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1784 ; X86-SSE2-NEXT: movl 4(%eax), %ecx
1785 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1786 ; X86-SSE2-NEXT: movl 8(%eax), %ecx
1787 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1788 ; X86-SSE2-NEXT: movl 12(%eax), %ecx
1789 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1790 ; X86-SSE2-NEXT: movl 16(%eax), %ecx
1791 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1792 ; X86-SSE2-NEXT: movl 20(%eax), %ecx
1793 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1794 ; X86-SSE2-NEXT: movl 24(%eax), %ecx
1795 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1796 ; X86-SSE2-NEXT: movl 28(%eax), %ecx
1797 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1798 ; X86-SSE2-NEXT: movl 32(%eax), %ecx
1799 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1800 ; X86-SSE2-NEXT: movl 36(%eax), %ecx
1801 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
1802 ; X86-SSE2-NEXT: movl 40(%eax), %ebp
1803 ; X86-SSE2-NEXT: movl 44(%eax), %ebx
1804 ; X86-SSE2-NEXT: movl 48(%eax), %edi
1805 ; X86-SSE2-NEXT: movl 52(%eax), %esi
1806 ; X86-SSE2-NEXT: movl 56(%eax), %edx
1807 ; X86-SSE2-NEXT: movl 60(%eax), %ecx
1808 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1809 ; X86-SSE2-NEXT: movl (%eax), %eax
1810 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1811 ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
1812 ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
1813 ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
1814 ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
1815 ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
1816 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
1817 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1818 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1819 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1820 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1821 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1822 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1823 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1824 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1825 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1826 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1827 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1828 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1829 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1830 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1831 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1832 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1833 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1834 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1835 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
1836 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1837 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1838 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1839 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1840 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1841 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1842 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1843 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1844 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1845 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1846 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1847 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1848 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1849 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1850 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1851 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
1852 ; X86-SSE2-NEXT: andl $63, %eax
1853 ; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
1854 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1855 ; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
1856 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1857 ; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
1858 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1859 ; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
1860 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1861 ; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
1862 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1863 ; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
1864 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1865 ; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
1866 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1867 ; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
1868 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1869 ; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
1870 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1871 ; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
1872 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
1873 ; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
1874 ; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
1875 ; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
1876 ; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
1877 ; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
1878 ; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
1879 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
1880 ; X86-SSE2-NEXT: movl %ecx, 56(%eax)
1881 ; X86-SSE2-NEXT: movl %edx, 60(%eax)
1882 ; X86-SSE2-NEXT: movl %esi, 48(%eax)
1883 ; X86-SSE2-NEXT: movl %edi, 52(%eax)
1884 ; X86-SSE2-NEXT: movl %ebx, 40(%eax)
1885 ; X86-SSE2-NEXT: movl %ebp, 44(%eax)
1886 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
1887 ; X86-SSE2-NEXT: movl %ecx, 32(%eax)
1888 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1889 ; X86-SSE2-NEXT: movl %ecx, 36(%eax)
1890 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1891 ; X86-SSE2-NEXT: movl %ecx, 24(%eax)
1892 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1893 ; X86-SSE2-NEXT: movl %ecx, 28(%eax)
1894 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1895 ; X86-SSE2-NEXT: movl %ecx, 16(%eax)
1896 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1897 ; X86-SSE2-NEXT: movl %ecx, 20(%eax)
1898 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1899 ; X86-SSE2-NEXT: movl %ecx, 8(%eax)
1900 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1901 ; X86-SSE2-NEXT: movl %ecx, 12(%eax)
1902 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1903 ; X86-SSE2-NEXT: movl %ecx, (%eax)
1904 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1905 ; X86-SSE2-NEXT: movl %ecx, 4(%eax)
1906 ; X86-SSE2-NEXT: addl $168, %esp
1907 ; X86-SSE2-NEXT: popl %esi
1908 ; X86-SSE2-NEXT: popl %edi
1909 ; X86-SSE2-NEXT: popl %ebx
1910 ; X86-SSE2-NEXT: popl %ebp
1911 ; X86-SSE2-NEXT: retl
1913 ; X86-SSE42-LABEL: lshr_64bytes:
1914 ; X86-SSE42: # %bb.0:
1915 ; X86-SSE42-NEXT: subl $128, %esp
1916 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
1917 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
1918 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
1919 ; X86-SSE42-NEXT: movups (%edx), %xmm0
1920 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1
1921 ; X86-SSE42-NEXT: movups 32(%edx), %xmm2
1922 ; X86-SSE42-NEXT: movups 48(%edx), %xmm3
1923 ; X86-SSE42-NEXT: movl (%ecx), %ecx
1924 ; X86-SSE42-NEXT: xorps %xmm4, %xmm4
1925 ; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
1926 ; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
1927 ; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
1928 ; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
1929 ; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
1930 ; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
1931 ; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
1932 ; X86-SSE42-NEXT: movups %xmm0, (%esp)
1933 ; X86-SSE42-NEXT: andl $63, %ecx
1934 ; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
1935 ; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
1936 ; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
1937 ; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
1938 ; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
1939 ; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
1940 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
1941 ; X86-SSE42-NEXT: movups %xmm0, (%eax)
1942 ; X86-SSE42-NEXT: addl $128, %esp
1943 ; X86-SSE42-NEXT: retl
1945 ; X86-AVX1-LABEL: lshr_64bytes:
1946 ; X86-AVX1: # %bb.0:
1947 ; X86-AVX1-NEXT: subl $128, %esp
1948 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
1949 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
1950 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
1951 ; X86-AVX1-NEXT: vmovups (%edx), %ymm0
1952 ; X86-AVX1-NEXT: vmovups 32(%edx), %ymm1
1953 ; X86-AVX1-NEXT: movl (%ecx), %ecx
1954 ; X86-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
1955 ; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
1956 ; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
1957 ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
1958 ; X86-AVX1-NEXT: vmovups %ymm0, (%esp)
1959 ; X86-AVX1-NEXT: andl $63, %ecx
1960 ; X86-AVX1-NEXT: vmovups (%esp,%ecx), %xmm0
1961 ; X86-AVX1-NEXT: vmovups 16(%esp,%ecx), %xmm1
1962 ; X86-AVX1-NEXT: vmovups 32(%esp,%ecx), %xmm2
1963 ; X86-AVX1-NEXT: vmovups 48(%esp,%ecx), %xmm3
1964 ; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
1965 ; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
1966 ; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
1967 ; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
1968 ; X86-AVX1-NEXT: addl $128, %esp
1969 ; X86-AVX1-NEXT: vzeroupper
1970 ; X86-AVX1-NEXT: retl
1972 ; X86-AVX512-LABEL: lshr_64bytes:
1973 ; X86-AVX512: # %bb.0:
1974 ; X86-AVX512-NEXT: subl $128, %esp
1975 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
1976 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
1977 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
1978 ; X86-AVX512-NEXT: vmovups (%edx), %zmm0
1979 ; X86-AVX512-NEXT: movl (%ecx), %ecx
1980 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
1981 ; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
1982 ; X86-AVX512-NEXT: vmovups %zmm0, (%esp)
1983 ; X86-AVX512-NEXT: andl $63, %ecx
1984 ; X86-AVX512-NEXT: vmovups (%esp,%ecx), %xmm0
1985 ; X86-AVX512-NEXT: vmovups 16(%esp,%ecx), %xmm1
1986 ; X86-AVX512-NEXT: vmovups 32(%esp,%ecx), %xmm2
1987 ; X86-AVX512-NEXT: vmovups 48(%esp,%ecx), %xmm3
1988 ; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
1989 ; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
1990 ; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
1991 ; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
1992 ; X86-AVX512-NEXT: addl $128, %esp
1993 ; X86-AVX512-NEXT: vzeroupper
1994 ; X86-AVX512-NEXT: retl
1995 %src = load i512, ptr %src.ptr, align 1
1996 %byteOff = load i512, ptr %byteOff.ptr, align 1
1997 %bitOff = shl i512 %byteOff, 3
1998 %res = lshr i512 %src, %bitOff
1999 store i512 %res, ptr %dst, align 1
2000 ret void
2001 }
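; shl_64bytes: left shift of a 512-bit value by a byte offset scaled to bits.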
2002 define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
2003 ; X64-SSE2-LABEL: shl_64bytes:
2004 ; X64-SSE2: # %bb.0:
2005 ; X64-SSE2-NEXT: pushq %rbx
2006 ; X64-SSE2-NEXT: movq (%rdi), %rax
2007 ; X64-SSE2-NEXT: movq 8(%rdi), %rcx
2008 ; X64-SSE2-NEXT: movq 16(%rdi), %r8
2009 ; X64-SSE2-NEXT: movq 24(%rdi), %r9
2010 ; X64-SSE2-NEXT: movq 32(%rdi), %r10
2011 ; X64-SSE2-NEXT: movq 40(%rdi), %r11
2012 ; X64-SSE2-NEXT: movq 48(%rdi), %rbx
2013 ; X64-SSE2-NEXT: movq 56(%rdi), %rdi
2014 ; X64-SSE2-NEXT: movl (%rsi), %esi
2015 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2016 ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
2017 ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
2018 ; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
2019 ; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
2020 ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
2021 ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2022 ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
2023 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2024 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2025 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2026 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2027 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2028 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2029 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2030 ; X64-SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
2031 ; X64-SSE2-NEXT: andl $63, %esi
2032 ; X64-SSE2-NEXT: negl %esi
2033 ; X64-SSE2-NEXT: movslq %esi, %rax
2034 ; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx
2035 ; X64-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
2036 ; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rdi
2037 ; X64-SSE2-NEXT: movq -48(%rsp,%rax), %r8
2038 ; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r9
2039 ; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r10
2040 ; X64-SSE2-NEXT: movq -8(%rsp,%rax), %r11
2041 ; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax
2042 ; X64-SSE2-NEXT: movq %rax, 48(%rdx)
2043 ; X64-SSE2-NEXT: movq %r11, 56(%rdx)
2044 ; X64-SSE2-NEXT: movq %r10, 32(%rdx)
2045 ; X64-SSE2-NEXT: movq %r9, 40(%rdx)
2046 ; X64-SSE2-NEXT: movq %r8, 16(%rdx)
2047 ; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
2048 ; X64-SSE2-NEXT: movq %rcx, (%rdx)
2049 ; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
2050 ; X64-SSE2-NEXT: popq %rbx
2051 ; X64-SSE2-NEXT: retq
2053 ; X64-SSE42-LABEL: shl_64bytes:
2054 ; X64-SSE42: # %bb.0:
2055 ; X64-SSE42-NEXT: movups (%rdi), %xmm0
2056 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
2057 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
2058 ; X64-SSE42-NEXT: movups 48(%rdi), %xmm3
2059 ; X64-SSE42-NEXT: movl (%rsi), %eax
2060 ; X64-SSE42-NEXT: xorps %xmm4, %xmm4
2061 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
2062 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
2063 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
2064 ; X64-SSE42-NEXT: movups %xmm4, -{{[0-9]+}}(%rsp)
2065 ; X64-SSE42-NEXT: movups %xmm3, -{{[0-9]+}}(%rsp)
2066 ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
2067 ; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
2068 ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
2069 ; X64-SSE42-NEXT: andl $63, %eax
2070 ; X64-SSE42-NEXT: negl %eax
2071 ; X64-SSE42-NEXT: cltq
2072 ; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0
2073 ; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1
2074 ; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm2
2075 ; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm3
2076 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
2077 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
2078 ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
2079 ; X64-SSE42-NEXT: movups %xmm0, (%rdx)
2080 ; X64-SSE42-NEXT: retq
2082 ; X64-AVX1-LABEL: shl_64bytes:
2083 ; X64-AVX1: # %bb.0:
2084 ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
2085 ; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
2086 ; X64-AVX1-NEXT: movl (%rsi), %eax
2087 ; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
2088 ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
2089 ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
2090 ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
2091 ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
2092 ; X64-AVX1-NEXT: andl $63, %eax
2093 ; X64-AVX1-NEXT: negl %eax
2094 ; X64-AVX1-NEXT: cltq
2095 ; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0
2096 ; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1
2097 ; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2
2098 ; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3
2099 ; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
2100 ; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
2101 ; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
2102 ; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
2103 ; X64-AVX1-NEXT: vzeroupper
2104 ; X64-AVX1-NEXT: retq
2106 ; X64-AVX512-LABEL: shl_64bytes:
2107 ; X64-AVX512: # %bb.0:
2108 ; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
2109 ; X64-AVX512-NEXT: movl (%rsi), %eax
2110 ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
2111 ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
2112 ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
2113 ; X64-AVX512-NEXT: andl $63, %eax
2114 ; X64-AVX512-NEXT: negl %eax
2115 ; X64-AVX512-NEXT: cltq
2116 ; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0
2117 ; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1
2118 ; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2
2119 ; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3
2120 ; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
2121 ; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
2122 ; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
2123 ; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
2124 ; X64-AVX512-NEXT: vzeroupper
2125 ; X64-AVX512-NEXT: retq
2127 ; X86-SSE2-LABEL: shl_64bytes:
2128 ; X86-SSE2: # %bb.0:
2129 ; X86-SSE2-NEXT: pushl %ebp
2130 ; X86-SSE2-NEXT: pushl %ebx
2131 ; X86-SSE2-NEXT: pushl %edi
2132 ; X86-SSE2-NEXT: pushl %esi
2133 ; X86-SSE2-NEXT: subl $168, %esp
2134 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
2135 ; X86-SSE2-NEXT: movl (%eax), %ecx
2136 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2137 ; X86-SSE2-NEXT: movl 4(%eax), %ecx
2138 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2139 ; X86-SSE2-NEXT: movl 8(%eax), %ecx
2140 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2141 ; X86-SSE2-NEXT: movl 12(%eax), %ecx
2142 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2143 ; X86-SSE2-NEXT: movl 16(%eax), %ecx
2144 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2145 ; X86-SSE2-NEXT: movl 20(%eax), %ecx
2146 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2147 ; X86-SSE2-NEXT: movl 24(%eax), %ecx
2148 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2149 ; X86-SSE2-NEXT: movl 28(%eax), %ecx
2150 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2151 ; X86-SSE2-NEXT: movl 32(%eax), %ecx
2152 ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2153 ; X86-SSE2-NEXT: movl 36(%eax), %ecx
2154 ; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
2155 ; X86-SSE2-NEXT: movl 40(%eax), %ebp
2156 ; X86-SSE2-NEXT: movl 44(%eax), %ebx
2157 ; X86-SSE2-NEXT: movl 48(%eax), %edi
2158 ; X86-SSE2-NEXT: movl 52(%eax), %esi
2159 ; X86-SSE2-NEXT: movl 56(%eax), %edx
2160 ; X86-SSE2-NEXT: movl 60(%eax), %ecx
2161 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
2162 ; X86-SSE2-NEXT: movl (%eax), %eax
2163 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2164 ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
2165 ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
2166 ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
2167 ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
2168 ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
2169 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
2170 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2171 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2172 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2173 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2174 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2175 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2176 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2177 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2178 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2179 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2180 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2181 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2182 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2183 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2184 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2185 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2186 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2187 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2188 ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
2189 ; X86-SSE2-NEXT: andl $63, %eax
2190 ; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx
2191 ; X86-SSE2-NEXT: subl %eax, %ecx
2192 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2193 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2194 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2195 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2196 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2197 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2198 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2199 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2200 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2201 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2202 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2203 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2204 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2205 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2206 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2207 ; X86-SSE2-NEXT: movl $0, {{[0-9]+}}(%esp)
2208 ; X86-SSE2-NEXT: movl (%ecx), %edx
2209 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2210 ; X86-SSE2-NEXT: movl 4(%ecx), %edx
2211 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2212 ; X86-SSE2-NEXT: movl 12(%ecx), %edx
2213 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2214 ; X86-SSE2-NEXT: movl 8(%ecx), %edx
2215 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2216 ; X86-SSE2-NEXT: movl 20(%ecx), %edx
2217 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2218 ; X86-SSE2-NEXT: movl 16(%ecx), %edx
2219 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2220 ; X86-SSE2-NEXT: movl 28(%ecx), %edx
2221 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2222 ; X86-SSE2-NEXT: movl 24(%ecx), %edx
2223 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2224 ; X86-SSE2-NEXT: movl 36(%ecx), %edx
2225 ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
2226 ; X86-SSE2-NEXT: movl 32(%ecx), %edx
2227 ; X86-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
2228 ; X86-SSE2-NEXT: movl 44(%ecx), %ebp
2229 ; X86-SSE2-NEXT: movl 40(%ecx), %ebx
2230 ; X86-SSE2-NEXT: movl 52(%ecx), %edi
2231 ; X86-SSE2-NEXT: movl 60(%ecx), %esi
2232 ; X86-SSE2-NEXT: movl 56(%ecx), %edx
2233 ; X86-SSE2-NEXT: negl %eax
2234 ; X86-SSE2-NEXT: movl 152(%esp,%eax), %ecx
2235 ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
2236 ; X86-SSE2-NEXT: movl %edx, 56(%eax)
2237 ; X86-SSE2-NEXT: movl %esi, 60(%eax)
2238 ; X86-SSE2-NEXT: movl %ecx, 48(%eax)
2239 ; X86-SSE2-NEXT: movl %edi, 52(%eax)
2240 ; X86-SSE2-NEXT: movl %ebx, 40(%eax)
2241 ; X86-SSE2-NEXT: movl %ebp, 44(%eax)
2242 ; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
2243 ; X86-SSE2-NEXT: movl %ecx, 32(%eax)
2244 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2245 ; X86-SSE2-NEXT: movl %ecx, 36(%eax)
2246 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2247 ; X86-SSE2-NEXT: movl %ecx, 24(%eax)
2248 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2249 ; X86-SSE2-NEXT: movl %ecx, 28(%eax)
2250 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2251 ; X86-SSE2-NEXT: movl %ecx, 16(%eax)
2252 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2253 ; X86-SSE2-NEXT: movl %ecx, 20(%eax)
2254 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2255 ; X86-SSE2-NEXT: movl %ecx, 8(%eax)
2256 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2257 ; X86-SSE2-NEXT: movl %ecx, 12(%eax)
2258 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2259 ; X86-SSE2-NEXT: movl %ecx, (%eax)
2260 ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
2261 ; X86-SSE2-NEXT: movl %ecx, 4(%eax)
2262 ; X86-SSE2-NEXT: addl $168, %esp
2263 ; X86-SSE2-NEXT: popl %esi
2264 ; X86-SSE2-NEXT: popl %edi
2265 ; X86-SSE2-NEXT: popl %ebx
2266 ; X86-SSE2-NEXT: popl %ebp
2267 ; X86-SSE2-NEXT: retl
2269 ; X86-SSE42-LABEL: shl_64bytes:
2270 ; X86-SSE42: # %bb.0:
2271 ; X86-SSE42-NEXT: subl $128, %esp
2272 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
2273 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
2274 ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
2275 ; X86-SSE42-NEXT: movups (%edx), %xmm0
2276 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1
2277 ; X86-SSE42-NEXT: movups 32(%edx), %xmm2
2278 ; X86-SSE42-NEXT: movups 48(%edx), %xmm3
2279 ; X86-SSE42-NEXT: movl (%ecx), %ecx
2280 ; X86-SSE42-NEXT: xorps %xmm4, %xmm4
2281 ; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
2282 ; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
2283 ; X86-SSE42-NEXT: movups %xmm4, {{[0-9]+}}(%esp)
2284 ; X86-SSE42-NEXT: movups %xmm4, (%esp)
2285 ; X86-SSE42-NEXT: movups %xmm3, {{[0-9]+}}(%esp)
2286 ; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
2287 ; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
2288 ; X86-SSE42-NEXT: movups %xmm0, {{[0-9]+}}(%esp)
2289 ; X86-SSE42-NEXT: andl $63, %ecx
2290 ; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx
2291 ; X86-SSE42-NEXT: subl %ecx, %edx
2292 ; X86-SSE42-NEXT: movups (%edx), %xmm0
2293 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1
2294 ; X86-SSE42-NEXT: movups 32(%edx), %xmm2
2295 ; X86-SSE42-NEXT: negl %ecx
2296 ; X86-SSE42-NEXT: movups 112(%esp,%ecx), %xmm3
2297 ; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
2298 ; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
2299 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
2300 ; X86-SSE42-NEXT: movups %xmm0, (%eax)
2301 ; X86-SSE42-NEXT: addl $128, %esp
2302 ; X86-SSE42-NEXT: retl
2304 ; X86-AVX1-LABEL: shl_64bytes:
2305 ; X86-AVX1: # %bb.0:
2306 ; X86-AVX1-NEXT: subl $128, %esp
2307 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
2308 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
2309 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
2310 ; X86-AVX1-NEXT: vmovups (%edx), %ymm0
2311 ; X86-AVX1-NEXT: vmovups 32(%edx), %ymm1
2312 ; X86-AVX1-NEXT: movl (%ecx), %ecx
2313 ; X86-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
2314 ; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
2315 ; X86-AVX1-NEXT: vmovups %ymm2, (%esp)
2316 ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
2317 ; X86-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
2318 ; X86-AVX1-NEXT: andl $63, %ecx
2319 ; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx
2320 ; X86-AVX1-NEXT: subl %ecx, %edx
2321 ; X86-AVX1-NEXT: vmovups (%edx), %xmm0
2322 ; X86-AVX1-NEXT: vmovups 16(%edx), %xmm1
2323 ; X86-AVX1-NEXT: vmovups 32(%edx), %xmm2
2324 ; X86-AVX1-NEXT: negl %ecx
2325 ; X86-AVX1-NEXT: vmovups 112(%esp,%ecx), %xmm3
2326 ; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
2327 ; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
2328 ; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
2329 ; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
2330 ; X86-AVX1-NEXT: addl $128, %esp
2331 ; X86-AVX1-NEXT: vzeroupper
2332 ; X86-AVX1-NEXT: retl
2334 ; X86-AVX512-LABEL: shl_64bytes:
2335 ; X86-AVX512: # %bb.0:
2336 ; X86-AVX512-NEXT: subl $128, %esp
2337 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
2338 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
2339 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
2340 ; X86-AVX512-NEXT: vmovups (%edx), %zmm0
2341 ; X86-AVX512-NEXT: movl (%ecx), %ecx
2342 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
2343 ; X86-AVX512-NEXT: vmovups %zmm1, (%esp)
2344 ; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
2345 ; X86-AVX512-NEXT: andl $63, %ecx
2346 ; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
2347 ; X86-AVX512-NEXT: subl %ecx, %edx
2348 ; X86-AVX512-NEXT: vmovups (%edx), %xmm0
2349 ; X86-AVX512-NEXT: vmovups 16(%edx), %xmm1
2350 ; X86-AVX512-NEXT: vmovups 32(%edx), %xmm2
2351 ; X86-AVX512-NEXT: negl %ecx
2352 ; X86-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3
2353 ; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
2354 ; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
2355 ; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
2356 ; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
2357 ; X86-AVX512-NEXT: addl $128, %esp
2358 ; X86-AVX512-NEXT: vzeroupper
2359 ; X86-AVX512-NEXT: retl
2360 %src = load i512, ptr %src.ptr, align 1
2361 %byteOff = load i512, ptr %byteOff.ptr, align 1
2362 %bitOff = shl i512 %byteOff, 3
2363 %res = shl i512 %src, %bitOff
2364 store i512 %res, ptr %dst, align 1
2365 ret void
2366 }
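; ashr_64bytes: arithmetic right shift of a 512-bit value by a byte offset scaled to bits.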
2367 define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
2368 ; X64-SSE2-LABEL: ashr_64bytes:
2369 ; X64-SSE2: # %bb.0:
2370 ; X64-SSE2-NEXT: pushq %rbx
2371 ; X64-SSE2-NEXT: movq (%rdi), %rax
2372 ; X64-SSE2-NEXT: movq 8(%rdi), %rcx
2373 ; X64-SSE2-NEXT: movq 16(%rdi), %r8
2374 ; X64-SSE2-NEXT: movq 24(%rdi), %r9
2375 ; X64-SSE2-NEXT: movq 32(%rdi), %r10
2376 ; X64-SSE2-NEXT: movq 40(%rdi), %r11
2377 ; X64-SSE2-NEXT: movq 48(%rdi), %rbx
2378 ; X64-SSE2-NEXT: movq 56(%rdi), %rdi
2379 ; X64-SSE2-NEXT: movl (%rsi), %esi
2380 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2381 ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
2382 ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
2383 ; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
2384 ; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
2385 ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
2386 ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2387 ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
2388 ; X64-SSE2-NEXT: sarq $63, %rdi
2389 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2390 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2391 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2392 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2393 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2394 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2395 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2396 ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
2397 ; X64-SSE2-NEXT: andl $63, %esi
2398 ; X64-SSE2-NEXT: movq -128(%rsp,%rsi), %rax
2399 ; X64-SSE2-NEXT: movq -120(%rsp,%rsi), %rcx
2400 ; X64-SSE2-NEXT: movq -104(%rsp,%rsi), %rdi
2401 ; X64-SSE2-NEXT: movq -112(%rsp,%rsi), %r8
2402 ; X64-SSE2-NEXT: movq -88(%rsp,%rsi), %r9
2403 ; X64-SSE2-NEXT: movq -96(%rsp,%rsi), %r10
2404 ; X64-SSE2-NEXT: movq -72(%rsp,%rsi), %r11
2405 ; X64-SSE2-NEXT: movq -80(%rsp,%rsi), %rsi
2406 ; X64-SSE2-NEXT: movq %rsi, 48(%rdx)
2407 ; X64-SSE2-NEXT: movq %r11, 56(%rdx)
2408 ; X64-SSE2-NEXT: movq %r10, 32(%rdx)
2409 ; X64-SSE2-NEXT: movq %r9, 40(%rdx)
2410 ; X64-SSE2-NEXT: movq %r8, 16(%rdx)
2411 ; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
2412 ; X64-SSE2-NEXT: movq %rax, (%rdx)
2413 ; X64-SSE2-NEXT: movq %rcx, 8(%rdx)
2414 ; X64-SSE2-NEXT: popq %rbx
2415 ; X64-SSE2-NEXT: retq
2417 ; X64-SSE42-LABEL: ashr_64bytes:
2418 ; X64-SSE42: # %bb.0:
2419 ; X64-SSE42-NEXT: movups (%rdi), %xmm0
2420 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
2421 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2
2422 ; X64-SSE42-NEXT: movq 48(%rdi), %rax
2423 ; X64-SSE42-NEXT: movq 56(%rdi), %rcx
2424 ; X64-SSE42-NEXT: movl (%rsi), %esi
2425 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2426 ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
2427 ; X64-SSE42-NEXT: movups %xmm2, -{{[0-9]+}}(%rsp)
2428 ; X64-SSE42-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp)
2429 ; X64-SSE42-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
2430 ; X64-SSE42-NEXT: sarq $63, %rcx
2431 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2432 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2433 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2434 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2435 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2436 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2437 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2438 ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2439 ; X64-SSE42-NEXT: andl $63, %esi
2440 ; X64-SSE42-NEXT: movups -128(%rsp,%rsi), %xmm0
2441 ; X64-SSE42-NEXT: movups -112(%rsp,%rsi), %xmm1
2442 ; X64-SSE42-NEXT: movups -96(%rsp,%rsi), %xmm2
2443 ; X64-SSE42-NEXT: movups -80(%rsp,%rsi), %xmm3
2444 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx)
2445 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
2446 ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx)
2447 ; X64-SSE42-NEXT: movups %xmm0, (%rdx)
2448 ; X64-SSE42-NEXT: retq
2450 ; X64-AVX-LABEL: ashr_64bytes:
2451 ; X64-AVX: # %bb.0:
2452 ; X64-AVX-NEXT: vmovups (%rdi), %ymm0
2453 ; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
2454 ; X64-AVX-NEXT: movq 48(%rdi), %rax
2455 ; X64-AVX-NEXT: movq 56(%rdi), %rcx
2456 ; X64-AVX-NEXT: movl (%rsi), %esi
2457 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2458 ; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
2459 ; X64-AVX-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
2460 ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
2461 ; X64-AVX-NEXT: sarq $63, %rcx
2462 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2463 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2464 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2465 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2466 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2467 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2468 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2469 ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
2470 ; X64-AVX-NEXT: andl $63, %esi
2471 ; X64-AVX-NEXT: vmovups -128(%rsp,%rsi), %xmm0
2472 ; X64-AVX-NEXT: vmovups -112(%rsp,%rsi), %xmm1
2473 ; X64-AVX-NEXT: vmovups -96(%rsp,%rsi), %xmm2
2474 ; X64-AVX-NEXT: vmovups -80(%rsp,%rsi), %xmm3
2475 ; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
2476 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
2477 ; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
2478 ; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
2479 ; X64-AVX-NEXT: vzeroupper
2480 ; X64-AVX-NEXT: retq
; X86-SSE2-LABEL: ashr_64bytes:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
; X86-SSE2-NEXT: subl $168, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 4(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 8(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 12(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 16(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 20(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 24(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 28(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 32(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 36(%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-SSE2-NEXT: movl 40(%eax), %ebp
; X86-SSE2-NEXT: movl 44(%eax), %ebx
; X86-SSE2-NEXT: movl 48(%eax), %edi
; X86-SSE2-NEXT: movl 52(%eax), %esi
; X86-SSE2-NEXT: movl 56(%eax), %edx
; X86-SSE2-NEXT: movl 60(%eax), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl (%eax), %eax
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: sarl $31, %ecx
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: andl $63, %eax
; X86-SSE2-NEXT: movl 40(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 44(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 52(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 48(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 60(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 56(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 68(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 64(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 76(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 72(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-SSE2-NEXT: movl 84(%esp,%eax), %ebp
; X86-SSE2-NEXT: movl 80(%esp,%eax), %ebx
; X86-SSE2-NEXT: movl 92(%esp,%eax), %edi
; X86-SSE2-NEXT: movl 88(%esp,%eax), %esi
; X86-SSE2-NEXT: movl 100(%esp,%eax), %edx
; X86-SSE2-NEXT: movl 96(%esp,%eax), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %ecx, 56(%eax)
; X86-SSE2-NEXT: movl %edx, 60(%eax)
; X86-SSE2-NEXT: movl %esi, 48(%eax)
; X86-SSE2-NEXT: movl %edi, 52(%eax)
; X86-SSE2-NEXT: movl %ebx, 40(%eax)
; X86-SSE2-NEXT: movl %ebp, 44(%eax)
; X86-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 32(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 36(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 24(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 28(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 16(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 20(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 8(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
; X86-SSE2-NEXT: addl $168, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: ashr_64bytes:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: pushl %ebx
; X86-SSE42-NEXT: pushl %edi
; X86-SSE42-NEXT: pushl %esi
; X86-SSE42-NEXT: subl $128, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movups 16(%edx), %xmm1
; X86-SSE42-NEXT: movups 32(%edx), %xmm2
; X86-SSE42-NEXT: movl 48(%edx), %esi
; X86-SSE42-NEXT: movl 52(%edx), %edi
; X86-SSE42-NEXT: movl 56(%edx), %ebx
; X86-SSE42-NEXT: movl 60(%edx), %edx
; X86-SSE42-NEXT: movl (%ecx), %ecx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movups %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movups %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movups %xmm0, (%esp)
; X86-SSE42-NEXT: sarl $31, %edx
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: andl $63, %ecx
; X86-SSE42-NEXT: movups (%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm1
; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm2
; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm3
; X86-SSE42-NEXT: movups %xmm3, 48(%eax)
; X86-SSE42-NEXT: movups %xmm2, 32(%eax)
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $128, %esp
; X86-SSE42-NEXT: popl %esi
; X86-SSE42-NEXT: popl %edi
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: ashr_64bytes:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: pushl %ebx
; X86-AVX-NEXT: pushl %edi
; X86-AVX-NEXT: pushl %esi
; X86-AVX-NEXT: subl $128, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %ymm0
; X86-AVX-NEXT: vmovups 32(%edx), %xmm1
; X86-AVX-NEXT: movl 48(%edx), %esi
; X86-AVX-NEXT: movl 52(%edx), %edi
; X86-AVX-NEXT: movl 56(%edx), %ebx
; X86-AVX-NEXT: movl 60(%edx), %edx
; X86-AVX-NEXT: movl (%ecx), %ecx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovups %xmm1, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: vmovups %ymm0, (%esp)
; X86-AVX-NEXT: sarl $31, %edx
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: andl $63, %ecx
; X86-AVX-NEXT: vmovups (%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm1
; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm2
; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm3
; X86-AVX-NEXT: vmovups %xmm3, 48(%eax)
; X86-AVX-NEXT: vmovups %xmm2, 32(%eax)
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $128, %esp
; X86-AVX-NEXT: popl %esi
; X86-AVX-NEXT: popl %edi
; X86-AVX-NEXT: popl %ebx
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
  %src = load i512, ptr %src.ptr, align 1
  %byteOff = load i512, ptr %byteOff.ptr, align 1
  %bitOff = shl i512 %byteOff, 3
  %res = ashr i512 %src, %bitOff
  store i512 %res, ptr %dst, align 1
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FALLBACK10: {{.*}}
; FALLBACK11: {{.*}}
; FALLBACK12: {{.*}}
; FALLBACK13: {{.*}}
; FALLBACK14: {{.*}}
; FALLBACK15: {{.*}}
; FALLBACK16: {{.*}}
; FALLBACK17: {{.*}}
; FALLBACK18: {{.*}}
; FALLBACK19: {{.*}}
; FALLBACK2: {{.*}}
; FALLBACK20: {{.*}}
; FALLBACK21: {{.*}}
; FALLBACK22: {{.*}}
; FALLBACK23: {{.*}}
; FALLBACK24: {{.*}}
; FALLBACK25: {{.*}}
; FALLBACK26: {{.*}}
; FALLBACK27: {{.*}}
; FALLBACK28: {{.*}}
; FALLBACK29: {{.*}}
; FALLBACK3: {{.*}}
; FALLBACK30: {{.*}}
; FALLBACK31: {{.*}}