clarify the purpose of this project
[nyanglibc.git] / mathvec / svml_s_sincosf8_core.shared.s
blob da9482a729f9eedfc85eee60e4a24d20cf346ed2
/* WRAPPER_IMPL_SSE2 callee
   4-lane single-precision SSE2 wrapper (SysV AMD64): apply the scalar
   routine \callee (float -> float, xmm0 in / xmm0 out) to each element
   of %xmm0 and repack the four results into %xmm0.
   Stack: 40 bytes of locals (input at 0..15, results at 16..31);
   8 + 40 = 48 keeps %rsp 16-byte aligned at every call site.  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	movaps	%xmm0, (%rsp)		# spill whole input vector
	call	\callee@PLT		# lane 0: input already in %xmm0
	movss	%xmm0, 16(%rsp)
	movss	4(%rsp), %xmm0
	call	\callee@PLT		# lane 1
	movss	%xmm0, 20(%rsp)
	movss	8(%rsp), %xmm0
	call	\callee@PLT		# lane 2
	movss	%xmm0, 24(%rsp)
	movss	12(%rsp), %xmm0
	call	\callee@PLT		# lane 3: result left in %xmm0
	movss	16(%rsp), %xmm3		# r0
	movss	20(%rsp), %xmm2		# r1
	movss	24(%rsp), %xmm1		# r2
	movss	%xmm0, 28(%rsp)		# r3 (kept in %xmm0 too)
	unpcklps %xmm1, %xmm3		# xmm3 = { r0, r2, ?, ? }
	unpcklps %xmm0, %xmm2		# xmm2 = { r1, r3, ?, ? }
	unpcklps %xmm2, %xmm3		# xmm3 = { r0, r1, r2, r3 }
	movaps	%xmm3, %xmm0
	addq	$40, %rsp
	ret
.endm
/* WRAPPER_IMPL_SSE2_ff callee
   4-lane two-argument wrapper: apply scalar \callee (float, float ->
   float; args in %xmm0/%xmm1) elementwise over the vectors in
   %xmm0/%xmm1 and repack the results into %xmm0.
   Stack: 56 bytes (x at 0..15, y at 16..31, results at 32..47);
   8 + 56 = 64 keeps %rsp 16-byte aligned at every call site.  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	movaps	%xmm0, (%rsp)		# spill first input vector
	movaps	%xmm1, 16(%rsp)		# spill second input vector
	call	\callee@PLT		# lane 0: args already in xmm0/xmm1
	movss	%xmm0, 32(%rsp)
	movss	4(%rsp), %xmm0
	movss	20(%rsp), %xmm1
	call	\callee@PLT		# lane 1
	movss	%xmm0, 36(%rsp)
	movss	8(%rsp), %xmm0
	movss	24(%rsp), %xmm1
	call	\callee@PLT		# lane 2
	movss	%xmm0, 40(%rsp)
	movss	12(%rsp), %xmm0
	movss	28(%rsp), %xmm1
	call	\callee@PLT		# lane 3: result left in %xmm0
	movss	32(%rsp), %xmm3		# r0
	movss	36(%rsp), %xmm2		# r1
	movss	40(%rsp), %xmm1		# r2
	movss	%xmm0, 44(%rsp)		# r3
	unpcklps %xmm1, %xmm3		# xmm3 = { r0, r2, ?, ? }
	unpcklps %xmm0, %xmm2		# xmm2 = { r1, r3, ?, ? }
	unpcklps %xmm2, %xmm3		# xmm3 = { r0, r1, r2, r3 }
	movaps	%xmm3, %xmm0
	addq	$56, %rsp
	ret
.endm
/* WRAPPER_IMPL_SSE2_fFF callee
   4-lane sincos-style wrapper: scalar \callee takes x in %xmm0 and two
   output pointers (%rdi, %rsi).  On entry %rdi/%rsi are the caller's
   two output arrays; the wrapper calls \callee per lane with scratch
   slots 28(%rsp)/24(%rsp) and copies each pair out to rdi-array[i] /
   rsi-array[i].  %rbp/%rbx (callee-saved) hold the array pointers.
   The lea pair is re-done before every call because %rdi/%rsi are
   caller-saved and clobbered by \callee.
   NOTE(review): the final `ret` was dropped by the source scrape (line
   100 missing in the original numbering); restored here.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	pushq	%rbx
	movq	%rdi, %rbp		# rbp = first output array
	movq	%rsi, %rbx		# rbx = second output array
	subq	$40, %rsp		# 8+8+8+40 = 64: calls stay 16-aligned
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movaps	%xmm0, (%rsp)		# spill input vector
	call	\callee@PLT		# lane 0
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	movss	%xmm0, 0(%rbp)
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0	# broadcast lane 1 into %xmm0
	call	\callee@PLT		# lane 1
	movss	28(%rsp), %xmm0
	leaq	24(%rsp), %rsi
	movss	%xmm0, 4(%rbp)
	leaq	28(%rsp), %rdi
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)
	movaps	%xmm1, %xmm0
	unpckhps %xmm1, %xmm0		# lane 2 into low element
	call	\callee@PLT		# lane 2
	movaps	(%rsp), %xmm1
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	shufps	$255, %xmm1, %xmm1	# broadcast lane 3
	movss	%xmm0, 8(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)
	movaps	%xmm1, %xmm0
	call	\callee@PLT		# lane 3
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)
	addq	$40, %rsp
	popq	%rbx
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX callee
   8-lane AVX wrapper built from a 4-lane SSE routine: split %ymm0 into
   halves, call the internal (__GI_-prefixed, non-PLT) 128-bit \callee
   on each, and reassemble the result in %ymm0.  vzeroupper is required
   before calling legacy-SSE code per the SysV ABI.
   NOTE(review): final `ret` restored (dropped by the source scrape).  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	movq	%rsp, %rbp		# frame pointer: restores pre-align rsp
	andq	$-32, %rsp		# 32-byte align for vextractf128 store
	subq	$32, %rsp
	vextractf128 $1, %ymm0, (%rsp)	# save high input half
	vzeroupper
	call	__GI_\callee		# low half (already in %xmm0)
	vmovaps	%xmm0, 16(%rsp)		# save low-half result
	vmovaps	(%rsp), %xmm0
	call	__GI_\callee		# high half
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# recombine halves
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX_ff callee
   8-lane two-argument AVX wrapper: split %ymm0/%ymm1 into 128-bit
   halves, call the internal 4-lane \callee on each half pair, and
   reassemble the result in %ymm0.
   NOTE(review): final `ret` restored (dropped by the source scrape).  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	subq	$64, %rsp
	vextractf128 $1, %ymm0, 16(%rsp)	# high half of x
	vextractf128 $1, %ymm1, (%rsp)		# high half of y
	vzeroupper
	call	__GI_\callee		# low halves (already in xmm0/xmm1)
	vmovaps	%xmm0, 32(%rsp)		# save low-half result
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	__GI_\callee		# high halves
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# recombine halves
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX_fFF callee
   8-lane sincos-style AVX wrapper: x in %ymm0, two output-array
   pointers in %rdi/%rsi.  The first call passes the caller's own
   pointers, so \callee writes the low 16 bytes of each array directly;
   the second call writes the high halves into stack scratch, which is
   then copied to array+16.  %r13/%r14 (callee-saved) keep the array
   pointers across calls.
   NOTE(review): final `ret` restored (dropped by the source scrape).  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	pushq	%r13
	pushq	%r14
	subq	$48, %rsp		# keeps calls 16-byte aligned
	movq	%rsi, %r14		# save second array pointer
	vmovaps	%ymm0, (%rsp)		# spill full input
	movq	%rdi, %r13		# save first array pointer
	vmovaps	16(%rsp), %xmm1		# high input half ...
	vmovaps	%xmm1, 32(%rsp)		# ... parked at 32(%rsp)
	vzeroupper
	vmovaps	(%rsp), %xmm0		# low input half
	call	__GI_\callee		# writes array[0..3] via rdi/rsi
	vmovaps	32(%rsp), %xmm0		# high input half
	lea	(%rsp), %rdi		# scratch outputs for high half
	lea	16(%rsp), %rsi
	call	__GI_\callee
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm0, 16(%r13)		# copy high results out
	vmovaps	%xmm1, 16(%r14)
	addq	$48, %rsp
	popq	%r14
	popq	%r13
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
170 .macro WRAPPER_IMPL_AVX512 callee
171 pushq %rbp
172 movq %rsp, %rbp
173 andq $-64, %rsp
174 subq $128, %rsp
175 vmovups %zmm0, (%rsp)
176 vmovupd (%rsp), %ymm0
177 call __GI_\callee
178 vmovupd %ymm0, 64(%rsp)
179 vmovupd 32(%rsp), %ymm0
180 call __GI_\callee
181 vmovupd %ymm0, 96(%rsp)
182 vmovups 64(%rsp), %zmm0
183 movq %rbp, %rsp
184 popq %rbp
186 .endm
/* WRAPPER_IMPL_AVX512_ff callee
   16-lane two-argument AVX-512 wrapper: spill %zmm0/%zmm1, call the
   internal 8-lane \callee on the low and high 256-bit half pairs,
   reload the combined result into %zmm0.
   NOTE(review): final `ret` restored (dropped by the source scrape).  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	subq	$192, %rsp		# x at 0..63, y at 64..127, r at 128..191
	vmovups	%zmm0, (%rsp)
	vmovups	%zmm1, 64(%rsp)
	vmovups	(%rsp), %ymm0		# low halves
	vmovups	64(%rsp), %ymm1
	call	__GI_\callee
	vmovups	%ymm0, 128(%rsp)
	vmovups	32(%rsp), %ymm0		# high halves
	vmovups	96(%rsp), %ymm1
	call	__GI_\callee
	vmovups	%ymm0, 160(%rsp)
	vmovups	128(%rsp), %zmm0	# recombine both results
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX512_fFF callee
   16-lane sincos-style AVX-512 wrapper: x in %zmm0, two output-array
   pointers in %rdi/%rsi.  First call passes the caller's pointers so
   \callee fills array[0..31] bytes directly; second call writes the
   high halves to stack scratch, copied out to array+32.  %r12/%r13
   (callee-saved) keep the pointers across calls.  After andq $-64 the
   two pushes plus subq $176 total 192 bytes, so (%rsp) stays 64-byte
   aligned for the vmovaps zmm spill.
   NOTE(review): final `ret` restored (dropped by the source scrape).  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	pushq	%r12
	pushq	%r13
	subq	$176, %rsp
	movq	%rsi, %r13		# save second array pointer
	vmovaps	%zmm0, (%rsp)		# spill full input
	movq	%rdi, %r12		# save first array pointer
	vmovaps	(%rsp), %ymm0		# low half
	call	__GI_\callee
	vmovaps	32(%rsp), %ymm0		# high half
	lea	64(%rsp), %rdi		# scratch outputs
	lea	96(%rsp), %rsi
	call	__GI_\callee
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	vmovaps	%ymm0, 32(%r12)		# copy high results out
	vmovaps	%ymm1, 32(%r13)
	addq	$176, %rsp
	popq	%r13
	popq	%r12
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
234 .text
235 .globl _ZGVdN8vl4l4_sincosf
236 .type _ZGVdN8vl4l4_sincosf,@function
237 .align 1<<4
238 _ZGVdN8vl4l4_sincosf:
241 WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf
242 .size _ZGVdN8vl4l4_sincosf,.-_ZGVdN8vl4l4_sincosf
244 .globl __GI__ZGVdN8vl4l4_sincosf
245 .set __GI__ZGVdN8vl4l4_sincosf,_ZGVdN8vl4l4_sincosf
/* WRAPPER_IMPL_AVX2_fFF_vvv callee
   8-lane sincos wrapper for the "vvv" variant: x in %ymm0, and the 16
   per-lane output POINTERS passed in vector registers — 8 qword
   pointers for the first result in %ymm1:%ymm2, 8 for the second in
   %ymm3:%ymm4.  Both 4-lane calls write to stack scratch (first
   results at 0..31, second at 32..63); the pointer vectors are spilled
   at 64..191 and the results are then scattered through them one lane
   at a time.  Input spilled at 192..223.
   NOTE(review): final `ret` restored (dropped by the source scrape).  */
.macro WRAPPER_IMPL_AVX2_fFF_vvv callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	subq	$224, %rsp
	vmovups	%ymm0, 192(%rsp)	# spill input
	lea	(%rsp), %rdi		# first-result scratch, lanes 0-3
	vmovdqu	%ymm1, 64(%rdi)		# spill the 16 output pointers
	vmovdqu	%ymm2, 96(%rdi)
	vmovdqu	%ymm3, 128(%rdi)
	vmovdqu	%ymm4, 160(%rdi)
	lea	32(%rsp), %rsi		# second-result scratch, lanes 0-3
	vzeroupper
	call	__GI_\callee		# low half (x already in %xmm0)
	vmovups	208(%rsp), %xmm0	# high input half
	lea	16(%rsp), %rdi
	lea	48(%rsp), %rsi
	call	__GI_\callee		# high half
	/* Scatter: interleaved pointer loads and stores to keep many
	   independent chains in flight.  */
	movq	64(%rsp), %rdx
	movq	72(%rsp), %rsi
	movq	80(%rsp), %r8
	movq	88(%rsp), %r10
	movl	(%rsp), %eax
	movl	4(%rsp), %ecx
	movl	8(%rsp), %edi
	movl	12(%rsp), %r9d
	movl	%eax, (%rdx)		# result1 lanes 0-3
	movl	%ecx, (%rsi)
	movq	96(%rsp), %rax
	movq	104(%rsp), %rcx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	112(%rsp), %rdi
	movq	120(%rsp), %r9
	movl	16(%rsp), %r11d
	movl	20(%rsp), %edx
	movl	24(%rsp), %esi
	movl	28(%rsp), %r8d
	movl	%r11d, (%rax)		# result1 lanes 4-7
	movl	%edx, (%rcx)
	movq	128(%rsp), %r11
	movq	136(%rsp), %rdx
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	144(%rsp), %rsi
	movq	152(%rsp), %r8
	movl	32(%rsp), %r10d
	movl	36(%rsp), %eax
	movl	40(%rsp), %ecx
	movl	44(%rsp), %edi
	movl	%r10d, (%r11)		# result2 lanes 0-3
	movl	%eax, (%rdx)
	movq	160(%rsp), %r10
	movq	168(%rsp), %rax
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movq	176(%rsp), %rcx
	movq	184(%rsp), %rdi
	movl	48(%rsp), %r9d
	movl	52(%rsp), %r11d
	movl	56(%rsp), %edx
	movl	60(%rsp), %esi
	movl	%r9d, (%r10)		# result2 lanes 4-7
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movl	%esi, (%rdi)
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* AVX2 8-lane vector sincosf, "vvv" variant (per-lane output pointers
   passed in vector registers), built on the 4-lane SSE core.
   NOTE(review): the scrape dropped original lines 320-321 and 324
   (unknown content, possibly blanks or unwind directives) — confirm
   against the upstream file.  */
	.globl	_ZGVdN8vvv_sincosf
	.type	_ZGVdN8vvv_sincosf,@function
	.align	1<<4
_ZGVdN8vvv_sincosf:
	WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN4vl4l4_sincosf
	.size	_ZGVdN8vvv_sincosf,.-_ZGVdN8vvv_sincosf
	/* Internal (non-PLT) alias for intra-libm calls.  */
	.globl	__GI__ZGVdN8vvv_sincosf
	.set	__GI__ZGVdN8vvv_sincosf,_ZGVdN8vvv_sincosf