clarify the purpose of this project
[nyanglibc.git] / mathvec / svml_s_sincosf16_core.s
blob5603b272db5aeaed8c46ab89366d2aa5960e44d3
/* WRAPPER_IMPL_SSE2 callee
   Build a 4 x float vector routine out of a scalar one: callee is
   invoked once per element of xmm0 and the four scalar results are
   returned packed in xmm0.

   Frame (40 bytes; with rsp at 8 mod 16 on entry this leaves rsp
   16-byte aligned for the movaps and for every call):
      0..15(%rsp)   copy of the input vector
     16..31(%rsp)   scalar results r0..r3

   Scratch: xmm1, xmm2, plus whatever callee clobbers.  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	movaps	%xmm0, (%rsp)		# inputs must survive the calls

	call	\callee			# element 0 is already the low float of xmm0
	movss	%xmm0, 16(%rsp)

	.irp	lane, 1, 2, 3
	movss	(4*\lane)(%rsp), %xmm0	# next input element
	call	\callee
	movss	%xmm0, (16+4*\lane)(%rsp)
	.endr

	/* xmm0 still holds r3 here; interleave the four results.  */
	movss	16(%rsp), %xmm1		# r0
	movss	24(%rsp), %xmm2		# r2
	unpcklps %xmm2, %xmm1		# xmm1 = r0, r2
	movss	20(%rsp), %xmm2		# r1
	unpcklps %xmm0, %xmm2		# xmm2 = r1, r3
	unpcklps %xmm2, %xmm1		# xmm1 = r0, r1, r2, r3
	movaps	%xmm1, %xmm0

	addq	$40, %rsp
	ret
.endm
/* WRAPPER_IMPL_SSE2_ff callee
   Two-input flavour of the scalar-to-vector wrapper: callee is invoked
   once per element with the matching floats of xmm0 and xmm1 as its
   first and second argument; the four results come back packed in xmm0.

   Frame (56 bytes; with rsp at 8 mod 16 on entry this leaves rsp
   16-byte aligned for the movaps stores and for every call):
      0..15(%rsp)   copy of the first input vector
     16..31(%rsp)   copy of the second input vector
     32..47(%rsp)   scalar results r0..r3

   Scratch: xmm1, xmm2, plus whatever callee clobbers.  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	movaps	%xmm0, (%rsp)
	movaps	%xmm1, 16(%rsp)

	call	\callee			# element 0 of both inputs is already in place
	movss	%xmm0, 32(%rsp)

	.irp	lane, 1, 2, 3
	movss	(4*\lane)(%rsp), %xmm0	# first argument, this element
	movss	(16+4*\lane)(%rsp), %xmm1	# second argument, this element
	call	\callee
	movss	%xmm0, (32+4*\lane)(%rsp)
	.endr

	/* xmm0 still holds r3 here; interleave the four results.  */
	movss	32(%rsp), %xmm1		# r0
	movss	40(%rsp), %xmm2		# r2
	unpcklps %xmm2, %xmm1		# xmm1 = r0, r2
	movss	36(%rsp), %xmm2		# r1
	unpcklps %xmm0, %xmm2		# xmm2 = r1, r3
	unpcklps %xmm2, %xmm1		# xmm1 = r0, r1, r2, r3
	movaps	%xmm1, %xmm0

	addq	$56, %rsp
	ret
.endm
/* WRAPPER_IMPL_SSE2_fFF callee
   Wrap a scalar routine of the shape callee(float x, float *a, float *b)
   into a 4-element version taking the inputs in xmm0 and two output
   arrays in rdi and rsi.  For each element i the scalar routine is
   called with temporaries on the stack, then the temporaries are copied
   to rdi[i] and rsi[i].

   Register roles:
     rbp = first output array  (incoming rdi), callee-saved so it
           survives the calls
     rbx = second output array (incoming rsi), likewise

   Frame (two pushes + 40 bytes; with rsp at 8 mod 16 on entry this
   leaves rsp 16-byte aligned at every call):
      0..15(%rsp)   copy of the input vector
     24(%rsp)       scalar temporary for the second output
     28(%rsp)       scalar temporary for the first output

   The macro ends with ret so an expansion is a complete function body,
   matching WRAPPER_IMPL_SSE2 and WRAPPER_IMPL_SSE2_ff.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	pushq	%rbx
	movq	%rdi, %rbp
	movq	%rsi, %rbx
	subq	$40, %rsp
	movaps	%xmm0, (%rsp)		# inputs must survive the calls

	.irp	lane, 0, 1, 2, 3
	movss	(4*\lane)(%rsp), %xmm0	# scalar input for this element
	leaq	28(%rsp), %rdi		# rdi/rsi are caller-saved: reload each time
	leaq	24(%rsp), %rsi
	call	\callee
	movss	28(%rsp), %xmm0
	movss	%xmm0, (4*\lane)(%rbp)	# first output, element i
	movss	24(%rsp), %xmm0
	movss	%xmm0, (4*\lane)(%rbx)	# second output, element i
	.endr

	addq	$40, %rsp
	popq	%rbx
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX callee
   Wrap a 128-bit (xmm in, xmm out) routine into a 256-bit one: callee
   runs on the low half of ymm0, then on the high half, and the two xmm
   results are glued back together into ymm0.

   Frame (rbp-based, rsp realigned to 32 bytes):
      0..15(%rsp)   high half of the input
     16..31(%rsp)   result for the low half

   The macro ends with ret so an expansion is a complete function body,
   matching the SSE2 wrappers.  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	subq	$32, %rsp
	vextractf128 $1, %ymm0, (%rsp)	# park the high half for the second call
	vzeroupper			# upper halves cleared before calling 128-bit code
	call	\callee			# low half is already in xmm0
	vmovaps	%xmm0, 16(%rsp)
	vmovaps	(%rsp), %xmm0
	call	\callee			# high half
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# ymm0 = low result : high result
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX_ff callee
   Two-input flavour of the 128-bit to 256-bit wrapper: callee runs on
   the low halves of ymm0 and ymm1, then on their high halves, and the
   two xmm results are glued back together into ymm0.

   Frame (rbp-based, rsp realigned to 32 bytes):
      0..15(%rsp)   high half of the second input (ymm1)
     16..31(%rsp)   high half of the first input  (ymm0)
     32..47(%rsp)   result for the low halves

   The macro ends with ret so an expansion is a complete function body,
   matching the SSE2 wrappers.  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	subq	$64, %rsp
	vextractf128 $1, %ymm0, 16(%rsp)
	vextractf128 $1, %ymm1, (%rsp)
	vzeroupper			# upper halves cleared before calling 128-bit code
	call	\callee			# low halves are already in xmm0 and xmm1
	vmovaps	%xmm0, 32(%rsp)
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	\callee			# high halves
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# ymm0 = low result : high result
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX_fFF callee
   Wrap a 128-bit routine of the shape callee(xmm0, rdi, rsi), which
   writes its two results through rdi and rsi, into a 256-bit version.

   The low half is handled by calling callee with the incoming rdi and
   rsi untouched, so it writes straight into the caller arrays.  The
   high half is computed into stack temporaries and then copied to byte
   offset 16 of each caller array.

   Register roles:
     r13 = first output array  (incoming rdi)
     r14 = second output array (incoming rsi)
   Both are callee-saved and are pushed/popped around the body.

   Frame (rbp-based; rsp realigned to 32 bytes, then two pushes and
   48 bytes, which keeps rsp 32-byte aligned):
      0..31(%rsp)   input vector; reused afterwards as the two 16-byte
                    temporaries for the high-half results
     32..47(%rsp)   copy of the high half of the input

   The macro ends with ret so an expansion is a complete function body,
   matching the SSE2 wrappers.  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	pushq	%r13
	pushq	%r14
	subq	$48, %rsp
	movq	%rsi, %r14
	vmovaps	%ymm0, (%rsp)
	movq	%rdi, %r13
	vmovaps	16(%rsp), %xmm1		# high half of the input ...
	vmovaps	%xmm1, 32(%rsp)		# ... moved clear of the temporaries
	vzeroupper			# upper halves cleared before calling 128-bit code
	vmovaps	(%rsp), %xmm0
	call	\callee			# low half: results go directly to rdi/rsi
	vmovaps	32(%rsp), %xmm0
	lea	(%rsp), %rdi		# high half: results go to stack temporaries
	lea	16(%rsp), %rsi
	call	\callee
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	/* NOTE(review): aligned stores; these fault unless the caller
	   arrays are 16-byte aligned.  Confirm callers guarantee that.  */
	vmovaps	%xmm0, 16(%r13)
	vmovaps	%xmm1, 16(%r14)
	addq	$48, %rsp
	popq	%r14
	popq	%r13
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX512 callee
   Wrap a 256-bit (ymm in, ymm out) routine into a 512-bit one: callee
   runs on the low half of zmm0, then on the high half, and the two ymm
   results are reloaded together as zmm0.

   Frame (rbp-based, rsp realigned to 64 bytes):
      0..63(%rsp)    copy of the input vector
     64..127(%rsp)   low-half result followed by high-half result

   The macro ends with ret so an expansion is a complete function body,
   matching the SSE2 wrappers.  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	subq	$128, %rsp
	vmovups	%zmm0, (%rsp)
	vmovupd	(%rsp), %ymm0
	call	\callee			# low half
	vmovupd	%ymm0, 64(%rsp)
	vmovupd	32(%rsp), %ymm0
	call	\callee			# high half
	vmovupd	%ymm0, 96(%rsp)
	vmovups	64(%rsp), %zmm0		# both results as one 512-bit value
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX512_ff callee
   Two-input flavour of the 256-bit to 512-bit wrapper: callee runs on
   the low halves of zmm0 and zmm1, then on their high halves, and the
   two ymm results are reloaded together as zmm0.

   Frame (rbp-based, rsp realigned to 64 bytes):
       0..63(%rsp)    copy of the first input  (zmm0)
      64..127(%rsp)   copy of the second input (zmm1)
     128..191(%rsp)   low-half result followed by high-half result

   The macro ends with ret so an expansion is a complete function body,
   matching the SSE2 wrappers.  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	%zmm0, (%rsp)
	vmovups	%zmm1, 64(%rsp)
	vmovups	(%rsp), %ymm0
	vmovups	64(%rsp), %ymm1
	call	\callee			# low halves
	vmovups	%ymm0, 128(%rsp)
	vmovups	32(%rsp), %ymm0
	vmovups	96(%rsp), %ymm1
	call	\callee			# high halves
	vmovups	%ymm0, 160(%rsp)
	vmovups	128(%rsp), %zmm0	# both results as one 512-bit value
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* WRAPPER_IMPL_AVX512_fFF callee
   Wrap a 256-bit routine of the shape callee(ymm0, rdi, rsi), which
   writes its two results through rdi and rsi, into a 512-bit version.

   The low half is handled by calling callee with the incoming rdi and
   rsi untouched, so it writes straight into the caller arrays.  The
   high half is computed into stack temporaries and then copied to byte
   offset 32 of each caller array.

   Register roles:
     r12 = first output array  (incoming rdi)
     r13 = second output array (incoming rsi)
   Both are callee-saved and are pushed/popped around the body.

   Frame (rbp-based; rsp realigned to 64 bytes, then two pushes and
   176 bytes, which keeps rsp 64-byte aligned):
      0..63(%rsp)    copy of the input vector
     64..95(%rsp)    temporary for the first output, high half
     96..127(%rsp)   temporary for the second output, high half

   The macro ends with ret so an expansion is a complete function body;
   without it the expanded function would run on into whatever code
   follows it in the section.  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	pushq	%r12
	pushq	%r13
	subq	$176, %rsp
	movq	%rsi, %r13
	vmovaps	%zmm0, (%rsp)
	movq	%rdi, %r12
	vmovaps	(%rsp), %ymm0
	call	\callee			# low half: results go directly to rdi/rsi
	vmovaps	32(%rsp), %ymm0
	lea	64(%rsp), %rdi		# high half: results go to stack temporaries
	lea	96(%rsp), %rsi
	call	\callee
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	/* NOTE(review): aligned stores; these fault unless the caller
	   arrays are 32-byte aligned.  Confirm callers guarantee that.  */
	vmovaps	%ymm0, 32(%r12)
	vmovaps	%ymm1, 32(%r13)
	addq	$176, %rsp
	popq	%r13
	popq	%r12
	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* _ZGVeN16vl4l4_sincosf: 16-element entry point, produced by expanding
   WRAPPER_IMPL_AVX512_fFF around the 8-element routine
   _ZGVdN8vl4l4_sincosf.  Input vector in zmm0, output arrays in rdi
   and rsi (see the macro for the exact register and stack usage).  */
	.text
	.p2align 4			# 16-byte alignment, same as .align 1<<4
	.globl	_ZGVeN16vl4l4_sincosf
	.type	_ZGVeN16vl4l4_sincosf, @function
_ZGVeN16vl4l4_sincosf:
	WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
	.size	_ZGVeN16vl4l4_sincosf, . - _ZGVeN16vl4l4_sincosf
/* WRAPPER_IMPL_AVX512_fFF_vvv callee
   512-bit wrapper for the all-vector calling form: zmm0 carries the 16
   float inputs, zmm1:zmm2 carry 16 destination pointers for the first
   output and zmm3:zmm4 carry 16 destination pointers for the second
   output (presumably sine and cosine respectively, going by the
   function names - confirm).

   callee is a 256-bit routine of the shape callee(ymm0, rdi, rsi) that
   writes 8 floats through each pointer.  It is run on each half of the
   input with stack buffers as targets, and the 32 buffered floats are
   then scattered one by one through the saved pointers.

   Frame (rbp-based, rsp realigned to 64 bytes, 448 bytes):
       0..63(%rsp)    first-output buffer, 16 floats
      64..127(%rsp)   second-output buffer, 16 floats
     128..255(%rsp)   16 pointers for the first output
     256..383(%rsp)   16 pointers for the second output
     384..447(%rsp)   copy of the input vector

   With that layout the float at byte offset 4*k is paired with the
   pointer at byte offset 128 + 8*k for k = 0..31, which is what the
   scatter loop below relies on.

   Scratch: rax, rcx, rdi, rsi, plus whatever callee clobbers.

   The macro ends with ret so an expansion is a complete function body;
   without it the expanded function would run off its end.  */
.macro WRAPPER_IMPL_AVX512_fFF_vvv callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	subq	$448, %rsp

	vmovups	%zmm0, 384(%rsp)	# input, needed again for the high half
	lea	(%rsp), %rdi
	vmovups	%zmm1, 128(%rdi)	# destination pointers: the calls may
	vmovups	%zmm2, 192(%rdi)	# clobber the vector registers
	vmovups	%zmm3, 256(%rdi)
	vmovups	%zmm4, 320(%rdi)
	lea	64(%rsp), %rsi
	call	\callee			# low half: ymm0 still holds elements 0..7

	vmovdqu	416(%rsp), %ymm0	# high half of the saved input
	lea	32(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	\callee

	/* Scatter: slots 0..15 are the first output, 16..31 the second.  */
	.irp	slot, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
	movq	(128+8*\slot)(%rsp), %rax	# destination pointer
	movl	(4*\slot)(%rsp), %ecx	# buffered float, moved as raw bits
	movl	%ecx, (%rax)
	.endr

	movq	%rbp, %rsp
	popq	%rbp
	ret
.endm
/* _ZGVeN16vvv_sincosf: 16-element entry point for the all-vector
   calling form, produced by expanding WRAPPER_IMPL_AVX512_fFF_vvv
   around the 8-element routine _ZGVdN8vl4l4_sincosf (see the macro for
   the exact register and stack usage).  */
	.p2align 4			# 16-byte alignment, same as .align 1<<4
	.globl	_ZGVeN16vvv_sincosf
	.type	_ZGVeN16vvv_sincosf, @function
_ZGVeN16vvv_sincosf:
	WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf
	.size	_ZGVeN16vvv_sincosf, . - _ZGVeN16vvv_sincosf