/* Purpose: AVX wrapper for the 8-wide single-precision vectorized sincosf
   (libmvec), implemented by calling the 4-wide SSE variant twice.
   Origin: nyanglibc.git, mathvec/svml_s_sincosf8_core_avx.s
   (gitweb blob 0f5943003abf370052efb726e71302b42a4e3ade).  */
/* WRAPPER_IMPL_SSE2 callee
   Vectorize a scalar float->float CALLEE over 4 packed floats.
   In:  xmm0 = 4 input lanes.  Out: xmm0 = 4 result lanes.
   Stack: subq $40 keeps rsp 16-byte aligned at each call
   (entry rsp%16 == 8; 8+40 == 48 is a multiple of 16).  */
.macro WRAPPER_IMPL_SSE2 callee
	subq	$40, %rsp
	movaps	%xmm0, (%rsp)		# spill the 4 input lanes
	call	\callee			# lane 0 (already in xmm0 low)
	movss	%xmm0, 16(%rsp)		# r0
	movss	4(%rsp), %xmm0
	call	\callee			# lane 1
	movss	%xmm0, 20(%rsp)		# r1
	movss	8(%rsp), %xmm0
	call	\callee			# lane 2
	movss	%xmm0, 24(%rsp)		# r2
	movss	12(%rsp), %xmm0
	call	\callee			# lane 3; result r3 stays in xmm0
	movss	16(%rsp), %xmm3		# r0
	movss	20(%rsp), %xmm2		# r1
	movss	24(%rsp), %xmm1		# r2
	movss	%xmm0, 28(%rsp)
	unpcklps %xmm1, %xmm3		# xmm3 = {r0, r2, ...}
	unpcklps %xmm0, %xmm2		# xmm2 = {r1, r3, ...}
	unpcklps %xmm2, %xmm3		# xmm3 = {r0, r1, r2, r3}
	movaps	%xmm3, %xmm0
	addq	$40, %rsp
	ret
.endm
/* WRAPPER_IMPL_SSE2_ff callee
   Vectorize a scalar (float, float)->float CALLEE over 4 lane pairs.
   In:  xmm0/xmm1 = 4 lanes of each argument.  Out: xmm0 = 4 results.
   subq $56 keeps rsp 16-byte aligned at each call (8+56 == 64).  */
.macro WRAPPER_IMPL_SSE2_ff callee
	subq	$56, %rsp
	movaps	%xmm0, (%rsp)		# spill first-argument lanes
	movaps	%xmm1, 16(%rsp)		# spill second-argument lanes
	call	\callee			# lane 0 (args already in place)
	movss	%xmm0, 32(%rsp)		# r0
	movss	4(%rsp), %xmm0
	movss	20(%rsp), %xmm1
	call	\callee			# lane 1
	movss	%xmm0, 36(%rsp)		# r1
	movss	8(%rsp), %xmm0
	movss	24(%rsp), %xmm1
	call	\callee			# lane 2
	movss	%xmm0, 40(%rsp)		# r2
	movss	12(%rsp), %xmm0
	movss	28(%rsp), %xmm1
	call	\callee			# lane 3; r3 stays in xmm0
	movss	32(%rsp), %xmm3		# r0
	movss	36(%rsp), %xmm2		# r1
	movss	40(%rsp), %xmm1		# r2
	movss	%xmm0, 44(%rsp)
	unpcklps %xmm1, %xmm3		# xmm3 = {r0, r2, ...}
	unpcklps %xmm0, %xmm2		# xmm2 = {r1, r3, ...}
	unpcklps %xmm2, %xmm3		# xmm3 = {r0, r1, r2, r3}
	movaps	%xmm3, %xmm0
	addq	$56, %rsp
	ret
.endm
/* WRAPPER_IMPL_SSE2_fFF callee
   Vectorize a scalar sincos-style CALLEE (float x, float *out1,
   float *out2) over 4 lanes.
   In:  xmm0 = 4 inputs; rdi/rsi = destination arrays (4 floats each).
   Per lane: CALLEE writes to scratch slots 28(%rsp)/24(%rsp), which
   are then copied out to (%rbp)/( %rbx) -- the saved rdi/rsi.
   Note: the terminal RET was dropped by the scrape (orig line 100)
   and is restored here, matching the canonical glibc wrapper.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	pushq	%rbx
	movq	%rdi, %rbp		# rbp = first output array
	movq	%rsi, %rbx		# rbx = second output array
	subq	$40, %rsp		# 8+8+8+40 == 64: calls stay 16-aligned
	leaq	24(%rsp), %rsi		# scratch for CALLEE's 2nd output
	leaq	28(%rsp), %rdi		# scratch for CALLEE's 1st output
	movaps	%xmm0, (%rsp)		# spill the 4 input lanes
	call	\callee			# lane 0
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	movss	%xmm0, 0(%rbp)		# out1[0]
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)		# out2[0]
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0	# broadcast input lane 1
	call	\callee			# lane 1
	movss	28(%rsp), %xmm0
	leaq	24(%rsp), %rsi
	movss	%xmm0, 4(%rbp)		# out1[1]
	leaq	28(%rsp), %rdi
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)		# out2[1]
	movaps	%xmm1, %xmm0
	unpckhps %xmm1, %xmm0		# move input lane 2 into element 0
	call	\callee			# lane 2
	movaps	(%rsp), %xmm1
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	shufps	$255, %xmm1, %xmm1	# broadcast input lane 3
	movss	%xmm0, 8(%rbp)		# out1[2]
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)		# out2[2]
	movaps	%xmm1, %xmm0
	call	\callee			# lane 3
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)		# out1[3]
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)		# out2[3]
	addq	$40, %rsp
	popq	%rbx
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
/* WRAPPER_IMPL_AVX callee
   Build an 8-wide (ymm0) float function from a 4-wide (xmm0) CALLEE:
   run CALLEE on the low half, then on the spilled high half, and
   reassemble the ymm result.  vzeroupper avoids SSE/AVX transition
   penalties before calling SSE code.
   Terminal RET restored (dropped by scrape, orig line 118).  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp		# 32-byte align for vmovaps spills
	subq	$32, %rsp
	vextractf128 $1, %ymm0, (%rsp)	# spill high 4 lanes
	vzeroupper
	call	\callee			# low half (already in xmm0)
	vmovaps	%xmm0, 16(%rsp)		# save low-half result
	vmovaps	(%rsp), %xmm0
	call	\callee			# high half
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# ymm0 = {low, high}
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
/* WRAPPER_IMPL_AVX_ff callee
   Two-argument variant of WRAPPER_IMPL_AVX: run the 4-wide CALLEE on
   the low halves of ymm0/ymm1, then on their spilled high halves.
   Terminal RET restored (dropped by scrape, orig line 138).  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	subq	$64, %rsp
	vextractf128 $1, %ymm0, 16(%rsp)	# high lanes of arg 1
	vextractf128 $1, %ymm1, (%rsp)		# high lanes of arg 2
	vzeroupper
	call	\callee			# low halves (already in xmm0/xmm1)
	vmovaps	%xmm0, 32(%rsp)		# save low-half result
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	\callee			# high halves
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# ymm0 = {low, high}
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
/* WRAPPER_IMPL_AVX_fFF callee
   8-wide sincos-style wrapper: CALLEE is the 4-wide
   (xmm0, float *out1, float *out2) variant.
   In:  ymm0 = 8 inputs; rdi/rsi = destination arrays (8 floats each).
   First call passes the caller's rdi/rsi through unchanged (low 4
   lanes written directly); second call writes the high 4 lanes to
   stack scratch, then copies them to offset 16 of each array.
   Terminal RET restored (dropped by scrape, orig line 168).  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp		# 32-byte align for ymm spill
	pushq	%r13
	pushq	%r14
	subq	$48, %rsp
	movq	%rsi, %r14		# save second output pointer
	vmovaps	%ymm0, (%rsp)		# spill all 8 input lanes
	movq	%rdi, %r13		# save first output pointer
	vmovaps	16(%rsp), %xmm1		# high 4 lanes
	vmovaps	%xmm1, 32(%rsp)		# keep them past the first call
	vzeroupper
	vmovaps	(%rsp), %xmm0		# low 4 lanes
	call	\callee			# writes out1[0..3], out2[0..3]
	vmovaps	32(%rsp), %xmm0		# high 4 lanes
	lea	(%rsp), %rdi		# scratch for high out1
	lea	16(%rsp), %rsi		# scratch for high out2
	call	\callee
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm0, 16(%r13)		# out1[4..7]
	vmovaps	%xmm1, 16(%r14)		# out2[4..7]
	addq	$48, %rsp
	popq	%r14
	popq	%r13
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
/* WRAPPER_IMPL_AVX512 callee
   Build a 16-wide (zmm0) float function from an 8-wide (ymm0) CALLEE:
   spill zmm0, run CALLEE on each 32-byte half, reload the combined
   64-byte result into zmm0.
   Terminal RET restored (dropped by scrape, orig line 185).  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp		# 64-byte align for zmm spill
	subq	$128, %rsp
	vmovups	%zmm0, (%rsp)		# spill all 16 input lanes
	vmovupd	(%rsp), %ymm0		# low 8 lanes
	call	\callee
	vmovupd	%ymm0, 64(%rsp)		# low-half result
	vmovupd	32(%rsp), %ymm0		# high 8 lanes
	call	\callee
	vmovupd	%ymm0, 96(%rsp)		# high-half result
	vmovups	64(%rsp), %zmm0		# zmm0 = {low, high}
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
/* WRAPPER_IMPL_AVX512_ff callee
   Two-argument variant of WRAPPER_IMPL_AVX512: run the 8-wide CALLEE
   on the low halves of zmm0/zmm1, then on the high halves.
   Terminal RET restored (dropped by scrape, orig line 205).  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	%zmm0, (%rsp)		# spill arg 1
	vmovups	%zmm1, 64(%rsp)		# spill arg 2
	vmovups	(%rsp), %ymm0		# low halves
	vmovups	64(%rsp), %ymm1
	call	\callee
	vmovups	%ymm0, 128(%rsp)	# low-half result
	vmovups	32(%rsp), %ymm0		# high halves
	vmovups	96(%rsp), %ymm1
	call	\callee
	vmovups	%ymm0, 160(%rsp)	# high-half result
	vmovups	128(%rsp), %zmm0	# zmm0 = {low, high}
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
/* WRAPPER_IMPL_AVX512_fFF callee
   16-wide sincos-style wrapper over the 8-wide
   (ymm0, float *out1, float *out2) CALLEE.
   In:  zmm0 = 16 inputs; rdi/rsi = destination arrays (16 floats).
   First call passes the caller's rdi/rsi through (low 8 lanes written
   directly); second call uses stack scratch, then copies the high 8
   lanes to offset 32 of each array.
   Terminal RET restored (dropped by scrape, orig line 232).  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp		# 64-byte align for zmm spill
	pushq	%r12
	pushq	%r13
	subq	$176, %rsp		# 16+176 == 192: keeps 64-alignment
	movq	%rsi, %r13		# save second output pointer
	vmovaps	%zmm0, (%rsp)		# spill all 16 input lanes
	movq	%rdi, %r12		# save first output pointer
	vmovaps	(%rsp), %ymm0		# low 8 lanes
	call	\callee			# writes out1[0..7], out2[0..7]
	vmovaps	32(%rsp), %ymm0		# high 8 lanes
	lea	64(%rsp), %rdi		# scratch for high out1
	lea	96(%rsp), %rsi		# scratch for high out2
	call	\callee
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	vmovaps	%ymm0, 32(%r12)		# out1[8..15]
	vmovaps	%ymm1, 32(%r13)		# out2[8..15]
	addq	$176, %rsp
	popq	%r13
	popq	%r12
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
.text
	.globl	_ZGVcN8vl4l4_sincosf
	.type	_ZGVcN8vl4l4_sincosf,@function
	.align	1<<4
/* AVX (vector variant 'c') 8-wide sincosf with linear (vl4l4) output
   pointers; delegates to the SSE 4-wide variant via the AVX wrapper.
   NOTE(review): the scrape dropped two lines here (orig 239-240),
   presumably CFI directives -- not reconstructed.  */
_ZGVcN8vl4l4_sincosf:
	WRAPPER_IMPL_AVX_fFF _ZGVbN4vl4l4_sincosf
	.size	_ZGVcN8vl4l4_sincosf,.-_ZGVcN8vl4l4_sincosf
/* WRAPPER_IMPL_AVX_fFF_vvv callee
   8-wide sincos wrapper for the 'vvv' variant: instead of two linear
   arrays, the caller passes 16 individual output pointers -- 14 of
   them packed two-per-register in xmm1-xmm7, the last two on the
   caller's stack at 16(%rbp)/24(%rbp).
   CALLEE is the 4-wide (xmm0, float *out1, float *out2) variant; it is
   run on each half of ymm0 with stack scratch for its outputs, and the
   8+8 result floats are then scattered through the 16 pointers.
   Frame layout (rsp-relative): 0..31 out1 scratch, 32..63 out2
   scratch, 64..95 input spill, 96..207 the 14 spilled pointers.
   Terminal RET restored (dropped by scrape, orig line 315).  */
.macro WRAPPER_IMPL_AVX_fFF_vvv callee
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	subq	$224, %rsp
	vmovups	%ymm0, 64(%rsp)		# spill 8 input lanes
	lea	(%rsp), %rdi		# rdi doubles as base + 1st-call arg
	vmovdqu	%xmm1, 96(%rdi)		# out1 pointers 0-1
	vmovdqu	%xmm2, 112(%rdi)	# out1 pointers 2-3
	vmovdqu	%xmm3, 128(%rdi)	# out1 pointers 4-5
	vmovdqu	%xmm4, 144(%rdi)	# out1 pointers 6-7
	vmovdqu	%xmm5, 160(%rdi)	# out2 pointers 0-1
	lea	32(%rsp), %rsi		# rsi doubles as base + 1st-call arg
	vmovdqu	%xmm6, 144(%rsi)	# out2 pointers 2-3
	vmovdqu	%xmm7, 160(%rsi)	# out2 pointers 4-5
	vzeroupper
	call	\callee			# low 4 lanes -> 0..15 / 32..47
	vmovdqu	80(%rsp), %xmm0		# high 4 input lanes
	lea	16(%rsp), %rdi
	lea	48(%rsp), %rsi
	call	\callee			# high 4 lanes -> 16..31 / 48..63
	/* Scatter the 8 out1 results through pointers 96..155(%rsp)
	   and the 8 out2 results through 160..207(%rsp) plus the two
	   stack-passed pointers at 16/24(%rbp).  */
	movq	96(%rsp), %rdx
	movq	104(%rsp), %rsi
	movq	112(%rsp), %r8
	movq	120(%rsp), %r10
	movl	(%rsp), %eax
	movl	4(%rsp), %ecx
	movl	8(%rsp), %edi
	movl	12(%rsp), %r9d
	movl	%eax, (%rdx)		# out1[0]
	movl	%ecx, (%rsi)		# out1[1]
	movq	128(%rsp), %rax
	movq	136(%rsp), %rcx
	movl	%edi, (%r8)		# out1[2]
	movl	%r9d, (%r10)		# out1[3]
	movq	144(%rsp), %rdi
	movq	152(%rsp), %r9
	movl	16(%rsp), %r11d
	movl	20(%rsp), %edx
	movl	24(%rsp), %esi
	movl	28(%rsp), %r8d
	movl	%r11d, (%rax)		# out1[4]
	movl	%edx, (%rcx)		# out1[5]
	movq	160(%rsp), %r11
	movq	168(%rsp), %rdx
	movl	%esi, (%rdi)		# out1[6]
	movl	%r8d, (%r9)		# out1[7]
	movq	176(%rsp), %rsi
	movq	184(%rsp), %r8
	movl	32(%rsp), %r10d
	movl	36(%rsp), %eax
	movl	40(%rsp), %ecx
	movl	44(%rsp), %edi
	movl	%r10d, (%r11)		# out2[0]
	movl	%eax, (%rdx)		# out2[1]
	movq	192(%rsp), %r10
	movq	200(%rsp), %rax
	movl	%ecx, (%rsi)		# out2[2]
	movl	%edi, (%r8)		# out2[3]
	movq	16(%rbp), %rcx		# stack-passed out2 pointer 6
	movq	24(%rbp), %rdi		# stack-passed out2 pointer 7
	movl	48(%rsp), %r9d
	movl	52(%rsp), %r11d
	movl	56(%rsp), %edx
	movl	60(%rsp), %esi
	movl	%r9d, (%r10)		# out2[4]
	movl	%r11d, (%rax)		# out2[5]
	movl	%edx, (%rcx)		# out2[6]
	movl	%esi, (%rdi)		# out2[7]
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped by scrape)
.endm
.globl	_ZGVcN8vvv_sincosf
	.type	_ZGVcN8vvv_sincosf,@function
	.align	1<<4
/* AVX (vector variant 'c') 8-wide sincosf with vector-of-pointers
   (vvv) outputs; delegates to the SSE 4-wide linear variant.
   NOTE(review): the scrape dropped two lines here (orig 321-322),
   presumably CFI directives -- not reconstructed.  */
_ZGVcN8vvv_sincosf:
	WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN4vl4l4_sincosf
	.size	_ZGVcN8vvv_sincosf,.-_ZGVcN8vvv_sincosf