/* mathvec/svml_s_sincosf4_core.shared.s
   (nyanglibc.git, blob 572d08747d757e182ae89850dad748516a028cfb)
   Scalar-fallback wrapper implementations for vectorized sincosf. */
.macro WRAPPER_IMPL_SSE2 callee
	# 4-lane SSE2 wrapper: apply scalar float \callee to each of the
	# four lanes of %xmm0, return the packed results in %xmm0.
	# Entry %rsp % 16 == 8 (SysV), so the 40-byte frame keeps every
	# call site 16-byte aligned.
	# Fixed vs. dump: "\ callee" would not expand the macro argument.
	subq	$40, %rsp
	movaps	%xmm0, (%rsp)		# spill input; lanes at 0/4/8/12(%rsp)
	call	\callee@PLT		# lane 0 (already in %xmm0)
	movss	%xmm0, 16(%rsp)		# result 0
	movss	4(%rsp), %xmm0
	call	\callee@PLT		# lane 1
	movss	%xmm0, 20(%rsp)
	movss	8(%rsp), %xmm0
	call	\callee@PLT		# lane 2
	movss	%xmm0, 24(%rsp)
	movss	12(%rsp), %xmm0
	call	\callee@PLT		# lane 3; result stays in %xmm0
	movss	16(%rsp), %xmm3
	movss	20(%rsp), %xmm2
	movss	24(%rsp), %xmm1
	movss	%xmm0, 28(%rsp)
	unpcklps %xmm1, %xmm3		# xmm3 = {r0, r2, ...}
	unpcklps %xmm0, %xmm2		# xmm2 = {r1, r3, ...}
	unpcklps %xmm2, %xmm3		# xmm3 = {r0, r1, r2, r3}
	movaps	%xmm3, %xmm0		# return vector
	addq	$40, %rsp
	ret
.endm
.macro WRAPPER_IMPL_SSE2_ff callee
	# 4-lane SSE2 wrapper for a two-argument scalar \callee(x, y):
	# lane i of %xmm0/%xmm1 -> lane i of the result in %xmm0.
	# Entry %rsp % 16 == 8, so the 56-byte frame 16-aligns each call.
	# Fixed vs. dump: "\ callee" would not expand the macro argument.
	subq	$56, %rsp
	movaps	%xmm0, (%rsp)		# first operand vector
	movaps	%xmm1, 16(%rsp)		# second operand vector
	call	\callee@PLT		# lane 0 (args already in xmm0/xmm1)
	movss	%xmm0, 32(%rsp)
	movss	4(%rsp), %xmm0
	movss	20(%rsp), %xmm1
	call	\callee@PLT		# lane 1
	movss	%xmm0, 36(%rsp)
	movss	8(%rsp), %xmm0
	movss	24(%rsp), %xmm1
	call	\callee@PLT		# lane 2
	movss	%xmm0, 40(%rsp)
	movss	12(%rsp), %xmm0
	movss	28(%rsp), %xmm1
	call	\callee@PLT		# lane 3; result stays in %xmm0
	movss	32(%rsp), %xmm3
	movss	36(%rsp), %xmm2
	movss	40(%rsp), %xmm1
	movss	%xmm0, 44(%rsp)
	unpcklps %xmm1, %xmm3		# {r0, r2, ...}
	unpcklps %xmm0, %xmm2		# {r1, r3, ...}
	unpcklps %xmm2, %xmm3		# {r0, r1, r2, r3}
	movaps	%xmm3, %xmm0		# return vector
	addq	$56, %rsp
	ret
.endm
.macro WRAPPER_IMPL_SSE2_fFF callee
	# 4-lane wrapper for a sincos-style scalar \callee(x, p1, p2) that
	# writes one float through %rdi and one through %rsi.  Lane results
	# are stored to the caller's arrays at the original %rdi (kept in
	# %rbp) and %rsi (kept in %rbx).
	# Fixed vs. dump: "\ callee" garbling, and the terminating ret
	# (line 100 of the dump) was dropped.
	pushq	%rbp
	pushq	%rbx
	movq	%rdi, %rbp		# first output array
	movq	%rsi, %rbx		# second output array
	subq	$40, %rsp		# 2 pushes + 40 -> calls 16-aligned
	leaq	24(%rsp), %rsi		# scratch for callee's %rsi output
	leaq	28(%rsp), %rdi		# scratch for callee's %rdi output
	movaps	%xmm0, (%rsp)		# spill input vector
	call	\callee@PLT		# lane 0
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	movss	%xmm0, 0(%rbp)		# out1[0]
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, (%rbx)		# out2[0]
	movaps	%xmm1, %xmm0
	shufps	$85, %xmm1, %xmm0	# lane 1 -> element 0
	call	\callee@PLT		# lane 1
	movss	28(%rsp), %xmm0
	leaq	24(%rsp), %rsi
	movss	%xmm0, 4(%rbp)		# out1[1]
	leaq	28(%rsp), %rdi
	movaps	(%rsp), %xmm1
	movss	24(%rsp), %xmm0
	movss	%xmm0, 4(%rbx)		# out2[1]
	movaps	%xmm1, %xmm0
	unpckhps %xmm1, %xmm0		# lane 2 -> element 0
	call	\callee@PLT		# lane 2
	movaps	(%rsp), %xmm1
	leaq	24(%rsp), %rsi
	leaq	28(%rsp), %rdi
	movss	28(%rsp), %xmm0
	shufps	$255, %xmm1, %xmm1	# lane 3 -> element 0
	movss	%xmm0, 8(%rbp)		# out1[2]
	movss	24(%rsp), %xmm0
	movss	%xmm0, 8(%rbx)		# out2[2]
	movaps	%xmm1, %xmm0
	call	\callee@PLT		# lane 3
	movss	28(%rsp), %xmm0
	movss	%xmm0, 12(%rbp)		# out1[3]
	movss	24(%rsp), %xmm0
	movss	%xmm0, 12(%rbx)		# out2[3]
	addq	$40, %rsp
	popq	%rbx
	popq	%rbp
	ret				# restored (dropped in the dump)
.endm
.macro WRAPPER_IMPL_AVX callee
	# 8-lane AVX wrapper built on the 4-lane __GI_\callee: split %ymm0
	# into two 128-bit halves, call the SSE variant on each, reassemble.
	# Fixed vs. dump: terminating ret (line 118) was dropped.
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp		# 32-byte align the spill area
	subq	$32, %rsp
	vextractf128 $1, %ymm0, (%rsp)	# save high 128 bits
	vzeroupper			# clean upper state before SSE callee
	call	__GI_\callee		# low half (already in %xmm0)
	vmovaps	%xmm0, 16(%rsp)		# low-half result
	vmovaps	(%rsp), %xmm0
	call	__GI_\callee		# high half
	vmovaps	%xmm0, %xmm1
	vmovaps	16(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# rebuild 256-bit result
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped in the dump)
.endm
.macro WRAPPER_IMPL_AVX_ff callee
	# 8-lane AVX wrapper for a two-argument __GI_\callee: halve both
	# %ymm0 and %ymm1, call the 4-lane variant twice, reassemble.
	# Fixed vs. dump: terminating ret (line 138) was dropped.
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	subq	$64, %rsp
	vextractf128 $1, %ymm0, 16(%rsp)	# high half of arg 0
	vextractf128 $1, %ymm1, (%rsp)		# high half of arg 1
	vzeroupper
	call	__GI_\callee		# low halves (already in xmm0/xmm1)
	vmovaps	%xmm0, 32(%rsp)		# low-half result
	vmovaps	16(%rsp), %xmm0
	vmovaps	(%rsp), %xmm1
	call	__GI_\callee		# high halves
	vmovaps	%xmm0, %xmm1
	vmovaps	32(%rsp), %xmm0
	vinsertf128 $1, %xmm1, %ymm0, %ymm0	# rebuild 256-bit result
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped in the dump)
.endm
.macro WRAPPER_IMPL_AVX_fFF callee
	# 8-lane AVX wrapper for a sincos-style __GI_\callee(x, p1, p2).
	# First call keeps the caller's %rdi/%rsi, so lanes 0-3 are written
	# straight into the output arrays; the high-half results go to
	# scratch and are then copied to offsets 16(%r13)/16(%r14).
	# Fixed vs. dump: terminating ret (line 168) was dropped.
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-32, %rsp
	pushq	%r13
	pushq	%r14
	subq	$48, %rsp
	movq	%rsi, %r14		# second output array
	vmovaps	%ymm0, (%rsp)		# spill 8-float input (32-aligned)
	movq	%rdi, %r13		# first output array
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm1, 32(%rsp)		# keep high half in its own slot
	vzeroupper
	vmovaps	(%rsp), %xmm0		# low half; caller's rdi/rsi intact
	call	__GI_\callee		# writes lanes 0-3 via rdi/rsi
	vmovaps	32(%rsp), %xmm0		# high half
	lea	(%rsp), %rdi		# scratch for high-half out1
	lea	16(%rsp), %rsi		# scratch for high-half out2
	call	__GI_\callee
	vmovaps	(%rsp), %xmm0
	vmovaps	16(%rsp), %xmm1
	vmovaps	%xmm0, 16(%r13)		# out1[4..7]
	vmovaps	%xmm1, 16(%r14)		# out2[4..7]
	addq	$48, %rsp
	popq	%r14
	popq	%r13
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped in the dump)
.endm
.macro WRAPPER_IMPL_AVX512 callee
	# 16-lane AVX-512 wrapper built on the 8-lane __GI_\callee:
	# split %zmm0 into two 256-bit halves, call twice, reassemble.
	# Fixed vs. dump: terminating ret (line 185) was dropped.
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp		# 64-byte align the spill area
	subq	$128, %rsp
	vmovups	%zmm0, (%rsp)		# spill full 512-bit input
	vmovupd	(%rsp), %ymm0		# low 256 bits
	call	__GI_\callee
	vmovupd	%ymm0, 64(%rsp)		# low-half result
	vmovupd	32(%rsp), %ymm0		# high 256 bits
	call	__GI_\callee
	vmovupd	%ymm0, 96(%rsp)		# high-half result
	vmovups	64(%rsp), %zmm0		# rebuild 512-bit result
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped in the dump)
.endm
.macro WRAPPER_IMPL_AVX512_ff callee
	# 16-lane AVX-512 wrapper for a two-argument __GI_\callee: halve
	# both %zmm0 and %zmm1, call the 8-lane variant twice, reassemble.
	# Fixed vs. dump: terminating ret (line 205) was dropped.
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	subq	$192, %rsp
	vmovups	%zmm0, (%rsp)		# spill arg 0
	vmovups	%zmm1, 64(%rsp)		# spill arg 1
	vmovups	(%rsp), %ymm0		# low halves
	vmovups	64(%rsp), %ymm1
	call	__GI_\callee
	vmovups	%ymm0, 128(%rsp)	# low-half result
	vmovups	32(%rsp), %ymm0		# high halves
	vmovups	96(%rsp), %ymm1
	call	__GI_\callee
	vmovups	%ymm0, 160(%rsp)	# high-half result
	vmovups	128(%rsp), %zmm0	# rebuild 512-bit result
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped in the dump)
.endm
.macro WRAPPER_IMPL_AVX512_fFF callee
	# 16-lane AVX-512 wrapper for a sincos-style __GI_\callee(x, p1, p2).
	# First call keeps the caller's %rdi/%rsi (lanes 0-7 written
	# directly); high-half results go to scratch, then to offsets
	# 32(%r12)/32(%r13).
	# Fixed vs. dump: terminating ret (line 232) was dropped.
	pushq	%rbp
	movq	%rsp, %rbp
	andq	$-64, %rsp
	pushq	%r12
	pushq	%r13
	subq	$176, %rsp
	movq	%rsi, %r13		# second output array
	vmovaps	%zmm0, (%rsp)		# spill input (64-aligned)
	movq	%rdi, %r12		# first output array
	vmovaps	(%rsp), %ymm0		# low half; caller's rdi/rsi intact
	call	__GI_\callee		# writes lanes 0-7 via rdi/rsi
	vmovaps	32(%rsp), %ymm0		# high half
	lea	64(%rsp), %rdi		# scratch for high-half out1
	lea	96(%rsp), %rsi		# scratch for high-half out2
	call	__GI_\callee
	vmovaps	64(%rsp), %ymm0
	vmovaps	96(%rsp), %ymm1
	vmovaps	%ymm0, 32(%r12)		# out1[8..15]
	vmovaps	%ymm1, 32(%r13)		# out2[8..15]
	addq	$176, %rsp
	popq	%r13
	popq	%r12
	movq	%rbp, %rsp
	popq	%rbp
	ret				# restored (dropped in the dump)
.endm
234 .text
235 .globl _ZGVbN4vl4l4_sincosf
236 .type _ZGVbN4vl4l4_sincosf,@function
237 .align 1<<4
238 _ZGVbN4vl4l4_sincosf:
241 WRAPPER_IMPL_SSE2_fFF sincosf
242 .size _ZGVbN4vl4l4_sincosf,.-_ZGVbN4vl4l4_sincosf
244 .globl __GI__ZGVbN4vl4l4_sincosf
245 .set __GI__ZGVbN4vl4l4_sincosf,_ZGVbN4vl4l4_sincosf
.macro WRAPPER_IMPL_SSE2_fFF_vvv callee
	# 4-lane wrapper for sincos-style \callee(x, p1, p2) where the
	# caller passes VECTORS of output pointers: four 64-bit pointers in
	# %xmm1:%xmm2 (first set) and four in %xmm3:%xmm4 (second set).
	# Per-lane results land in scratch at 0..15/16..31(%rsp), then are
	# scattered through the saved pointers.
	# Fixed vs. dump: "\ callee" garbling, and the terminating ret
	# (line 293) was dropped.
	subq	$120, %rsp
	movaps	%xmm0, 96(%rsp)		# input lanes at 96/100/104/108
	lea	(%rsp), %rdi		# lane-0 scratch for callee's out1
	movdqa	%xmm1, 32(%rdi)		# pointer set 1 -> 32..47(%rsp)
	lea	16(%rsp), %rsi		# lane-0 scratch for callee's out2
	movdqa	%xmm2, 32(%rsi)		# pointer set 1 -> 48..63(%rsp)
	movdqa	%xmm3, 48(%rsi)		# pointer set 2 -> 64..79(%rsp)
	movdqa	%xmm4, 64(%rsi)		# pointer set 2 -> 80..95(%rsp)
	call	\callee@PLT		# lane 0 (x already in %xmm0)
	movss	100(%rsp), %xmm0	# lane 1
	lea	4(%rsp), %rdi
	lea	20(%rsp), %rsi
	call	\callee@PLT
	movss	104(%rsp), %xmm0	# lane 2
	lea	8(%rsp), %rdi
	lea	24(%rsp), %rsi
	call	\callee@PLT
	movss	108(%rsp), %xmm0	# lane 3
	lea	12(%rsp), %rdi
	lea	28(%rsp), %rsi
	call	\callee@PLT
	# Scatter out1 results through pointer set 1 ...
	movq	32(%rsp), %rdx
	movq	40(%rsp), %rsi
	movq	48(%rsp), %r8
	movq	56(%rsp), %r10
	movl	(%rsp), %eax
	movl	4(%rsp), %ecx
	movl	8(%rsp), %edi
	movl	12(%rsp), %r9d
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	# ... and out2 results through pointer set 2.
	movq	64(%rsp), %rax
	movq	72(%rsp), %rcx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	80(%rsp), %rdi
	movq	88(%rsp), %r9
	movl	16(%rsp), %r11d
	movl	20(%rsp), %edx
	movl	24(%rsp), %esi
	movl	28(%rsp), %r8d
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	addq	$120, %rsp
	ret				# restored (dropped in the dump)
.endm
295 .globl _ZGVbN4vvv_sincosf
296 .type _ZGVbN4vvv_sincosf,@function
297 .align 1<<4
298 _ZGVbN4vvv_sincosf:
301 WRAPPER_IMPL_SSE2_fFF_vvv sincosf
302 .size _ZGVbN4vvv_sincosf,.-_ZGVbN4vvv_sincosf
304 .globl __GI__ZGVbN4vvv_sincosf
305 .set __GI__ZGVbN4vvv_sincosf,_ZGVbN4vvv_sincosf