clarify the purpose of this project
[nyanglibc.git] / mathvec / svml_s_powf8_core_avx.shared.s
blob1f51faef03ce5c73fbfd6b5691bce6af243fa8d4
# WRAPPER_IMPL_SSE2 callee
#
# Function body that applies callee to each of the four packed
# single-precision lanes of %xmm0, one call per lane.
#
# In:    %xmm0 = packed floats x0..x3
# Out:   %xmm0 = packed results r0..r3, same lane order
# Calls: callee via the PLT, four times, lane 0 first
# Stack: frame of 40 bytes
#          0..15(%rsp)  copy of the input vector
#         16..31(%rsp)  one 4-byte result slot per lane
#        movaps to (%rsp) needs %rsp 16-byte aligned after the subq, which
#        holds when the expansion is entered with the usual post-call
#        alignment (assumption about the caller, not checked here).
.macro WRAPPER_IMPL_SSE2 callee
        subq      $40, %rsp
        movaps    %xmm0, (%rsp)             # keep the input for lanes 1..3

        call      \ callee@PLT              # lane 0 is already the low element
        movss     %xmm0, 16(%rsp)

        movss     4(%rsp), %xmm0
        call      \ callee@PLT              # lane 1
        movss     %xmm0, 20(%rsp)

        movss     8(%rsp), %xmm0
        call      \ callee@PLT              # lane 2
        movss     %xmm0, 24(%rsp)

        movss     12(%rsp), %xmm0
        call      \ callee@PLT              # lane 3, result stays live in %xmm0
        movss     %xmm0, 28(%rsp)           # slot is written but never reloaded

        movss     16(%rsp), %xmm3           # r0
        movss     20(%rsp), %xmm2           # r1
        movss     24(%rsp), %xmm1           # r2
        unpcklps  %xmm1, %xmm3              # xmm3 = r0, r2
        unpcklps  %xmm0, %xmm2              # xmm2 = r1, r3
        unpcklps  %xmm2, %xmm3              # xmm3 = r0, r1, r2, r3
        movaps    %xmm3, %xmm0

        addq      $40, %rsp
        ret
.endm
# WRAPPER_IMPL_SSE2_ff callee
#
# Function body that applies a two-operand callee lane by lane to the packed
# single-precision vectors in %xmm0 and %xmm1.
#
# In:    %xmm0 = packed floats x0..x3
#        %xmm1 = packed floats y0..y3
# Out:   %xmm0 = packed results r0..r3, where ri comes from the call made
#                with xi in %xmm0 and yi in %xmm1
# Calls: callee via the PLT, four times, lane 0 first
# Stack: frame of 56 bytes
#          0..15(%rsp)  copy of the first input vector
#         16..31(%rsp)  copy of the second input vector
#         32..47(%rsp)  one 4-byte result slot per lane
.macro WRAPPER_IMPL_SSE2_ff callee
        subq      $56, %rsp
        movaps    %xmm0, (%rsp)             # keep both inputs for lanes 1..3
        movaps    %xmm1, 16(%rsp)

        call      \ callee@PLT              # lane 0 is already the low element
        movss     %xmm0, 32(%rsp)

        movss     4(%rsp), %xmm0
        movss     20(%rsp), %xmm1
        call      \ callee@PLT              # lane 1
        movss     %xmm0, 36(%rsp)

        movss     8(%rsp), %xmm0
        movss     24(%rsp), %xmm1
        call      \ callee@PLT              # lane 2
        movss     %xmm0, 40(%rsp)

        movss     12(%rsp), %xmm0
        movss     28(%rsp), %xmm1
        call      \ callee@PLT              # lane 3, result stays live in %xmm0
        movss     %xmm0, 44(%rsp)           # slot is written but never reloaded

        movss     32(%rsp), %xmm3           # r0
        movss     36(%rsp), %xmm2           # r1
        movss     40(%rsp), %xmm1           # r2
        unpcklps  %xmm1, %xmm3              # xmm3 = r0, r2
        unpcklps  %xmm0, %xmm2              # xmm2 = r1, r3
        unpcklps  %xmm2, %xmm3              # xmm3 = r0, r1, r2, r3
        movaps    %xmm3, %xmm0

        addq      $56, %rsp
        ret
.endm
# WRAPPER_IMPL_SSE2_fFF callee
#
# Function body for a callee that takes one float in %xmm0 plus two output
# pointers in %rdi and %rsi. It is run once per lane of %xmm0 against two
# stack temporaries, and each pair of outputs is copied to the arrays the
# expansion itself received in %rdi and %rsi.
#
# In:    %xmm0 = packed floats x0..x3
#        %rdi  = first output array, four floats
#        %rsi  = second output array, four floats
# Out:   (%rdi)[i] = value the callee wrote through its %rdi for lane i
#        (%rsi)[i] = value the callee wrote through its %rsi for lane i
# Saves: %rbp and %rbx hold the two array pointers across the calls and are
#        restored before returning.
# Stack: two pushes plus a frame of 40 bytes
#          0..15(%rsp)  copy of the input vector
#         24(%rsp)      temporary handed to the callee in %rsi
#         28(%rsp)      temporary handed to the callee in %rdi
#
# The epilogue ends in ret. Without it, a function built from this macro
# would run past its last instruction into whatever follows it.
.macro WRAPPER_IMPL_SSE2_fFF callee
        pushq     %rbp
        pushq     %rbx
        movq      %rdi, %rbp                # rbp = first output array
        movq      %rsi, %rbx                # rbx = second output array
        subq      $40, %rsp
        leaq      24(%rsp), %rsi
        leaq      28(%rsp), %rdi
        movaps    %xmm0, (%rsp)             # keep the input for lanes 1..3
        call      \ callee@PLT              # lane 0 is already the low element

        leaq      24(%rsp), %rsi
        leaq      28(%rsp), %rdi
        movss     28(%rsp), %xmm0
        movss     %xmm0, 0(%rbp)            # first output, lane 0
        movaps    (%rsp), %xmm1
        movss     24(%rsp), %xmm0
        movss     %xmm0, (%rbx)             # second output, lane 0
        movaps    %xmm1, %xmm0
        shufps    $85, %xmm1, %xmm0         # bring x1 into the low element
        call      \ callee@PLT              # lane 1

        movss     28(%rsp), %xmm0
        leaq      24(%rsp), %rsi
        movss     %xmm0, 4(%rbp)            # first output, lane 1
        leaq      28(%rsp), %rdi
        movaps    (%rsp), %xmm1
        movss     24(%rsp), %xmm0
        movss     %xmm0, 4(%rbx)            # second output, lane 1
        movaps    %xmm1, %xmm0
        unpckhps  %xmm1, %xmm0              # bring x2 into the low element
        call      \ callee@PLT              # lane 2

        movaps    (%rsp), %xmm1
        leaq      24(%rsp), %rsi
        leaq      28(%rsp), %rdi
        movss     28(%rsp), %xmm0
        shufps    $255, %xmm1, %xmm1        # bring x3 into the low element
        movss     %xmm0, 8(%rbp)            # first output, lane 2
        movss     24(%rsp), %xmm0
        movss     %xmm0, 8(%rbx)            # second output, lane 2
        movaps    %xmm1, %xmm0
        call      \ callee@PLT              # lane 3

        movss     28(%rsp), %xmm0
        movss     %xmm0, 12(%rbp)           # first output, lane 3
        movss     24(%rsp), %xmm0
        movss     %xmm0, 12(%rbx)           # second output, lane 3

        addq      $40, %rsp
        popq      %rbx
        popq      %rbp
        ret
.endm
# WRAPPER_IMPL_AVX callee
#
# Function body that processes a 256-bit vector by calling the 128-bit
# routine __GI_<callee> on each half.
#
# In:    %ymm0 = input vector
# Out:   %ymm0 = result for the low half in bits 0..127 and result for the
#                high half in bits 128..255
# Calls: __GI_<callee> twice, low half first, each time with the half in
#        %xmm0 and the result expected back in %xmm0
# Stack: %rbp frame, %rsp rounded down to a multiple of 32, then 32 bytes
#          0..15(%rsp)  high half of the input
#         16..31(%rsp)  result for the low half
#
# The epilogue ends in ret. Without it, a function built from this macro
# would run past its last instruction into whatever follows it.
.macro WRAPPER_IMPL_AVX callee
        pushq     %rbp
        movq      %rsp, %rbp
        andq      $-32, %rsp
        subq      $32, %rsp
        vextractf128 $1, %ymm0, (%rsp)      # park the high half
        vzeroupper                          # low 128 bits of ymm0 are kept
        call      __GI_\callee              # low half
        vmovaps   %xmm0, 16(%rsp)
        vmovaps   (%rsp), %xmm0
        call      __GI_\callee              # high half
        vmovaps   %xmm0, %xmm1
        vmovaps   16(%rsp), %xmm0
        vinsertf128 $1, %xmm1, %ymm0, %ymm0 # ymm0 = high result : low result
        movq      %rbp, %rsp
        popq      %rbp
        ret
.endm
# WRAPPER_IMPL_AVX_ff callee
#
# Function body that processes two 256-bit vectors by calling the 128-bit
# two-operand routine __GI_<callee> on each pair of halves.
#
# In:    %ymm0 = first input vector
#        %ymm1 = second input vector
# Out:   %ymm0 = result for the low halves in bits 0..127 and result for the
#                high halves in bits 128..255
# Calls: __GI_<callee> twice, low halves first, operands in %xmm0 and %xmm1,
#        result expected back in %xmm0
# Stack: %rbp frame, %rsp rounded down to a multiple of 32, then 64 bytes
#          0..15(%rsp)  high half of the second input
#         16..31(%rsp)  high half of the first input
#         32..47(%rsp)  result for the low halves
#
# The epilogue ends in ret. Without it, a function built from this macro
# would run past its last instruction into whatever follows it.
.macro WRAPPER_IMPL_AVX_ff callee
        pushq     %rbp
        movq      %rsp, %rbp
        andq      $-32, %rsp
        subq      $64, %rsp
        vextractf128 $1, %ymm0, 16(%rsp)    # park both high halves
        vextractf128 $1, %ymm1, (%rsp)
        vzeroupper                          # low 128 bits of ymm0, ymm1 are kept
        call      __GI_\callee              # low halves
        vmovaps   %xmm0, 32(%rsp)
        vmovaps   16(%rsp), %xmm0
        vmovaps   (%rsp), %xmm1
        call      __GI_\callee              # high halves
        vmovaps   %xmm0, %xmm1
        vmovaps   32(%rsp), %xmm0
        vinsertf128 $1, %xmm1, %ymm0, %ymm0 # ymm0 = high result : low result
        movq      %rbp, %rsp
        popq      %rbp
        ret
.endm
# WRAPPER_IMPL_AVX_fFF callee
#
# Function body for a routine that takes a 128-bit vector in %xmm0 plus two
# output pointers in %rdi and %rsi, applied to both halves of %ymm0.
#
# In:    %ymm0 = input vector
#        %rdi  = first output array
#        %rsi  = second output array
# Out:   bytes 0..15 of each array are written by the first call, which
#        receives the incoming %rdi and %rsi unchanged;
#        bytes 16..31 of each array are copied from stack temporaries
#        filled by the second call.
# Calls: __GI_<callee> twice, low half first
# Saves: %r13 and %r14 hold the two array pointers across the calls and are
#        restored before returning.
# Stack: %rbp frame, %rsp rounded down to a multiple of 32, two pushes, then
#        48 bytes, which leaves %rsp a multiple of 32 again
#          0..31(%rsp)  input vector, later reused as the two temporaries
#         32..47(%rsp)  copy of the high half of the input
# NOTE:  the final copies use vmovaps, so 16(%rdi) and 16(%rsi) as passed in
#        must be 16-byte aligned.
#
# The epilogue ends in ret. Without it, a function built from this macro
# would run past its last instruction into whatever follows it.
.macro WRAPPER_IMPL_AVX_fFF callee
        pushq     %rbp
        movq      %rsp, %rbp
        andq      $-32, %rsp
        pushq     %r13
        pushq     %r14
        subq      $48, %rsp
        movq      %rsi, %r14                # r14 = second output array
        vmovaps   %ymm0, (%rsp)
        movq      %rdi, %r13                # r13 = first output array
        vmovaps   16(%rsp), %xmm1
        vmovaps   %xmm1, 32(%rsp)           # high half kept clear of the temporaries
        vzeroupper
        vmovaps   (%rsp), %xmm0
        call      __GI_\callee              # low half, writes through rdi and rsi
        vmovaps   32(%rsp), %xmm0
        lea       (%rsp), %rdi
        lea       16(%rsp), %rsi
        call      __GI_\callee              # high half, writes the temporaries
        vmovaps   (%rsp), %xmm0
        vmovaps   16(%rsp), %xmm1
        vmovaps   %xmm0, 16(%r13)
        vmovaps   %xmm1, 16(%r14)
        addq      $48, %rsp
        popq      %r14
        popq      %r13
        movq      %rbp, %rsp
        popq      %rbp
        ret
.endm
# WRAPPER_IMPL_AVX512 callee
#
# Function body that processes a 512-bit vector by calling the 256-bit
# routine __GI_<callee> on each half.
#
# In:    %zmm0 = input vector
# Out:   %zmm0 = result for the low half in bits 0..255 and result for the
#                high half in bits 256..511
# Calls: __GI_<callee> twice, low half first, each time with the half in
#        %ymm0 and the result expected back in %ymm0
# Stack: %rbp frame, %rsp rounded down to a multiple of 64, then 128 bytes
#          0..63(%rsp)   input vector
#         64..127(%rsp)  the two results, low half first
#
# The epilogue ends in ret. Without it, a function built from this macro
# would run past its last instruction into whatever follows it.
.macro WRAPPER_IMPL_AVX512 callee
        pushq     %rbp
        movq      %rsp, %rbp
        andq      $-64, %rsp
        subq      $128, %rsp
        vmovups   %zmm0, (%rsp)
        vmovupd   (%rsp), %ymm0
        call      __GI_\callee              # low half
        vmovupd   %ymm0, 64(%rsp)
        vmovupd   32(%rsp), %ymm0
        call      __GI_\callee              # high half
        vmovupd   %ymm0, 96(%rsp)
        vmovups   64(%rsp), %zmm0           # reload both results as one vector
        movq      %rbp, %rsp
        popq      %rbp
        ret
.endm
# WRAPPER_IMPL_AVX512_ff callee
#
# Function body that processes two 512-bit vectors by calling the 256-bit
# two-operand routine __GI_<callee> on each pair of halves.
#
# In:    %zmm0 = first input vector
#        %zmm1 = second input vector
# Out:   %zmm0 = result for the low halves in bits 0..255 and result for the
#                high halves in bits 256..511
# Calls: __GI_<callee> twice, low halves first, operands in %ymm0 and %ymm1,
#        result expected back in %ymm0
# Stack: %rbp frame, %rsp rounded down to a multiple of 64, then 192 bytes
#           0..63(%rsp)   first input vector
#          64..127(%rsp)  second input vector
#         128..191(%rsp)  the two results, low halves first
#
# The epilogue ends in ret. Without it, a function built from this macro
# would run past its last instruction into whatever follows it.
.macro WRAPPER_IMPL_AVX512_ff callee
        pushq     %rbp
        movq      %rsp, %rbp
        andq      $-64, %rsp
        subq      $192, %rsp
        vmovups   %zmm0, (%rsp)
        vmovups   %zmm1, 64(%rsp)
        vmovups   (%rsp), %ymm0
        vmovups   64(%rsp), %ymm1
        call      __GI_\callee              # low halves
        vmovups   %ymm0, 128(%rsp)
        vmovups   32(%rsp), %ymm0
        vmovups   96(%rsp), %ymm1
        call      __GI_\callee              # high halves
        vmovups   %ymm0, 160(%rsp)
        vmovups   128(%rsp), %zmm0          # reload both results as one vector
        movq      %rbp, %rsp
        popq      %rbp
        ret
.endm
# WRAPPER_IMPL_AVX512_fFF callee
#
# Function body for a routine that takes a 256-bit vector in %ymm0 plus two
# output pointers in %rdi and %rsi, applied to both halves of %zmm0.
#
# In:    %zmm0 = input vector
#        %rdi  = first output array
#        %rsi  = second output array
# Out:   bytes 0..31 of each array are written by the first call, which
#        receives the incoming %rdi and %rsi unchanged;
#        bytes 32..63 of each array are copied from stack temporaries
#        filled by the second call.
# Calls: __GI_<callee> twice, low half first
# Saves: %r12 and %r13 hold the two array pointers across the calls and are
#        restored before returning.
# Stack: %rbp frame, %rsp rounded down to a multiple of 64, two pushes, then
#        176 bytes, which leaves %rsp a multiple of 64 again
#          0..63(%rsp)   input vector
#         64..95(%rsp)   temporary handed to the second call in %rdi
#         96..127(%rsp)  temporary handed to the second call in %rsi
# NOTE:  the final copies use vmovaps, so 32(%rdi) and 32(%rsi) as passed in
#        must be 32-byte aligned.
#
# The epilogue ends in ret. Without it, a function built from this macro
# would run past its last instruction into whatever follows it.
.macro WRAPPER_IMPL_AVX512_fFF callee
        pushq     %rbp
        movq      %rsp, %rbp
        andq      $-64, %rsp
        pushq     %r12
        pushq     %r13
        subq      $176, %rsp
        movq      %rsi, %r13                # r13 = second output array
        vmovaps   %zmm0, (%rsp)
        movq      %rdi, %r12                # r12 = first output array
        vmovaps   (%rsp), %ymm0
        call      __GI_\callee              # low half, writes through rdi and rsi
        vmovaps   32(%rsp), %ymm0
        lea       64(%rsp), %rdi
        lea       96(%rsp), %rsi
        call      __GI_\callee              # high half, writes the temporaries
        vmovaps   64(%rsp), %ymm0
        vmovaps   96(%rsp), %ymm1
        vmovaps   %ymm0, 32(%r12)
        vmovaps   %ymm1, 32(%r13)
        addq      $176, %rsp
        popq      %r13
        popq      %r12
        movq      %rbp, %rsp
        popq      %rbp
        ret
.endm
# _ZGVcN8vv_powf
#
# Exported entry point whose whole body is the WRAPPER_IMPL_AVX_ff expansion:
# it calls __GI__ZGVbN4vv_powf once for the low 128-bit halves of %ymm0 and
# %ymm1 and once for the high halves, and returns the joined result in %ymm0.
# The entry is aligned to 16 bytes.
        .text
        .globl    _ZGVcN8vv_powf
        .type     _ZGVcN8vv_powf, @function
        .p2align  4
_ZGVcN8vv_powf:
        WRAPPER_IMPL_AVX_ff _ZGVbN4vv_powf
        .size     _ZGVcN8vv_powf, .-_ZGVcN8vv_powf