;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask  times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_neg   times 4 dd 1<<31

SECTION .text

INIT_XMM sse
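; float ff_sbr_sum_square_sse(float (*x)[2], int n)
; returns the sum of x[i][0]^2 + x[i][1]^2 over the n complex (re,im) pairs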
cglobal sbr_sum_square, 2, 3, 6
    mov       r2, r1
    xorps     m0, m0
    xorps     m1, m1
    sar       r2, 3
    jz        .prepare
.loop:
    movu      m2, [r0 + 0]
    movu      m3, [r0 + 16]
    movu      m4, [r0 + 32]
    movu      m5, [r0 + 48]
    mulps     m2, m2
    mulps     m3, m3
    mulps     m4, m4
    mulps     m5, m5
    addps     m0, m2
    addps     m1, m3
    addps     m0, m4
    addps     m1, m5
    add       r0, 64
    dec       r2
    jnz       .loop
.prepare:
    and       r1, 7
    sar       r1, 1
    jz        .end
    ; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu      m2, [r0]
    add       r0, 16
    mulps     m2, m2
    dec       r1
    addps     m0, m2
    jnz       .endloop
.end:
    addps     m0, m1
    movhlps   m2, m0
    addps     m0, m2
    movss     m1, m0
    shufps    m0, m0, 1
    addss     m0, m1
%if ARCH_X86_64 == 0
    movss     r0m, m0
    fld       dword r0m
%endif
    RET

%define STEP 40*4*2
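; void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
;                           const float *g_filt, int m_max, intptr_t ixh)
; Y[m] = X_high[m][ixh] * g_filt[m]: each complex X_high sample is scaled by a
; real gain; STEP is the byte stride of one X_high row of 40 complex floats.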
cglobal sbr_hf_g_filt, 5, 6, 5
    lea       r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov       r5, r3
    and       r3, 0xFC
    lea       r2, [r2 + r3*4]
    lea       r0, [r0 + r3*8]
    neg       r3
    jz        .loop1
.loop4:
    movlps    m0, [r2 + 4*r3 + 0]
    movlps    m1, [r2 + 4*r3 + 8]
    movlps    m2, [r1 + 0*STEP]
    movlps    m3, [r1 + 2*STEP]
    movhps    m2, [r1 + 1*STEP]
    movhps    m3, [r1 + 3*STEP]
    unpcklps  m0, m0
    unpcklps  m1, m1
    mulps     m0, m2
    mulps     m1, m3
    movu      [r0 + 8*r3 + 0], m0
    movu      [r0 + 8*r3 + 16], m1
    add       r1, 4*STEP
    add       r3, 4
    jnz       .loop4
    and       r5, 3 ; number of single element loops
    jz        .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss     m0, [r2]
    movlps    m2, [r1]
    unpcklps  m0, m0
    mulps     m2, m0
    movlps    [r0], m2
    add       r0, 8
    add       r2, 4
    add       r1, STEP
    dec       r5
    jnz       .loop1
.end:
    RET

; void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
;                        const float alpha0[2], const float alpha1[2],
;                        float bw, int start, int end)
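; Computed per output sample (two samples per loop iteration), with complex
; multiplications:
; X_high[i] = X_low[i] + bw*alpha0*X_low[i-1] + bw*bw*alpha1*X_low[i-2]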
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss     bw, BWm
%endif
    movlps    m2, [alpha1q]
    movlps    m1, [alpha0q]
    shufps    bw, bw, 0
    mulps     m2, bw        ; (a1[0] a1[1])*bw
    mulps     m1, bw        ; (a0[0] a0[1])*bw = (a2 a3)
    mulps     m2, bw        ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova      m3, m1
    mova      m4, m2
    mova      m7, [ps_mask]

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end are the 6th and 7th args, passed on the stack
    mov       r2d, Sm
    mov       r3d, Em
%define start r2q
%define end r3q
%else
    ; BW does not actually occupy a register, so shift by 1
%define start BWq
%define end Sq
%endif
    sub       start, end    ; neg num of loops
    lea       X_highq, [X_highq + end*2*4]
    lea       X_lowq,  [X_lowq + end*2*4 - 2*2*4]
    shl       start, 3      ; offset from num loops

    mova      m0, [X_lowq + start]
    movlhps   m1, m1        ; (a2 a3 a2 a3)
    movlhps   m2, m2        ; (a0 a1 a0 a1)
    shufps    m3, m3, q0101 ; (a3 a2 a3 a2)
    shufps    m4, m4, q0101 ; (a1 a0 a1 a0)
    xorps     m3, m7        ; (-a3 a2 -a3 a2)
    xorps     m4, m7        ; (-a1 a0 -a1 a0)
.loop2:
    mova      m5, m0
    mova      m6, m0
    shufps    m0, m0, q2200 ; {Xl[-2][0],",Xl[-1][0],"}
    shufps    m5, m5, q3311 ; {Xl[-2][1],",Xl[-1][1],"}
    mulps     m0, m2
    mulps     m5, m4
    mova      m7, m6
    addps     m5, m0
    mova      m0, [X_lowq + start + 2*2*4]
    shufps    m6, m0, q0022 ; {Xl[-1][0],",Xl[0][0],"}
    shufps    m7, m0, q1133 ; {Xl[-1][1],",Xl[0][1],"}
    mulps     m6, m1
    mulps     m7, m3
    addps     m5, m6
    addps     m7, m0
    addps     m5, m7
    mova      [X_highq + start], m5
    add       start, 16
    jnz       .loop2
    RET

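; void ff_sbr_sum64x5_sse(float *z)
; z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k = 0..63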
cglobal sbr_sum64x5, 1,2,4,z
    lea       r1q, [zq+ 256]
.loop:
    mova      m0, [zq+ 0]
    mova      m2, [zq+ 16]
    mova      m1, [zq+ 256]
    mova      m3, [zq+ 272]
    addps     m0, [zq+ 512]
    addps     m2, [zq+ 528]
    addps     m1, [zq+ 768]
    addps     m3, [zq+ 784]
    addps     m0, [zq+1024]
    addps     m2, [zq+1040]
    addps     m0, m1
    addps     m2, m3
    mova      [zq], m0
    mova      [zq+16], m2
    add       zq, 32
    cmp       zq, r1q
    jne       .loop
    REP_RET

INIT_XMM sse
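; void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z)
; W[k][0] = -z[63-k], W[k][1] = z[k] for k = 0..31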
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
    lea       r2q, [zq + (64-4)*4]
    mova      m3, [ps_neg]
.loop:
    mova      m1, [zq]
    xorps     m0, m3, [r2q]
    shufps    m0, m0, m0, q0123
    unpcklps  m2, m0, m1
    unpckhps  m0, m0, m1
    mova      [Wq + 0], m2
    mova      [Wq + 16], m0
    add       Wq, 32
    sub       r2q, 16
    add       zq, 16
    cmp       zq, r2q
    jl        .loop
    REP_RET

INIT_XMM sse
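; void ff_sbr_neg_odd_64_sse(float *z)
; flip the sign of the odd-indexed floats z[1], z[3], ..., z[63]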
cglobal sbr_neg_odd_64, 1,2,4,z
    lea       r1q, [zq+256]
.loop:
    mova      m0, [zq+ 0]
    mova      m1, [zq+16]
    mova      m2, [zq+32]
    mova      m3, [zq+48]
    xorps     m0, [ps_mask2]
    xorps     m1, [ps_mask2]
    xorps     m2, [ps_mask2]
    xorps     m3, [ps_mask2]
    mova      [zq+ 0], m0
    mova      [zq+16], m1
    mova      [zq+32], m2
    mova      [zq+48], m3
    add       zq, 64
    cmp       zq, r1q
    jne       .loop
    REP_RET

INIT_XMM sse2
; void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1)
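; v[i] = src0[i] - src1[63-i] and v[127-i] = src0[i] + src1[63-i] for i = 0..63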
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
    mov       cq, 64*4-2*mmsize
    lea       vrevq, [vq + 64*4]
.loop:
    mova      m0, [src0q+cq]
    mova      m1, [src1q]
    mova      m2, [src0q+cq+mmsize]
    mova      m3, [src1q+mmsize]
    pshufd    m4, m0, q0123
    pshufd    m5, m1, q0123
    pshufd    m6, m2, q0123
    pshufd    m7, m3, q0123
    addps     m3, m4
    subps     m0, m7
    addps     m1, m6
    subps     m2, m5
    mova      [vrevq], m1
    mova      [vrevq+mmsize], m3
    mova      [vq+cq], m0
    mova      [vq+cq+mmsize], m2
    add       src1q, 2*mmsize
    add       vrevq, 2*mmsize
    sub       cq, 2*mmsize
    jge       .loop
    REP_RET

INIT_XMM sse2
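; void ff_sbr_qmf_pre_shuffle_sse2(float *z)
; z[64] = z[0], z[65] = z[1];
; z[64+2*k] = -z[64-k], z[64+2*k+1] = z[k+1] for k = 1..31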
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
    mov       r3q, OFFSET
    lea       r1q, [zq + (32+1)*4]
    lea       r2q, [zq + 64*4]
    mova      m5, [ps_neg]
.loop:
    movu      m0, [r1q]
    movu      m2, [r1q + mmsize]
    movu      m1, [zq + r3q + 4 + mmsize]
    movu      m3, [zq + r3q + 4]

    pxor      m2, m5
    pxor      m0, m5
    pshufd    m2, m2, q0123
    pshufd    m0, m0, q0123
    SBUTTERFLY dq, 2, 3, 4
    SBUTTERFLY dq, 0, 1, 4
    mova      [r2q + 2*r3q + 0*mmsize], m2
    mova      [r2q + 2*r3q + 1*mmsize], m3
    mova      [r2q + 2*r3q + 2*mmsize], m0
    mova      [r2q + 2*r3q + 3*mmsize], m1
    add       r1q, 2*mmsize
    sub       r3q, 2*mmsize
    jge       .loop
    movq      m2, [zq]
    movq      [r2q], m2
    REP_RET