aarch64: Add assembly support for -fsanitize=hwaddress tagged globals.
[libav.git] / libavcodec / x86 / rv40dsp.asm
blob4949842e8f13ab13b41cfd6dcee5a784a0abcc8e
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 16

; 8 words of 1 << (16 - 6) = 1024. Loaded into m1 by the SSSE3 weight
; functions and used there as the pmulhrsw multiplier.
pw_1024:            times 8 dw 1 << (16 - 6)

; Six-tap coefficients as signed byte pairs (operands for pmaddubsw).
; Three consecutive groups of 2 x 16 bytes; the SSSE3 functions read the
; first 16 bytes of a group at [reg] and the second at [reg+16].
sixtap_filter_hb_m: times 8 db   1,  -5
                    times 8 db  52,  20
                    ; multiplied by 2 to have the same shift
                    times 8 db   2, -10
                    times 8 db  40,  40
                    ; back to normal
                    times 8 db   1,  -5
                    times 8 db  20,  52

; The same coefficients as 16-bit words (operands for pmullw).
; Three consecutive groups of 4 x 16 bytes; the MMX/SSE2 functions read a
; group at [reg+0], [reg+16], [reg+32] and [reg+48].
sixtap_filter_v_m:  times 8 dw   1
                    times 8 dw  -5
                    times 8 dw  52
                    times 8 dw  20
                    ; multiplied by 2 to have the same shift
                    times 8 dw   2
                    times 8 dw -10
                    times 8 dw  40
                    times 8 dw  40
                    ; back to normal
                    times 8 dw   1
                    times 8 dw  -5
                    times 8 dw  20
                    times 8 dw  52

; Base operand handed to the LOAD macro. PIC builds go through an extra
; general-purpose register (picregq) that each function fills with lea, so
; npicregs is the number of additional GPRs those functions must request.
; NOTE(review): no sixtap_filter_hw_m label exists in the visible source and
; sixtap_filter_hw is not referenced by it either - confirm before relying on it.
%ifdef PIC
%define sixtap_filter_hw   picregq
%define sixtap_filter_hb   picregq
%define sixtap_filter_v    picregq
%define npicregs 1
%else
%define sixtap_filter_hw   sixtap_filter_hw_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define sixtap_filter_v    sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks used by the SSSE3 horizontal filter to build adjacent byte
; pairs from one 16-byte load taken at src-2:
;   shuf1: (0,1) (1,2) ... (7,8)
;   shuf2: (2,3) (3,4) ... (9,10)
;   shuf3: (5,4) (6,5) ... (12,11)   (each pair reversed)
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,  5, 6,  6,  7,  7,  8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7,  7, 8,  8,  9,  9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

; Word constants defined outside this file.
cextern pw_32
cextern pw_16
cextern pw_512

SECTION .text
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int dststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
;-----------------------------------------------------------------------------
; LOAD reg, table
;   %1 = register name (no size suffix) holding an offset into a coefficient table
;   %2 = table base symbol; only used in non-PIC builds
;
; Converts the offset in %1 into an address by adding the table base.
; In PIC builds the base comes from picregq, which the calling function must
; already have loaded (the functions in this file do so with lea).
; On WIN64 the low 32 bits of %1 are sign-extended to 64 bits first.
%macro LOAD 2
%if WIN64
    movsxd     %1q, %1d
%endif
%ifdef PIC
    add        %1q, picregq
%else
    add        %1q, %2
%endif
%endmacro
; STORE res, tmp, op
;   %1 = vector register holding the result as 16-bit words
;   %2 = scratch vector register (only written when op is avg)
;   %3 = put or avg
;
; Packs the words of %1 to bytes with unsigned saturation and writes the low
; half of the register (movh) to [dstq]. For avg, the bytes currently at
; [dstq] are fetched into %2 beforehand and averaged into the packed result
; with PAVGB before the store.
%macro STORE 3
%ifidn %3, avg
    movh        %2, [dstq]
%endif
    packuswb    %1, %1
%ifidn %3, avg
    PAVGB       %1, %2
%endif
    movh    [dstq], %1
%endmacro
; FILTER_V op          (op = put or avg)
;
; Emits <op>_rv40_qpel_v for the current instruction set: a vertical six-tap
; filter using the 16-bit coefficient tables (sixtap_filter_v_m).
;
; Register roles inside the function:
;   m7      = zero, used to widen bytes to words
;   m0..m4  = five consecutive source rows as words, starting two rows above src
;   m5      = newly loaded sixth row (then reused as scratch by STORE)
;   m6      = accumulator
;   myq     = address of the selected coefficient group (after LOAD)
; One row is produced per iteration; heightd counts the remaining rows.
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea    picregq, [sixtap_filter_v_m]
%endif
    pxor        m7, m7
    LOAD        my, sixtap_filter_v

    ; step back two rows, then fetch the first five rows of the window
    sub       srcq, srcstrideq
    sub       srcq, srcstrideq
    movh        m0, [srcq]
    movh        m1, [srcq+srcstrideq]
    movh        m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq
    movh        m3, [srcq]
    movh        m4, [srcq+srcstrideq]
    punpcklbw   m0, m7
    punpcklbw   m1, m7
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    punpcklbw   m4, m7

    ; keep the four coefficient vectors in registers when m8..m11 exist,
    ; otherwise use them straight from memory
%ifdef m8
    mova        m8, [myq+ 0]
    mova        m9, [myq+16]
    mova       m10, [myq+32]
    mova       m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
.row_loop:
    ; m6 = (row1+row4)*C14 + (row0+row5)*C05 + row2*C2 + row3*C3 + 32
    ; the row registers are shifted down by one (m0<-m1 ... m4<-m5) on the way
    mova        m6, m1
    movh        m5, [srcq+2*srcstrideq]      ; sixth row of the window
    paddw       m6, m4
    punpcklbw   m5, m7
    pmullw      m6, COEFF14
    paddw       m0, m5
    pmullw      m0, COEFF05
    paddw       m6, m0
    mova        m0, m1
    paddw       m6, [pw_32]
    mova        m1, m2
    pmullw      m2, COEFF2
    paddw       m6, m2
    mova        m2, m3
    pmullw      m3, COEFF3
    paddw       m6, m3

    ; finish the row rotation, scale down by 6 bits and write the result
    mova        m3, m4
    psraw       m6, 6
    mova        m4, m5
    STORE       m6, m5, %1

    ; advance both pointers by one row
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec    heightd                           ; rows left
    jg .row_loop
    REP_RET
%endmacro
; FILTER_H op          (op = put or avg)
;
; Emits <op>_rv40_qpel_h for the current instruction set: a horizontal
; six-tap filter. It uses the same 16-bit coefficient tables as the vertical
; filter (sixtap_filter_v_m).
;
; Register roles inside the function:
;   m7      = zero, used to widen bytes to words
;   m6      = pw_32 rounding constant
;   m0..m5  = the six taps, loaded from src-2 .. src+3
;   mxq     = address of the selected coefficient group (after LOAD)
; One row is produced per iteration; heightd counts the remaining rows.
%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea    picregq, [sixtap_filter_v_m]
%endif
    pxor        m7, m7
    LOAD        mx, sixtap_filter_v
    mova        m6, [pw_32]

    ; keep the four coefficient vectors in registers when m8..m11 exist,
    ; otherwise use them straight from memory
%ifdef m8
    mova        m8, [mxq+ 0]
    mova        m9, [mxq+16]
    mova       m10, [mxq+32]
    mova       m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.row_loop:
    ; outer taps (-2,+3) and (-1,+2) are summed pairwise before multiplying
    movq        m0, [srcq-2]
    movq        m5, [srcq+3]
    movq        m1, [srcq-1]
    movq        m4, [srcq+2]
    punpcklbw   m0, m7
    punpcklbw   m5, m7
    punpcklbw   m1, m7
    punpcklbw   m4, m7
    movq        m2, [srcq-0]
    movq        m3, [srcq+1]
    paddw       m0, m5
    paddw       m1, m4
    punpcklbw   m2, m7
    punpcklbw   m3, m7
    pmullw      m0, COEFF05
    pmullw      m1, COEFF14
    pmullw      m2, COEFF2
    pmullw      m3, COEFF3
    ; sum the four products plus the rounding constant, then scale by 6 bits
    paddw       m0, m6
    paddw       m1, m2
    paddw       m0, m3
    paddw       m0, m1
    psraw       m0, 6
    STORE       m0, m1, %1

    ; advance both pointers by one row
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec    heightd                           ; rows left
    jg .row_loop
    REP_RET
%endmacro
; MMX-register versions are only assembled for 32-bit x86:
; put for plain mmx, avg for mmxext and for 3dnow.
%if ARCH_X86_32
INIT_MMX  mmx
FILTER_V  put
FILTER_H  put

INIT_MMX  mmxext
FILTER_V  avg
FILTER_H  avg

INIT_MMX  3dnow
FILTER_V  avg
FILTER_H  avg
%endif

; SSE2 versions: put and avg, horizontal and vertical.
INIT_XMM  sse2
FILTER_H  put
FILTER_H  avg
FILTER_V  put
FILTER_V  avg
; FILTER_SSSE3 op      (op = put or avg)
;
; Emits both <op>_rv40_qpel_v and <op>_rv40_qpel_h using pmaddubsw on the
; byte-pair coefficient table (sixtap_filter_hb_m) and pmulhrsw with pw_512
; for the final scaling.
%macro FILTER_SSSE3 1
;
; Vertical filter.
;   m0..m4 = five consecutive source rows (bytes), starting two rows above src
;   m5     = first 16 bytes of the selected coefficient group ([myq])
;   m6     = accumulator, m7 = newly loaded row / scratch
;
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea    picregq, [sixtap_filter_hb_m]
%endif

    ; step back two rows, then fetch the first five rows of the window
    sub       srcq, srcstrideq
    LOAD        my, sixtap_filter_hb
    sub       srcq, srcstrideq
    movh        m0, [srcq]
    movh        m1, [srcq+srcstrideq]
    movh        m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq
    mova        m5, [myq]
    movh        m3, [srcq]
    movh        m4, [srcq+srcstrideq]
    lea       srcq, [srcq+2*srcstrideq]      ; srcq -> sixth row

.row_loop:
    ; rows are interleaved in pairs so that each pmaddubsw applies two taps:
    ;   (row0,row1) * [myq], (row2,row3) * [myq+16], (row5,row4) * [myq]
    ; the row registers are shifted down by one (m0<-m1 ... m4<-new) on the way
    mova        m6, m2
    punpcklbw   m0, m1
    punpcklbw   m6, m3
    pmaddubsw   m0, m5
    pmaddubsw   m6, [myq+16]
    movh        m7, [srcq]                   ; sixth row of the window
    paddw       m6, m0
    mova        m0, m1
    mova        m1, m2
    mova        m2, m3
    mova        m3, m4
    mova        m4, m7
    punpcklbw   m7, m3
    pmaddubsw   m7, m5
    paddw       m6, m7
    pmulhrsw    m6, [pw_512]
    STORE       m6, m7, %1

    ; advance both pointers by one row
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec    heightd                           ; rows left
    jg .row_loop
    REP_RET

;
; Horizontal filter.
;   m7, m3, m4 = pshufb masks filter_h6_shuf1 / shuf2 / shuf3
;   m5, m6     = the two 16-byte halves of the selected coefficient group
;   m0..m2     = per-row working registers
;
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea    picregq, [sixtap_filter_hb_m]
%endif
    mova        m3, [filter_h6_shuf2]
    mova        m4, [filter_h6_shuf3]
    LOAD        mx, sixtap_filter_hb
    mova        m5, [mxq]                    ; six-tap coefficients as bytes
    mova        m6, [mxq+16]
    mova        m7, [filter_h6_shuf1]

.row_loop:
    ; one unaligned load at src-2, shuffled three ways into tap pairs
    movu        m0, [srcq-2]
    mova        m1, m0
    mova        m2, m0
    pshufb      m0, m7
    pshufb      m1, m3
    pshufb      m2, m4
    pmaddubsw   m0, m5
    pmaddubsw   m1, m6
    pmaddubsw   m2, m5
    paddw       m0, m1
    paddw       m0, m2
    pmulhrsw    m0, [pw_512]
    STORE       m0, m1, %1

    ; advance both pointers by one row
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec    heightd                           ; rows left
    jg .row_loop
    REP_RET
%endmacro
; SSSE3 versions: each invocation emits a vertical and a horizontal function.
INIT_XMM      ssse3
FILTER_SSSE3  put
FILTER_SSSE3  avg
; RV40_WCORE weights5, dst, src1, src2 [, stride]
;   %1 = 0 for the 14-bit weight path, non-zero for the 5-bit weight path
;   %2 = dst base, %3 = src1 base, %4 = src2 base (r6 is added as running offset)
;   %5 = stride; optional, passed only for 8-wide blocks with XMM registers
;
; Processes one register's worth of pixels as two halves (m4/m5 and m6/m7).
; Without %5 the second half sits mmsize/2 bytes further along the same row;
; with %5 it is the first half of the next row, and r6 is left advanced by r5.
;
; Expects from the caller (see RV40_WEIGHT):
;   m0 = zero, m1 = rounding constant (multiplier for pmulhrsw on SSSE3),
;   m2 / m3 = broadcast weights, r5 = stride, r6 = running offset
%macro RV40_WCORE 4-5
    movh        m4, [%3 + r6 + 0]
    movh        m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and SSE2, stride was provided: second half = next row
%define OFFSET r6
    add         r6, r5
%endif
    movh        m6, [%3 + OFFSET]
    movh        m7, [%4 + OFFSET]

%if %1 == 0
    ; 14-bit weights: widen to words, pre-shift the pixels left by 7 and keep
    ; the high word of each product
    punpcklbw   m4, m0
    punpcklbw   m5, m0
    punpcklbw   m6, m0
    punpcklbw   m7, m0

    psllw       m4, 7
    psllw       m5, 7
    psllw       m6, 7
    psllw       m7, 7
    pmulhw      m4, m3
    pmulhw      m5, m2
    pmulhw      m6, m3
    pmulhw      m7, m2

    paddw       m4, m5
    paddw       m6, m7
%else
    ; 5-bit weights
%if cpuflag(ssse3)
    ; interleave src1/src2 bytes; m3 holds the matching pair of byte weights,
    ; so a single pmaddubsw multiplies and sums both sources
    punpcklbw   m4, m5
    punpcklbw   m6, m7

    pmaddubsw   m4, m3
    pmaddubsw   m6, m3
%else
    punpcklbw   m4, m0
    punpcklbw   m5, m0
    punpcklbw   m6, m0
    punpcklbw   m7, m0

    pmullw      m4, m3
    pmullw      m5, m2
    pmullw      m6, m3
    pmullw      m7, m2
    paddw       m4, m5
    paddw       m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw    m4, m1
    pmulhrsw    m6, m1
%else
    paddw       m4, m1
    paddw       m6, m1
    psrlw       m4, 5
    psrlw       m6, 5
%endif

    packuswb    m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and SSE2: the low half goes to the row r6
    ; pointed at on entry, the high half to the following row
    sub         r6, r5
    movh   [%2 + r6], m4
    add         r6, r5
    movhps [%2 + r6], m4
%else
    mova   [%2 + r6], m4
%endif
%endmacro
; MAIN_LOOP size, weights5
;   %1 = block size (8 or 16)
;   %2 = forwarded to RV40_WCORE as its first argument
;
; Body of one loop iteration of the weight functions:
;   - 8-byte registers:            one RV40_WCORE per 8 columns of a single row
;   - 16-byte registers, size 8:   one RV40_WCORE covering two rows; it has
;                                  already added r5 to r6 once
;   - 16-byte registers, size 16:  one RV40_WCORE covering a single row
; The final add moves r6 on to the next unprocessed row; it is the last
; instruction emitted, so its flags are what the caller's jnz tests.
%macro MAIN_LOOP 2
%if mmsize == 8
    RV40_WCORE  %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE  %2, r0 + 8, r1 + 8, r2 + 8
%endif
%elifidn %1, 8
    RV40_WCORE  %2, r0, r1, r2, r5
%else
    RV40_WCORE  %2, r0, r1, r2
%endif

    ; Prepare for next loop
    add         r6, r5
%endmacro
; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                int w1, int w2, int stride)
; %1 = rnd or nornd
; %2 = block size (8 or 16)
; %3 = shift applied to the stride to get the total byte span (3 for 8, 4 for 16)
;
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided: when they are
; multiples of 2^9 they fit in 5 bits. No such check is made in this macro;
; the rnd variants run the 14-bit path of RV40_WCORE and the nornd variants
; the 5-bit path (presumably the caller picks the variant - confirm).
;
; Register roles:
;   r0/r1/r2 = dst/src1/src2, moved to the end of the block
;   r5       = stride
;   r6       = negative offset from the block end, counting up to zero
;   m0 = zero, m1 = rounding constant, m2 = w1 (r3d), m3 = w2 (r4d)
; NOTE(review): r5 is used at full register width although the prototype
; says int - confirm the upper bits are well defined on 64-bit targets.
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova        m1, [pw_1024]
%else
    mova        m1, [pw_16]
%endif
    pxor        m0, m0
    ; r6 = stride << %3; point all three buffers past the block and walk a
    ; negative offset up towards zero
    mov         r6, r5
    shl         r6, %3
    add         r0, r6
    add         r1, r6
    add         r2, r6
    neg         r6

    movd        m2, r3d
    movd        m3, r4d
%ifidn %1,rnd
%define  RND   0
    SPLATW      m2, m2
%else
%define  RND   1
%if cpuflag(ssse3)
    ; pack the low bytes of both weights into one word for pmaddubsw
    punpcklbw   m3, m2
%else
    SPLATW      m2, m2
%endif
%endif
    SPLATW      m3, m3

.weight_loop:
    MAIN_LOOP   %2, RND
    ; MAIN_LOOP ends with "add r6, r5": loop until the offset reaches zero
    jnz         .weight_loop
    REP_RET
%endmacro
; Weight functions: rnd and nornd, 8 and 16 wide, for each instruction set.
; The third argument is the stride shift (3 for size 8, 4 for size 16).
INIT_MMX     mmxext
RV40_WEIGHT  rnd,    8, 3
RV40_WEIGHT  rnd,   16, 4
RV40_WEIGHT  nornd,  8, 3
RV40_WEIGHT  nornd, 16, 4

INIT_XMM     sse2
RV40_WEIGHT  rnd,    8, 3
RV40_WEIGHT  rnd,   16, 4
RV40_WEIGHT  nornd,  8, 3
RV40_WEIGHT  nornd, 16, 4

INIT_XMM     ssse3
RV40_WEIGHT  rnd,    8, 3
RV40_WEIGHT  rnd,   16, 4
RV40_WEIGHT  nornd,  8, 3
RV40_WEIGHT  nornd, 16, 4