;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
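; "x2" denotes a half-pel shift in x: each output byte is the average of a
; source pixel and its right-hand neighbour. pavgb computes the rounded-up
; byte average (a + b + 1) >> 1, so per pixel this is, in C-like pseudocode
; (an illustrative sketch, not code from this file):
;   block[x] = (pixels[x] + pixels[x + 1] + 1) >> 1;
; Upper-case PAVGB is the x86util macro, which expands to whatever averaging
; instruction the active instruction set provides; lower-case pavgb is the
; raw SSE2 instruction. The +1 loads are misaligned by construction, hence
; movu; destination rows are assumed aligned, hence mova. The loop body is
; unrolled to cover four rows per iteration (sub r3d, 4).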
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova         [r0], m0
    mova         [r0+r2], m1
    add          r1, r4
    add          r0, r4
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova         [r0], m0
    mova         [r0+r2], m1
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
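; A 16-pixel row does not fit in one 8-byte MMX register, so this variant
; keeps the same x2 averaging but walks each row in two 8-byte halves
; (offsets +0/+1 and +8/+9); the SSE2 build below instead reuses the
; PUT_PIXELS8_X2 macro with 16-byte xmm registers.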
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova         [r0], m0
    mova         [r0+r2], m1
    mova         [r0+8], m2
    mova         [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova         [r0], m0
    mova         [r0+r2], m1
    mova         [r0+8], m2
    mova         [r0+r2+8], m3
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
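; The no-rounding ("no_rnd") average is floor((a + b) / 2), but pavgb always
; rounds up. Subtracting 1 from one operand with unsigned saturation
; (psubusb with pb_1) before averaging turns the round-up into a round-down:
;   (max(a - 1, 0) + b + 1) >> 1 == (a + b) >> 1   whenever a > 0
; Only a == 0 can misround, which is why this version is fast but not
; bit-exact; the _exact variant below handles that case too.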
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova         [r0], m0
    mova         [r0+r2], m2
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova         [r0], m0
    mova         [r0+r2], m2
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET


; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
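; Bit-exact round-down average via complementing: for bytes,
;   (a + b) >> 1 == 255 - pavgb(255 - a, 255 - b)
; pcmpeqb m6, m6 builds the all-ones mask; the pxor pairs complement the
; inputs before pavgb and the results after, so the round-up of pavgb turns
; into the required round-down for every input, including zero.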
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_x2_exact, 4,5
    lea          r4, [r2*3]
    pcmpeqb      m6, m6
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    pxor         m0, m6
    pxor         m2, m6
    pxor         m1, m6
    pxor         m3, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    pxor         m0, m6
    pxor         m2, m6
    mova         [r0], m0
    mova         [r0+r2], m2
    mova         m0, [r1+r2*2]
    mova         m1, [r1+r2*2+1]
    mova         m2, [r1+r4]
    mova         m3, [r1+r4+1]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    pxor         m0, m6
    pxor         m2, m6
    mova         [r0+r2*2], m0
    mova         [r0+r4], m2
    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    sub          r3d, 4
    jg .loop
    RET


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
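; "y2" is the vertical half-pel case: each output row is the average of two
; consecutive source rows, block[y][x] = avg(pixels[y][x], pixels[y+1][x]).
; Each source row feeds two output rows, so the code loads every row once
; and rotates it through m0/m1/m2 across loop halves; the sub r0, r2 before
; the loop pre-biases the destination pointer so stores can use the same
; [r0+r2]/[r0+r4] addressing as the loads.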
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova         [r0+r2], m0
    mova         [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova         [r0+r2], m2
    mova         [r0+r4], m1
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
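; Same round-down approximation as put_no_rnd_pixels8_x2 above, applied
; vertically: the freshly loaded middle row is biased down by 1 (psubusb)
; before it is averaged with both of its neighbouring rows.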
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]
    mova         m0, [r1]
    sub          r0, r2
.loop:
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova         [r0+r2], m0
    mova         [r0+r4], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova         [r0+r2], m2
    mova         [r0+r4], m1
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET


; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
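; Bit-exact vertical variant of the complement trick: rows are complemented
; once on load and the carried row stays complemented across iterations, so
; only the values about to be stored need a second pxor.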
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_y2_exact, 4,5
    lea          r4, [r2*3]
    mova         m0, [r1]
    pcmpeqb      m6, m6
    add          r1, r2
    pxor         m0, m6
.loop:
    mova         m1, [r1]
    mova         m2, [r1+r2]
    pxor         m1, m6
    pxor         m2, m6
    PAVGB        m0, m1
    PAVGB        m1, m2
    pxor         m0, m6
    pxor         m1, m6
    mova         [r0], m0
    mova         [r0+r2], m1
    mova         m1, [r1+r2*2]
    mova         m0, [r1+r4]
    pxor         m1, m6
    pxor         m0, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    pxor         m2, m6
    pxor         m1, m6
    mova         [r0+r2*2], m2
    mova         [r0+r4], m1
    lea          r1, [r1+r2*4]
    lea          r0, [r0+r2*4]
    sub          r3d, 4
    jg .loop
    RET


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
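; The avg_ variants compute the same half-pel interpolation and then blend
; it with what is already in the destination: one extra PAVGB against
; [r0+...] before each store. In the pre-SSE2 path PAVGB appears in its
; four-operand macro form; the trailing operands are a scratch register and
; a mask register used only by the plain-MMX fallback inside the macro, and
; are ignored when the macro expands to a real pavgb under mmxext.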
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova         [r0], m0
    mova         [r0+r2], m2
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova         [r0], m0
    mova         [r0+r2], m2
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
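; Vertical half-pel interpolation as in PUT_PIXELS8_Y2, with one extra PAVGB
; against the destination rows before each store to blend ("avg") instead of
; overwrite ("put").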
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova         [r0+r2], m0
    mova         [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova         [r0+r2], m2
    mova         [r0+r4], m1
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output
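; The exact 2D half-pel filter is (a + b + c + d + 2) >> 2 over the four
; neighbouring pixels. This routine approximates it with cascaded pavgb:
; one horizontal average per row, then a vertical average of those, which
; is far cheaper but lets the +1 round-up of each stage accumulate. The
; psubusb with pb_1 on one row per unrolled iteration knocks part of that
; bias back down; the result remains approximate, hence the avg_approx_
; name and the restriction to non-bitexact decoding.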
INIT_MMX mmxext
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
    mova         m0, [r1]
    PAVGB        m0, [r1+1]
.loop:
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0]
    PAVGB        m1, [r0+r2]
    mova         [r0], m0
    mova         [r0+r2], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m0, [r1+r4+1]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova         [r0], m2
    mova         [r0+r2], m1
    add          r0, r4
    sub          r3d, 4
    jne .loop
    RET


; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
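; Exact xy2: bytes are widened to words (punpck{l,h}bw against zeroed m7) so
; the full sum p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + 2 fits without
; overflow, then psrlw by 2 and packuswb narrow the result back to bytes.
; The horizontal pair sum of each source row is computed once and carried in
; registers (m4/m5 alternating with m0/m1) into the next loop half, so every
; row is read only once.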
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor         m7, m7
    mova         m6, [pw_2]
    movu         m0, [r1]
    movu         m4, [r1+1]
    mova         m1, m0
    mova         m5, m4
    punpcklbw    m0, m7
    punpcklbw    m4, m7
    punpckhbw    m1, m7
    punpckhbw    m5, m7
    paddusw      m4, m0
    paddusw      m5, m1
    xor          r4, r4
    add          r1, r2
.loop:
    movu         m0, [r1+r4]
    movu         m2, [r1+r4+1]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpcklbw    m2, m7
    punpckhbw    m1, m7
    punpckhbw    m3, m7
    paddusw      m0, m2
    paddusw      m1, m3
    paddusw      m4, m6
    paddusw      m5, m6
    paddusw      m4, m0
    paddusw      m5, m1
    psrlw        m4, 2
    psrlw        m5, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m4, m5
    PAVGB        m4, m3
%else
    packuswb     m4, m5
%endif
    mova         [r0+r4], m4
    add          r4, r2

    movu         m2, [r1+r4]
    movu         m4, [r1+r4+1]
    mova         m3, m2
    mova         m5, m4
    punpcklbw    m2, m7
    punpcklbw    m4, m7
    punpckhbw    m3, m7
    punpckhbw    m5, m7
    paddusw      m4, m2
    paddusw      m5, m3
    paddusw      m0, m6
    paddusw      m1, m6
    paddusw      m0, m4
    paddusw      m1, m5
    psrlw        m0, 2
    psrlw        m1, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m0, m1
    PAVGB        m0, m3
%else
    packuswb     m0, m1
%endif
    mova         [r0+r4], m0
    add          r4, r2
    sub          r3d, 2
    jnz .loop
    RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
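
; SSSE3 version of the same filter: pmaddubsw with pb_1 computes the
; horizontal pair sums p[x] + p[x+1] as words in a single instruction, and
; pmulhrsw with pw_8192 evaluates (sum + 2) >> 2 in one more, since
; pmulhrsw(x, 8192) = ((x * 8192 >> 14) + 1) >> 1, which equals
; (x + 2) >> 2 for the value range here. The pair sums come out of
; pmaddubsw with even-indexed output pixels in one register and odd-indexed
; ones in the other, so packuswb leaves the two halves side by side and the
; pshufb with pb_interleave{8,16} restores pixel order before the store. As
; in SET_PIXELS_XY2, each row's pair sums are carried across loop halves.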
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; xmm, 16-pixel variant
cglobal %1_pixels16_xy2, 4,5,%2
    mova         m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova         m4, [pb_interleave8]
%endif
    mova         m5, [pb_1]
    movu         m0, [r1]
    movu         m1, [r1+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    xor          r4, r4
    add          r1, r2
.loop:
    movu         m2, [r1+r4]
    movu         m3, [r1+r4+1]
    pmaddubsw    m2, m5
    pmaddubsw    m3, m5
    paddusw      m0, m2
    paddusw      m1, m3
    pmulhrsw     m0, [pw_8192]
    pmulhrsw     m1, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m0, m1
    pshufb       m0, m4
    pavgb        m0, m6
%else
    packuswb     m0, m1
    pshufb       m0, m4
%endif
    mova         [r0+r4], m0
    add          r4, r2

    movu         m0, [r1+r4]
    movu         m1, [r1+r4+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    paddusw      m2, m0
    paddusw      m3, m1
    pmulhrsw     m2, [pw_8192]
    pmulhrsw     m3, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m2, m3
    pshufb       m2, m4
    pavgb        m2, m6
%else
    packuswb     m2, m3
    pshufb       m2, m4
%endif
    mova         [r0+r4], m2
    add          r4, r2
    sub          r3d, 2
    jnz .loop
    RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7