;; Provenance (was raw web-viewer residue): commit subject "lavfi: switch to AVFrame."
;; from FFMpeg-mirror/mplayer-patches.git — libavcodec/x86/h264_intrapred_10bit.asm
;; blob 1b7974b7909a440c30e8121ccde56d7db65be497
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1

; file-local constants
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4      ; ramp used by plane prediction
pw_m3:        times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)          ; 1023 = max 10-bit sample
pw_512:       times 8 dw 512                    ; 1 << (BIT_DEPTH-1), mid-grey
pd_17:        times 4 dd 17
pd_16:        times 4 dd 16

SECTION .text
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; note: clobbers %2; pavgw supplies the final +1 rounding of the average
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%endmacro
;-----------------------------------------------------------------------------
; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movhps    m1, [r1-8]
    movhps    m2, [r0+r2*1-8]
    movhps    m4, [r0-8]
    punpckhwd m2, m4
    movq      m3, [r0]
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1            ; t3 t2 t1 t0 lt l0 l1 l2
    movhps    m4, [r1+r2*1-8]
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    ; each output row is the previous one shifted right by one pixel
    movq      [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
INIT_XMM avx
PRED4x4_DR
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movq    m5, [r0]                    ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1          ; ......t3t2t1t0lt
    pavgw   m5, m0
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1              ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2          ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3          ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12
    psrldq  m1, 4
    movq    [r0+r2*1], m5
    movq    [r0+r2*2], m1
    PALIGNR m5, m0, 14, m2
    pslldq  m0, 2
    movq    [r1+r2*1], m5
    PALIGNR m1, m0, 14, m0
    movq    [r1+r2*2], m1
    RET                                 ; restored: dropped in extraction (orig line 115)
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
INIT_XMM avx
PRED4x4_VR
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]               ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2                    ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8]          ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3                   ; l2 l3
    movq       m2, [r0+r2*2-8]          ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3                   ; l0 l1
    punpckhdq  m1, m2                   ; l0 l1 l2 l3
    punpckhqdq m1, m0                   ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4                ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2                ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3
    PRED4x4_LOWPASS m3, m1, m0, m3
    punpcklwd  m5, m3
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET                                 ; restored: dropped in extraction (orig line 155)
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
INIT_XMM avx
PRED4x4_HD
;-----------------------------------------------------------------------------
; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
; horizontal add of all dwords in %1 (result in low dword); %2 is scratch
%macro HADDD 2 ; sum junk
%if mmsize == 16
    movhlps %2, %1
    paddd   %1, %2
    pshuflw %2, %1, 0xE
    paddd   %1, %2
%else
    pshufw  %2, %1, 0xE
    paddd   %1, %2
%endif
%endmacro

; horizontal add of all words in %1 (result in low dword); %2 is scratch
%macro HADDW 2
    pmaddwd %1, [pw_1]
    HADDD   %1, %2
%endmacro
INIT_MMX mmxext
; dc = (4 left + 4 top pixels + 4) >> 3, splatted over the 4x4 block
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]
    paddw  m2, [r0+r2*2-8]
    paddw  m2, [r1+r2*1-8]
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48                       ; sum of the 4 left pixels
    movq   m0, [r0]
    HADDW  m0, m1                       ; sum of the 4 top pixels
    paddw  m0, [pw_4]
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET                                 ; restored: dropped in extraction (orig line 204)
;-----------------------------------------------------------------------------
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub     r0, r2
    movq    m0, [r0]                    ; top 4 pixels
    movhps  m0, [r1]                    ; topright 4 pixels in high half
    psrldq  m2, m0, 2
    pslldq  m3, m0, 2
    pshufhw m2, m2, 10100100b           ; duplicate last topright pixel
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea     r1, [r0+r2*2]
    movhps  [r1+r2*2], m0
    psrldq  m0, 2
    movq    [r0+r2*1], m0
    psrldq  m0, 2
    movq    [r0+r2*2], m0
    psrldq  m0, 2
    movq    [r1+r2*1], m0
    RET                                 ; restored: dropped in extraction (orig line 226)
%endmacro

INIT_XMM sse2
PRED4x4_DL
INIT_XMM avx
PRED4x4_DL
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub     r0, r2
    movu    m1, [r0]                    ; top 4 pixels
    movhps  m1, [r1]                    ; topright 4 pixels in high half
    psrldq  m0, m1, 2
    psrldq  m2, m1, 4
    pavgw   m4, m0, m1                  ; even rows: 2-tap average
    PRED4x4_LOWPASS m0, m1, m2, m0      ; odd rows: 3-tap lowpass
    lea     r1, [r0+r2*2]
    movq    [r0+r2*1], m4
    movq    [r0+r2*2], m0
    psrldq  m4, 2
    psrldq  m0, 2
    movq    [r1+r2*1], m4
    movq    [r1+r2*2], m0
    RET                                 ; restored: dropped in extraction (orig line 253)
%endmacro

INIT_XMM sse2
PRED4x4_VL
INIT_XMM avx
PRED4x4_VL
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1                    ; l0 l1 l2 l3
    pshufw    m1, m1, 0xFF              ; broadcast last left pixel
    movq      [r1+r2*2], m1
    movd      [r1+r2*1+4], m1
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1
    movq      m6, m2
    punpcklwd m6, m1                    ; interleave averages and lowpass
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2
    RET                                 ; restored: dropped in extraction (orig line 291)
;-----------------------------------------------------------------------------
; void pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
; copy the 8 top pixels into all 8 rows
cglobal pred8x8_vertical_10, 2, 2
    sub  r0, r1
    mova m0, [r0]
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET                                 ; restored: dropped in extraction (orig line 309)
;-----------------------------------------------------------------------------
; void pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
; fill each row with its leftmost neighbour pixel
cglobal pred8x8_horizontal_10, 2, 3
    mov        r2d, 4                   ; 4 iterations x 2 rows
.loop:
    movq       m0, [r0+r1*0-8]
    movq       m1, [r0+r1*1-8]
    pshuflw    m0, m0, 0xff             ; broadcast left pixel to low 4 words
    pshuflw    m1, m1, 0xff
    punpcklqdq m0, m0                   ; ... and to all 8 words
    punpcklqdq m1, m1
    mova       [r0+r1*0], m0
    mova       [r0+r1*1], m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
;-----------------------------------------------------------------------------
; void predict_8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; store 16 bytes: two mmx regs (%2/%3) or one xmm reg (%2; %3 ignored)
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq  [%1+0], %2
    movq  [%1+8], %3
%else
    movdqa [%1], %2
%endif
%endmacro
; %1 = word-shuffle instruction (pshufw for mmx, pshuflw for sse2)
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub       r0, r1
    pxor      m4, m4
    movq      m0, [r0+0]
    movq      m1, [r0+8]
%if mmsize==16
    punpcklwd m0, m1
    movhlps   m1, m0
    paddw     m0, m1
%else
    pshufw    m2, m0, 00001110b
    pshufw    m3, m1, 00001110b
    paddw     m0, m2
    paddw     m1, m3
    punpcklwd m0, m1
%endif
    %1        m2, m0, 00001110b
    paddw     m0, m2                    ; s0 (top-left 4), s1 (top-right 4)

    ; sum the 8 left pixels, 4 at a time, via scalar loads
    lea       r5, [r1*3]
    lea       r4, [r0+r1*4]
    movzx    r2d, word [r0+r1*1-2]
    movzx    r3d, word [r0+r1*2-2]
    add      r2d, r3d
    movzx    r3d, word [r0+r5*1-2]
    add      r2d, r3d
    movzx    r3d, word [r4-2]
    add      r2d, r3d
    movd      m2, r2d                   ; s2

    movzx    r2d, word [r4+r1*1-2]
    movzx    r3d, word [r4+r1*2-2]
    add      r2d, r3d
    movzx    r3d, word [r4+r5*1-2]
    add      r2d, r3d
    movzx    r3d, word [r4+r1*4-2]
    add      r2d, r3d
    movd      m3, r2d                   ; s3

    punpcklwd m2, m3
    punpckldq m0, m2                    ; s0, s1, s2, s3
    %1        m3, m0, 11110110b         ; s2, s1, s3, s3
    %1        m0, m0, 01110100b         ; s0, s1, s3, s1
    paddw     m0, m3
    psrlw     m0, 2
    pavgw     m0, m4                    ; s0+s2, s1, s3, s1+s3
%if mmsize==16
    punpcklwd m0, m0
    pshufd    m3, m0, 11111010b
    punpckldq m0, m0
    SWAP 0,1
%else
    pshufw    m1, m0, 0x00
    pshufw    m2, m0, 0x55
    pshufw    m3, m0, 0xaa
    pshufw    m4, m0, 0xff
%endif
    MOV8 r0+r1*1, m1, m2
    MOV8 r0+r1*2, m1, m2
    MOV8 r0+r5*1, m1, m2
    MOV8 r0+r1*4, m1, m2
    MOV8 r4+r1*1, m3, m4
    MOV8 r4+r1*2, m3, m4
    MOV8 r4+r5*1, m3, m4
    MOV8 r4+r1*4, m3, m4
    RET                                 ; restored: dropped in extraction (orig line 410)
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw
;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
; dc per 4-pixel half of the top row, splatted over each half of all rows
cglobal pred8x8_top_dc_10, 2, 4
    sub     r0, r1
    mova    m0, [r0]
    pshuflw m1, m0, 0x4e
    pshufhw m1, m1, 0x4e
    paddw   m0, m1
    pshuflw m1, m0, 0xb1
    pshufhw m1, m1, 0xb1
    paddw   m0, m1                      ; each half now holds its 4-pixel sum
    lea     r2, [r1*3]
    lea     r3, [r0+r1*4]
    paddw   m0, [pw_2]
    psrlw   m0, 2
    mova    [r0+r1*1], m0
    mova    [r0+r1*2], m0
    mova    [r0+r2*1], m0
    mova    [r0+r1*4], m0
    mova    [r3+r1*1], m0
    mova    [r3+r1*2], m0
    mova    [r3+r2*1], m0
    mova    [r3+r1*4], m0
    RET                                 ; restored: dropped in extraction (orig line 443)
;-----------------------------------------------------------------------------
; void pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14
    psubw     m2, m0                    ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                     ; 16*(src[7*stride-1] + src[-stride+7])
    movzx    r4d, word [r3+r1*1-2]      ; src[4*stride-1]
    movzx    r5d, word [r0+r2*1-2]      ; src[2*stride-1]
    sub      r4d, r5d
    movzx    r6d, word [r3+r1*2-2]      ; src[5*stride-1]
    movzx    r5d, word [r0+r1*2-2]      ; src[1*stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*2]
    movzx    r5d, word [r3+r2*1-2]      ; src[6*stride-1]
    movzx    r6d, word [r0+r1*1-2]      ; src[0*stride-1]
    sub      r5d, r6d
    lea      r5d, [r5*3]
    add      r4d, r5d
    movzx    r6d, word [r3+r1*4-2]      ; src[7*stride-1]
    movzx    r5d, word [r0+r1*0-2]      ; src[ -stride-1]
    sub      r6d, r5d
    lea      r4d, [r4+r6*4]
    movd      m3, r4d                   ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                     ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]        ; b
    pmullw    m5, m4, [pw_m3]           ; c
    paddw     m5, [pw_16]
    mov      r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3                ; clamp to [0, pixel_max]
    mova    [r0], m6
    paddw     m5, m4
    add       r0, r1
    dec      r2d
    jg .loop
    REP_RET
;-----------------------------------------------------------------------------
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova m0, [pw_512]                   ; (1<<(BIT_DEPTH-1))
    lea  r1, [r3*3]
    lea  r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET                                 ; restored: dropped in extraction (orig line 524)
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC
;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub    r0, r3
    mova   m0, [r0]
    ; r1/r2 become 0 or -2/2: pick edge pixel or its duplicate depending on
    ; has_topleft / has_topright
    shr   r1d, 14
    shr   r2d, 13
    neg    r1
    pslldq m1, m0, 2
    psrldq m2, m0, 2
    pinsrw m1, [r0+r1], 0
    pinsrw m2, [r0+r2+14], 7
    lea    r1, [r3*3]
    lea    r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW  m0, m1
    paddw  m0, [pw_4]
    psrlw  m0, 3
    SPLATW m0, m0, 0
    mova   [r0+r3*1], m0
    mova   [r0+r3*2], m0
    mova   [r0+r1*1], m0
    mova   [r0+r3*4], m0
    mova   [r2+r3*1], m0
    mova   [r2+r3*2], m0
    mova   [r2+r1*1], m0
    mova   [r2+r3*4], m0
    RET                                 ; restored: dropped in extraction (orig line 561)
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
INIT_XMM avx
PRED8x8L_TOP_DC
;-----------------------------------------------------------------------------
;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r5, [r3*3]
    ; gather the 8 left pixels into m3
    mova       m0, [r0+r3*2-16]
    punpckhwd  m0, [r0+r3*1-16]
    mova       m1, [r4+r3*0-16]
    punpckhwd  m1, [r0+r5*1-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*2-16]
    punpckhwd  m2, [r4+r3*1-16]
    mova       m3, [r4+r3*4-16]
    punpckhwd  m3, [r4+r5*1-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r0]
    shr       r1d, 14
    shr       r2d, 13
    neg        r1
    pslldq     m1, m0, 2
    psrldq     m2, m0, 2
    pinsrw     m1, [r0+r1], 0
    pinsrw     m2, [r0+r2+14], 7
    not        r1
    and        r1, r3
    pslldq     m4, m3, 2
    psrldq     m5, m3, 2
    pshuflw    m4, m4, 11100101b
    pinsrw     m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3      ; filtered left column
    PRED4x4_LOWPASS m0, m2, m1, m0      ; filtered top row
    paddw      m0, m3
    HADDW      m0, m1
    paddw      m0, [pw_8]
    psrlw      m0, 4                    ; dc = (sum + 8) >> 4
    SPLATW     m0, m0
    mova       [r0+r3*1], m0
    mova       [r0+r3*2], m0
    mova       [r0+r5*1], m0
    mova       [r0+r3*4], m0
    mova       [r4+r3*1], m0
    mova       [r4+r3*2], m0
    mova       [r4+r5*1], m0
    mova       [r4+r3*4], m0
    RET                                 ; restored: dropped in extraction (orig line 618)
%endmacro

INIT_XMM sse2
PRED8x8L_DC
INIT_XMM avx
PRED8x8L_DC
;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub    r0, r3
    mova   m0, [r0]
    shr   r1d, 14
    shr   r2d, 13
    neg    r1
    pslldq m1, m0, 2
    psrldq m2, m0, 2
    pinsrw m1, [r0+r1], 0
    pinsrw m2, [r0+r2+14], 7
    lea    r1, [r3*3]
    lea    r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0      ; filtered top row, copied to all rows
    mova   [r0+r3*1], m0
    mova   [r0+r3*2], m0
    mova   [r0+r1*1], m0
    mova   [r0+r3*4], m0
    mova   [r2+r3*1], m0
    mova   [r2+r3*2], m0
    mova   [r2+r1*1], m0
    mova   [r2+r3*4], m0
    RET                                 ; restored: dropped in extraction (orig line 651)
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
INIT_XMM avx
PRED8x8L_VERTICAL
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova       m0, [r0-16]
    shr       r1d, 14
    dec        r1
    and        r1, r3
    sub        r1, r3                   ; r1 = -stride if has_topleft else 0
    punpckhwd  m0, [r0+r1-16]
    mova       m1, [r0+r3*2-16]
    punpckhwd  m1, [r0+r3*1-16]
    lea        r2, [r0+r3*4]
    lea        r1, [r3*3]
    punpckhdq  m1, m0
    mova       m2, [r2+r3*0-16]
    punpckhwd  m2, [r0+r1-16]
    mova       m3, [r2+r3*2-16]
    punpckhwd  m3, [r2+r3*1-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1                   ; the 8 left pixels
    PALIGNR    m4, m3, [r2+r1-16], 14, m0
    pslldq     m0, m4, 2
    pshuflw    m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4
    ; broadcast each filtered left pixel across its row
    punpckhwd  m3, m4, m4
    punpcklwd  m4, m4
    pshufd     m0, m3, 0xff
    pshufd     m1, m3, 0xaa
    pshufd     m2, m3, 0x55
    pshufd     m3, m3, 0x00
    mova       [r0+r3*0], m0
    mova       [r0+r3*1], m1
    mova       [r0+r3*2], m2
    mova       [r0+r1*1], m3
    pshufd     m0, m4, 0xff
    pshufd     m1, m4, 0xaa
    pshufd     m2, m4, 0x55
    pshufd     m3, m4, 0x00
    mova       [r2+r3*0], m0
    mova       [r2+r3*1], m1
    mova       [r2+r3*2], m2
    mova       [r2+r1*1], m3
    RET                                 ; restored: dropped in extraction (orig line 703)
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
;-----------------------------------------------------------------------------
;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub     r0, r3
    mova    m3, [r0]
    shr    r1d, 14
    neg     r1
    shr    r2d, 13
    pslldq  m1, m3, 2
    psrldq  m2, m3, 2
    pinsrw  m1, [r0+r1], 0
    pinsrw  m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3
    jz .fix_tr ; flags from shr r2d
    mova    m1, [r0+16]
    psrldq  m5, m1, 2
    PALIGNR m2, m1, m3, 14, m3
    pshufhw m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
    lea     r1, [r3*3]
    psrldq  m5, m1, 14
    lea     r2, [r0+r3*4]
    PALIGNR m2, m1, m6, 2, m0
    PALIGNR m3, m1, m6, 14, m0
    PALIGNR m5, m1, 2, m0
    pslldq  m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    ; write rows bottom-up, shifting the filtered edge one pixel per row
    mova    [r2+r3*4], m1
    PALIGNR m1, m6, 14, m2
    pslldq  m6, 2
    mova    [r2+r1*1], m1
    PALIGNR m1, m6, 14, m2
    pslldq  m6, 2
    mova    [r2+r3*2], m1
    PALIGNR m1, m6, 14, m2
    pslldq  m6, 2
    mova    [r2+r3*1], m1
    PALIGNR m1, m6, 14, m2
    pslldq  m6, 2
    mova    [r0+r3*4], m1
    PALIGNR m1, m6, 14, m2
    pslldq  m6, 2
    mova    [r0+r1*1], m1
    PALIGNR m1, m6, 14, m2
    pslldq  m6, 2
    mova    [r0+r3*2], m1
    PALIGNR m1, m6, 14, m6
    mova    [r0+r3*1], m1
    RET    ; restored: dropped in extraction (orig 765); without it we fall into .fix_tr
.fix_tr:
    ; no topright: replicate the last top pixel
    punpckhwd m3, m3
    pshufd    m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
INIT_XMM avx
PRED8x8L_DOWN_LEFT
;-----------------------------------------------------------------------------
;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r1, [r3*3]
    ; gather the 8 left pixels (top to bottom) into m3
    mova       m0, [r0+r3*1-16]
    punpckhwd  m0, [r0+r3*0-16]
    mova       m1, [r0+r1*1-16]
    punpckhwd  m1, [r0+r3*2-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*1-16]
    punpckhwd  m2, [r4+r3*0-16]
    mova       m3, [r4+r1*1-16]
    punpckhwd  m3, [r4+r3*2-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r4+r3*4-16]
    mova       m1, [r0]
    PALIGNR    m4, m3, m0, 14, m0
    PALIGNR    m1, m3, 2, m2
    pslldq     m0, m4, 2
    pshuflw    m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova       m3, [r0]
    shr       r2d, 13
    pslldq     m1, m3, 2
    psrldq     m2, m3, 2
    pinsrw     m1, [r0-2], 0
    pinsrw     m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3
    PALIGNR    m2, m3, m6, 2, m0
    PALIGNR    m5, m3, m6, 14, m0
    psrldq     m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    ; write rows, shifting the diagonal one pixel per row
    mova       [r4+r3*4], m6
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*2], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r1*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r0+r3*4], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r4+r3*1], m3
    PALIGNR    m3, m6, 14, m2
    pslldq     m6, 2
    mova       [r4+r3*2], m3
    PALIGNR    m3, m6, 14, m6
    mova       [r4+r1*1], m3
    RET                                 ; restored: dropped in extraction (orig line 841)
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub        r0, r3
    lea        r4, [r0+r3*4]
    lea        r1, [r3*3]
    ; gather the 8 left pixels into m3
    mova       m0, [r0+r3*1-16]
    punpckhwd  m0, [r0+r3*0-16]
    mova       m1, [r0+r1*1-16]
    punpckhwd  m1, [r0+r3*2-16]
    punpckhdq  m1, m0
    mova       m2, [r4+r3*1-16]
    punpckhwd  m2, [r4+r3*0-16]
    mova       m3, [r4+r1*1-16]
    punpckhwd  m3, [r4+r3*2-16]
    punpckhdq  m3, m2
    punpckhqdq m3, m1
    mova       m0, [r4+r3*4-16]
    mova       m1, [r0]
    PALIGNR    m4, m3, m0, 14, m0
    PALIGNR    m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3
    mova       m2, [r0]
    shr       r2d, 13
    pslldq     m1, m2, 2
    psrldq     m5, m2, 2
    pinsrw     m1, [r0-2], 0
    pinsrw     m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2
    PALIGNR    m6, m2, m3, 12, m1
    PALIGNR    m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5
    pavgw      m2, m5
    mova       [r0+r3*2], m0
    mova       [r0+r3*1], m2
    pslldq     m6, m3, 4
    pslldq     m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    ; remaining rows: shift in one filtered left pixel per row pair
    PALIGNR    m2, m1, 14, m4
    mova       [r0+r1*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m3
    mova       [r0+r3*4], m0
    pslldq     m1, 2
    PALIGNR    m2, m1, 14, m4
    mova       [r4+r3*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m3
    mova       [r4+r3*2], m0
    pslldq     m1, 2
    PALIGNR    m2, m1, 14, m4
    mova       [r4+r1*1], m2
    pslldq     m1, 2
    PALIGNR    m0, m1, 14, m1
    mova       [r4+r3*4], m0
    RET                                 ; restored: dropped in extraction (orig line 909)
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova       m0, [r0+r3*0-16]
    punpckhwd  m0, [r0+r3*1-16]
    shr       r1d, 14
    dec        r1
    and        r1, r3
    sub        r1, r3                   ; r1 = -stride if has_topleft else 0
    mova       m4, [r0+r1*1-16]
    lea        r1, [r3*3]
    lea        r2, [r0+r3*4]
    mova       m1, [r0+r3*2-16]
    punpckhwd  m1, [r0+r1*1-16]
    punpckhdq  m0, m1
    mova       m2, [r2+r3*0-16]
    punpckhwd  m2, [r2+r3*1-16]
    mova       m3, [r2+r3*2-16]
    punpckhwd  m3, [r2+r1*1-16]
    punpckhdq  m2, m3
    punpckhqdq m0, m2                   ; the 8 left pixels
    PALIGNR    m1, m0, m4, 14, m4
    psrldq     m2, m0, 2
    pshufhw    m2, m2, 10100100b        ; duplicate the last left pixel
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq     m1, m0, 2
    psrldq     m2, m0, 4
    pshufhw    m1, m1, 10100100b
    pshufhw    m2, m2, 01010100b
    pavgw      m4, m0, m1
    PRED4x4_LOWPASS m1, m2, m0, m1
    punpckhwd  m5, m4, m1               ; interleave averages and lowpass
    punpcklwd  m4, m1
    mova       [r2+r3*0], m5
    mova       [r0+r3*0], m4
    pshufd     m0, m5, 11111001b
    pshufd     m1, m5, 11111110b
    pshufd     m2, m5, 11111111b
    mova       [r2+r3*1], m0
    mova       [r2+r3*2], m1
    mova       [r2+r1*1], m2
    PALIGNR    m2, m5, m4, 4, m0
    PALIGNR    m3, m5, m4, 8, m1
    PALIGNR    m5, m5, m4, 12, m4
    mova       [r0+r3*1], m2
    mova       [r0+r3*2], m3
    mova       [r0+r1*1], m5
    RET                                 ; restored: dropped in extraction (orig line 968)
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
;-----------------------------------------------------------------------------
; void pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
; store a full 32-byte row: two xmm regs (%2/%3) or four mmx regs (%2-%5)
%macro MOV16 3-5
    mova [%1+ 0], %2
    mova [%1+mmsize], %3
%if mmsize==8
    mova [%1+ 16], %4
    mova [%1+ 24], %5
%endif
%endmacro
; copy the 16 top pixels into all 16 rows
%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub  r0, r1
    mov r2d, 8
    mova m0, [r0+ 0]
    mova m1, [r0+mmsize]
%if mmsize==8
    mova m2, [r0+16]
    mova m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea  r0, [r0+r1*2]
    dec r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL
;-----------------------------------------------------------------------------
; void pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
; fill each row with its leftmost neighbour pixel
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov r2d, 8
.vloop:
    movd   m0, [r0+r1*0-4]
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec   r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL
;-----------------------------------------------------------------------------
; void pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2                   ; sum of the 16 top pixels

    ; sum the 16 left pixels with scalar loads
    lea        r0, [r0+r1-2]
    movzx     r3d, word [r0]
    movzx     r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx     r2d, word [r0]
    add       r3d, r2d
    movzx     r2d, word [r0+r1]
    add       r4d, r2d
%endrep
    lea       r3d, [r3+r4+16]           ; + rounding

    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5                    ; dc = (sum + 16) >> 5
    SPLATW     m0, m0
    mov       r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec       r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
;-----------------------------------------------------------------------------
; void pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub    r0, r1
    mova   m0, [r0+0]
    paddw  m0, [r0+mmsize]
%if mmsize==8
    paddw  m0, [r0+16]
    paddw  m0, [r0+24]
%endif
    HADDW  m0, m2                       ; sum of the 16 top pixels

    ; splat before +8/>>4: all lanes are equal, so the result is identical
    SPLATW m0, m0
    paddw  m0, [pw_8]
    psrlw  m0, 4
    mov   r2d, 8
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea    r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC
;-----------------------------------------------------------------------------
; void pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov    r5, r0

    ; sum the 16 left pixels with scalar loads
    sub    r0, 2
    movzx r3d, word [r0]
    movzx r4d, word [r0+r1]
%rep 7
    lea    r0, [r0+r1*2]
    movzx r2d, word [r0]
    add   r3d, r2d
    movzx r2d, word [r0+r1]
    add   r4d, r2d
%endrep
    lea   r3d, [r3+r4+8]
    shr   r3d, 4                        ; dc = (sum + 8) >> 4

    movd   m0, r3d
    SPLATW m0, m0
    mov   r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea    r5, [r5+r1*2]
    dec   r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC
;-----------------------------------------------------------------------------
; void pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
; fill the block with mid-grey (512 for 10-bit)
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
    mova  m0, [pw_512]
    mov  r2d, 8
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea   r0, [r0+r1*2]
    dec  r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC