1 ;*****************************************************************************
2 ;* MMX/SSE2-optimized H.264 deblocking code
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
8 ;* This program is free software; you can redistribute it and/or modify
9 ;* it under the terms of the GNU General Public License as published by
10 ;* the Free Software Foundation; either version 2 of the License, or
11 ;* (at your option) any later version.
13 ;* This program is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ;* GNU General Public License for more details.
18 ;* You should have received a copy of the GNU General Public License
19 ;* along with this program; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;*****************************************************************************
; 16-byte (SSE-width) byte-constant vectors used by the deblock filters below.
pb_00: times 16 db 0x00   ; all-zero bytes
pb_01: times 16 db 0x01   ; per-byte 1 (used as the LSB/parity mask and +1 rounding)
pb_03: times 16 db 0x03   ; per-byte 3 (rounding term in the pavgb delta computation)
pb_a1: times 16 db 0xa1   ; per-byte 0xa1 = 128+33, bias subtracted in DEBLOCK_P0_Q0
; expands to [base],...,[base+7*stride]
; i.e. the 8 consecutive row addresses, 4 expressed relative to base and 4
; relative to base3 (callers pass base3 = base+3*stride and stride3 = 3*stride,
; keeping every displacement encodable with one index register)
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
38 ; in: 8 rows of 4 bytes in %1..%8
39 ; out: 4 rows of 8 bytes in m0..m3
40 %macro TRANSPOSE4x8_LOAD
8
69 ; in: 4 rows of 8 bytes in m0..m3
70 ; out: 8 rows of 4 bytes in %1..%8
71 %macro TRANSPOSE8x4_STORE
8
111 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
112 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
113 %macro TRANSPOSE6x8_MEM
9
121 SBUTTERFLY bw
, m0
, m1
, m7
122 SBUTTERFLY bw
, m2
, m3
, m1
123 SBUTTERFLY bw
, m4
, m5
, m3
125 SBUTTERFLY bw
, m6
, %8, m5
126 SBUTTERFLY wd
, m0
, m2
, m1
127 SBUTTERFLY wd
, m4
, m6
, m2
130 SBUTTERFLY wd
, m7
, [%9+0x10], m6
131 SBUTTERFLY wd
, m3
, m5
, m4
132 SBUTTERFLY
dq, m7
, m3
, m0
133 SBUTTERFLY
dq, m1
, m2
, m5
142 ; in: 8 rows of 8 in %1..%8
143 ; out: 8 rows of 8 in %9..%16
144 %macro TRANSPOSE8x8_MEM
16
152 SBUTTERFLY bw
, m0
, m1
, m7
153 SBUTTERFLY bw
, m2
, m3
, m1
154 SBUTTERFLY bw
, m4
, m5
, m3
155 SBUTTERFLY bw
, m6
, %8, m5
157 SBUTTERFLY wd
, m0
, m2
, m3
158 SBUTTERFLY wd
, m4
, m6
, m2
159 SBUTTERFLY wd
, m7
, m1
, m6
162 SBUTTERFLY wd
, m2
, m5
, m1
163 SBUTTERFLY
dq, m0
, m4
, m5
164 SBUTTERFLY
dq, m7
, m2
, m4
169 SBUTTERFLY
dq, m3
, %11, m0
170 SBUTTERFLY
dq, m6
, m1
, m5
177 ; out: %4 = |%1-%2|>%3
188 ; out: %4 = |%1-%2|>%3
209 ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
210 ; out: m5=beta-1, m7=mask, %3=alpha-1
217 packuswb m4
, m4
; 16x alpha-1
218 packuswb m5
, m5
; 16x beta-1
222 DIFF_GT m1
, m2
, m4
, m7
, m6
; |p0-q0| > alpha-1
223 DIFF_GT m0
, m1
, m5
, m4
, m6
; |p1-p0| > beta-1
225 DIFF_GT m3
, m2
, m5
, m4
, m6
; |q1-q0| > beta-1
231 ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
234 %macro DEBLOCK_P0_Q0
0
237 pand m5
, [pb_01
GLOBAL] ; (p0^q0)&1
240 pavgb m3
, m0
; (p1 - q1 + 256)>>1
241 pavgb m3
, [pb_03
GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
243 pavgb m4
, m2
; (q0 - p0 + 256)>>1
245 paddusb m3
, m4
; d+128+33
246 mova m6
, [pb_a1
GLOBAL]
248 psubusb m3
, [pb_a1
GLOBAL]
258 ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
259 ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
260 ; clobbers: q2, tmp, tc0
264 pavgb
%2, %6 ; avg(p2,avg(p0,q0))
266 pand
%6, [pb_01
GLOBAL] ; (p2^avg(p0,q0))&1
267 psubusb
%2, %6 ; (p2+((p0+q0+1)>>1))>>1
277 ;-----------------------------------------------------------------------------
278 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
279 ;-----------------------------------------------------------------------------
281 cglobal x264_deblock_v_luma_sse2
, 5,5,10
287 add r4
, r0
; pix-3*stride
289 mova m0
, [r4
+r1
] ; p1
290 mova m1
, [r4
+2*r1
] ; p0
292 mova m3
, [r0
+r1
] ; q1
296 punpcklbw m8
, m8
; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
303 DIFF_GT2 m1
, m3
, m5
, m6
, m7
; |p2-p0| > beta-1
308 LUMA_Q1 m0
, m3
, [r4
], [r4
+r1
], m6
, m4
310 movdqa m4
, [r0
+2*r1
] ; q2
311 DIFF_GT2 m2
, m4
, m5
, m6
, m3
; |q2-q0| > beta-1
316 LUMA_Q1 m3
, m4
, [r0
+2*r1
], [r0
+r1
], m8
, m6
323 ;-----------------------------------------------------------------------------
324 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
325 ;-----------------------------------------------------------------------------
327 cglobal x264_deblock_h_luma_sse2
, 5,7
334 %define pix_tmp rsp
+0x30
340 ; transpose 6x16 -> tmp space
341 TRANSPOSE6x8_MEM PASS8ROWS
(r6
, r5
, r10
, r11
), pix_tmp
344 TRANSPOSE6x8_MEM PASS8ROWS
(r6
, r5
, r10
, r11
), pix_tmp
+8
347 ; alpha, beta, tc0 are still in r2d, r3d, r4
348 ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
349 lea r0
, [pix_tmp
+0x30]
354 call x264_deblock_v_luma_sse2
356 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
359 movq m0
, [pix_tmp
+0x18]
360 movq m1
, [pix_tmp
+0x28]
361 movq m2
, [pix_tmp
+0x38]
362 movq m3
, [pix_tmp
+0x48]
363 TRANSPOSE8x4_STORE PASS8ROWS
(r6
, r5
, r10
, r11
)
369 movq m0
, [pix_tmp
+0x10]
370 movq m1
, [pix_tmp
+0x20]
371 movq m2
, [pix_tmp
+0x30]
372 movq m3
, [pix_tmp
+0x40]
373 TRANSPOSE8x4_STORE PASS8ROWS
(r6
, r5
, r10
, r11
)
384 %macro DEBLOCK_LUMA
3
385 ;-----------------------------------------------------------------------------
386 ; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
387 ;-----------------------------------------------------------------------------
388 cglobal x264_deblock_
%2_luma_
%1, 5,5
393 add r4
, r0
; pix-3*stride
394 %assign pad
2*%3+12-(stack_offset
&15)
397 mova m0
, [r4
+r1
] ; p1
398 mova m1
, [r4
+2*r1
] ; p0
400 mova m3
, [r0
+r1
] ; q1
406 punpcklbw m4
, m4
; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
407 mova
[esp+%3], m4
; tc
411 mova
[esp], m4
; mask
414 DIFF_GT2 m1
, m3
, m5
, m6
, m7
; |p2-p0| > beta-1
416 pand m4
, [esp+%3] ; tc
420 LUMA_Q1 m0
, m3
, [r4
], [r4
+r1
], m6
, m4
422 mova m4
, [r0
+2*r1
] ; q2
423 DIFF_GT2 m2
, m4
, m5
, m6
, m3
; |q2-q0| > beta-1
424 mova m5
, [esp] ; mask
426 mova m5
, [esp+%3] ; tc
430 LUMA_Q1 m3
, m4
, [r0
+2*r1
], [r0
+r1
], m5
, m6
438 ;-----------------------------------------------------------------------------
439 ; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
440 ;-----------------------------------------------------------------------------
442 cglobal x264_deblock_h_luma_
%1, 0,5
448 %assign pad
0x78-(stack_offset
&15)
450 %define pix_tmp
esp+12
452 ; transpose 6x16 -> tmp space
453 TRANSPOSE6x8_MEM PASS8ROWS
(r0
, r1
, r3
, r4
), pix_tmp
456 TRANSPOSE6x8_MEM PASS8ROWS
(r0
, r1
, r3
, r4
), pix_tmp
+8
459 lea r0
, [pix_tmp
+0x30]
465 call x264_deblock_
%2_luma_
%1
467 add dword [esp ], 8 ; pix_tmp+0x38
468 add dword [esp+16], 2 ; tc0+2
469 call x264_deblock_
%2_luma_
%1
473 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
478 movq m0
, [pix_tmp
+0x10]
479 movq m1
, [pix_tmp
+0x20]
480 movq m2
, [pix_tmp
+0x30]
481 movq m3
, [pix_tmp
+0x40]
482 TRANSPOSE8x4_STORE PASS8ROWS
(r0
, r1
, r3
, r4
)
486 movq m0
, [pix_tmp
+0x18]
487 movq m1
, [pix_tmp
+0x28]
488 movq m2
, [pix_tmp
+0x38]
489 movq m3
, [pix_tmp
+0x48]
490 TRANSPOSE8x4_STORE PASS8ROWS
(r0
, r1
, r3
, r4
)
494 %endmacro
; DEBLOCK_LUMA
; Instantiate DEBLOCK_LUMA for SSE2: emits x264_deblock_v_luma_sse2 and
; x264_deblock_h_luma_sse2 (%1=isa suffix, %2=vertical-variant name,
; %3=16 = per-spill-slot stack stride used by the macro body)
DEBLOCK_LUMA sse2, v, 16
503 %macro LUMA_INTRA_P012
4 ; p0..p3 in memory
508 pavgb t0
, t1
; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
521 psubb t0
, t2
; p1' = (p2+p1+p0+q0+2)/4;
528 psubb t3
, t2
; p2+2*p1+2*p0+2*q0+q1
532 pavgb t1
, t5
; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
537 psubb t1
, t3
; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
545 pavgb t2
, p1
; p0'b = (2*p1+p0+q0+2)/4
553 mova
%1, t1
; store p0
559 pavgb t1
, t0
; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
561 paddb t2
, t4
; 2*p3+3*p2+p1+p0+q0
566 psubb t1
, t2
; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
574 mova
%2, t0
; store p1
575 mova
%3, t1
; store p2
578 %macro LUMA_INTRA_SWAP_PQ
0
584 %define mask1p mask1q
587 %macro DEBLOCK_LUMA_INTRA
2
603 %define mask1q
[rsp
-24]
607 %define spill
(x
) [esp+16*x
+((stack_offset
+4)&15)]
612 %define mask0 spill
(2)
613 %define mask1p spill
(3)
614 %define mask1q spill
(4)
615 %define mpb_00
[pb_00
GLOBAL]
616 %define mpb_01
[pb_01
GLOBAL]
619 ;-----------------------------------------------------------------------------
620 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
621 ;-----------------------------------------------------------------------------
622 cglobal x264_deblock_
%2_luma_intra_
%1, 4,6,16
627 lea r5
, [r1
*3] ; 3*stride
633 add r4
, r0
; pix-4*stride
640 mova mpb_01
, [pb_01
GLOBAL]
641 LOAD_MASK r2d
, r3d
, t5
; m5=beta-1, t5=alpha-1, m7=mask0
642 SWAP
7, 12 ; m12=mask0
644 pavgb t5
, mpb_01
; alpha/4+1
647 DIFF_GT2 p0
, q0
, t5
, t0
, t3
; t0 = |p0-q0| > alpha/4+1
648 DIFF_GT2 p0
, p2
, m5
, t2
, t5
; mask1 = |p2-p0| > beta-1
649 DIFF_GT2 q0
, q2
, m5
, t4
, t5
; t4 = |q2-q0| > beta-1
656 LOAD_MASK r2d
, r3d
, t5
; m5=beta-1, t5=alpha-1, m7=mask0
659 pavgb m4
, [pb_00
GLOBAL]
660 pavgb m4
, [pb_01
GLOBAL] ; alpha/4+1
661 DIFF_GT2 p0
, q0
, m4
, m6
, m7
; m6 = |p0-q0| > alpha/4+1
663 DIFF_GT2 p0
, p2
, m5
, m4
, m7
; m4 = |p2-p0| > beta-1
666 DIFF_GT2 q0
, q2
, m5
, m4
, m7
; m4 = |q2-q0| > beta-1
670 LUMA_INTRA_P012
[r4
+r5
], [r4
+2*r1
], [r4
+r1
], [r4
]
672 LUMA_INTRA_P012
[r0
], [r0
+r1
], [r0
+2*r1
], [r0
+r5
]
681 ;-----------------------------------------------------------------------------
682 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
683 ;-----------------------------------------------------------------------------
684 cglobal x264_deblock_h_luma_intra_
%1, 4,7
692 ; transpose 8x16 -> tmp space
693 TRANSPOSE8x8_MEM PASS8ROWS
(r6
, r5
, r10
, r11
), PASS8ROWS
(pix_tmp
, pix_tmp
+0x30, 0x10, 0x30)
696 TRANSPOSE8x8_MEM PASS8ROWS
(r6
, r5
, r10
, r11
), PASS8ROWS
(pix_tmp
+8, pix_tmp
+0x38, 0x10, 0x30)
698 lea r0
, [pix_tmp
+0x40]
700 call x264_deblock_v_luma_intra_
%1
702 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
704 TRANSPOSE8x8_MEM PASS8ROWS
(pix_tmp
+8, pix_tmp
+0x38, 0x10, 0x30), PASS8ROWS
(r6
, r5
, r10
, r11
)
709 TRANSPOSE8x8_MEM PASS8ROWS
(pix_tmp
, pix_tmp
+0x30, 0x10, 0x30), PASS8ROWS
(r6
, r5
, r10
, r11
)
713 cglobal x264_deblock_h_luma_intra_
%1, 2,4
717 %assign pad
0x8c-(stack_offset
&15)
721 ; transpose 8x16 -> tmp space
722 TRANSPOSE8x8_MEM PASS8ROWS
(r0
, r2
, r1
, r3
), PASS8ROWS
(pix_tmp
, pix_tmp
+0x30, 0x10, 0x30)
725 TRANSPOSE8x8_MEM PASS8ROWS
(r0
, r2
, r1
, r3
), PASS8ROWS
(pix_tmp
+8, pix_tmp
+0x38, 0x10, 0x30)
727 lea r0
, [pix_tmp
+0x40]
732 call x264_deblock_
%2_luma_intra_
%1
734 add dword [rsp
], 8 ; pix_tmp+8
735 call x264_deblock_
%2_luma_intra_
%1
744 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
745 TRANSPOSE8x8_MEM PASS8ROWS
(pix_tmp
, pix_tmp
+0x30, 0x10, 0x30), PASS8ROWS
(r0
, r2
, r1
, r3
)
748 TRANSPOSE8x8_MEM PASS8ROWS
(pix_tmp
+8, pix_tmp
+0x38, 0x10, 0x30), PASS8ROWS
(r0
, r2
, r1
, r3
)
752 %endmacro
; DEBLOCK_LUMA_INTRA
; Instantiate the intra deblock macro: emits x264_deblock_{v,h}_luma_intra_sse2
; and x264_deblock_{v8,h}_luma_intra_mmxext (%1=isa suffix, %2=vertical name)
DEBLOCK_LUMA_INTRA sse2, v
DEBLOCK_LUMA_INTRA mmxext, v8