;*****************************************************************************
;* MMX/SSE2-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_00: times 16 db 0x00
pb_01: times 16 db 0x01
pb_03: times 16 db 0x03
pb_a1: times 16 db 0xa1

SECTION .text

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
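; (all callers pass base3 = base+3*stride and stride3 = 3*stride)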

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 8
    movd       m0, %1
    movd       m2, %2
    movd       m1, %3
    movd       m3, %4
    punpcklbw  m0, m2
    punpcklbw  m1, m3
    movq       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    movd       m4, %5
    movd       m6, %6
    movd       m5, %7
    movd       m7, %8
    punpcklbw  m4, m6
    punpcklbw  m5, m7
    movq       m6, m4
    punpcklwd  m4, m5
    punpckhwd  m6, m5

    movq       m1, m0
    movq       m3, m2
    punpckldq  m0, m4
    punpckhdq  m1, m4
    punpckldq  m2, m6
    punpckhdq  m3, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4_STORE 8
    movq       m4, m0
    movq       m5, m1
    movq       m6, m2
    punpckhdq  m4, m4
    punpckhdq  m5, m5
    punpckhdq  m6, m6

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    movq       m1, m0
    punpcklwd  m0, m2
    punpckhwd  m1, m2
    movd       %1, m0
    punpckhdq  m0, m0
    movd       %2, m0
    movd       %3, m1
    punpckhdq  m1, m1
    movd       %4, m1

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    movq       m5, m4
    punpcklwd  m4, m6
    punpckhwd  m5, m6
    movd       %5, m4
    punpckhdq  m4, m4
    movd       %6, m4
    movd       %7, m5
    punpckhdq  m5, m5
    movd       %8, m5
%endmacro
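
; SBUTTERFLY: one interleave (butterfly) step of a transpose.
; in: %1 = element size (bw/wd/dq), %2/%3 = sources, %4 = scratch
; out: %2 = low halves of %2 and %3 interleaved, %4 = high halves interleaved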
%macro SBUTTERFLY 4
    movq       %4, %2
    punpckl%1  %2, %3
    punpckh%1  %4, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    movq       m0, %1
    movq       m1, %2
    movq       m2, %3
    movq       m3, %4
    movq       m4, %5
    movq       m5, %6
    movq       m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    movq       [%9+0x10], m1
    SBUTTERFLY bw, m6, %8, m5
    SBUTTERFLY wd, m0, m2, m1
    SBUTTERFLY wd, m4, m6, m2
    punpckhdq  m0, m4
    movq       [%9+0x00], m0
    SBUTTERFLY wd, m7, [%9+0x10], m6
    SBUTTERFLY wd, m3, m5, m4
    SBUTTERFLY dq, m7, m3, m0
    SBUTTERFLY dq, m1, m2, m5
    punpckldq  m6, m4
    movq       [%9+0x10], m1
    movq       [%9+0x20], m5
    movq       [%9+0x30], m7
    movq       [%9+0x40], m0
    movq       [%9+0x50], m6
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    movq       m0, %1
    movq       m1, %2
    movq       m2, %3
    movq       m3, %4
    movq       m4, %5
    movq       m5, %6
    movq       m6, %7
    SBUTTERFLY bw, m0, m1, m7
    SBUTTERFLY bw, m2, m3, m1
    SBUTTERFLY bw, m4, m5, m3
    SBUTTERFLY bw, m6, %8, m5
    movq       %9, m3
    SBUTTERFLY wd, m0, m2, m3
    SBUTTERFLY wd, m4, m6, m2
    SBUTTERFLY wd, m7, m1, m6
    movq       %11, m2
    movq       m2, %9
    SBUTTERFLY wd, m2, m5, m1
    SBUTTERFLY dq, m0, m4, m5
    SBUTTERFLY dq, m7, m2, m4
    movq       %9, m0
    movq       %10, m5
    movq       %13, m7
    movq       %14, m4
    SBUTTERFLY dq, m3, %11, m0
    SBUTTERFLY dq, m6, m1, m5
    movq       %11, m3
    movq       %12, m0
    movq       %15, m6
    movq       %16, m5
%endmacro
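; The *_MEM transposes above move 8-pixel-wide columns between the strided
; picture buffer and a contiguous tmp buffer (0x10 bytes per row), so the
; horizontal (h_) deblock functions below can transpose, run the vertical
; filter on the tmp buffer, and transpose the changed rows back.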

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
    mova       %5, %2
    mova       %4, %1
    psubusb    %5, %1
    psubusb    %4, %2
    por        %4, %5
    psubusb    %4, %3
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
    mova       %5, %2
    mova       %4, %1
    psubusb    %5, %1
    psubusb    %4, %2
    psubusb    %5, %3
    psubusb    %4, %3
    pcmpeqb    %4, %5
%endmacro
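; Note: psubusb saturates at zero, so or-ing the two one-sided differences in
; DIFF_GT yields |%1-%2|; after subtracting %3 the result is nonzero exactly
; where |%1-%2| > %3. DIFF_GT2 instead compares the two thresholded halves
; with pcmpeqb, so its output is 0xff where the threshold is NOT exceeded;
; callers use that directly as "this filter condition holds".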

%macro SPLATW 1
%ifidn m0, xmm0
    pshuflw    %1, %1, 0
    punpcklqdq %1, %1
%else
    pshufw     %1, %1, 0
%endif
%endmacro
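; SPLATW broadcasts the low word of %1 to every word lane; the %ifidn test
; picks the SSE2 form (pshuflw+punpcklqdq) when the m* registers are mapped
; to xmm registers and the MMX form (pshufw) otherwise.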

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd       m4, %1
    movd       m5, %2
    SPLATW     m4
    SPLATW     m5
    packuswb   m4, m4           ; 16x alpha-1
    packuswb   m5, m5           ; 16x beta-1
%if %0>2
    mova       %3, m4
%endif
    DIFF_GT    m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT    m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por        m7, m4
    DIFF_GT    m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por        m7, m4
    pxor       m6, m6
    pcmpeqb    m7, m6
%endmacro
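; The resulting m7 is 0xff in every byte where all three H.264 edge conditions
; hold (|p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta), and 0x00 where the
; edge is left unfiltered.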

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    mova       m5, m1
    pxor       m5, m2             ; p0^q0
    pand       m5, [pb_01 GLOBAL] ; (p0^q0)&1
    pcmpeqb    m4, m4
    pxor       m3, m4
    pavgb      m3, m0             ; (p1 - q1 + 256)>>1
    pavgb      m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pxor       m4, m1
    pavgb      m4, m2             ; (q0 - p0 + 256)>>1
    pavgb      m3, m5
    paddusb    m3, m4             ; d+128+33
    mova       m6, [pb_a1 GLOBAL]
    psubusb    m6, m3
    psubusb    m3, [pb_a1 GLOBAL]
    pminub     m6, m7
    pminub     m3, m7
    psubusb    m1, m6
    psubusb    m2, m3
    paddusb    m1, m3
    paddusb    m2, m6
%endmacro
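; This is the standard H.264 normal-strength (bS < 4) p0/q0 update:
;   delta = clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )
;   p0 += delta,  q0 -= delta
; evaluated entirely in unsigned bytes: the pavgb chain builds the delta
; biased by 128+33 (hence pb_a1 = 0xa1), the two psubusb against pb_a1 split
; it into its positive and negative magnitudes, pminub against m7 (tc & mask)
; performs the clip, and the final saturating adds/subs also give the [0,255]
; pixel clamp.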

; in: m1=p0 m2=q0
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    mova       %6, m1
    pavgb      %6, m2
    pavgb      %2, %6             ; avg(p2,avg(p0,q0))
    pxor       %6, %3
    pand       %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
    psubusb    %2, %6             ; (p2+((p0+q0+1)>>1))>>1
    mova       %6, %1
    psubusb    %6, %5
    paddusb    %5, %1
    pmaxub     %2, %6
    pminub     %2, %5
    mova       %4, %2
%endmacro
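; pavgb rounds up, so the pxor/pand/psubusb sequence corrects the second
; average down by ((q2 ^ avg(p0,q0)) & 1), giving an exact
; (q2 + ((p0+q0+1)>>1)) >> 1 before the clip to [q1-tc0, q1+tc0].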

%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal x264_deblock_v_luma_sse2, 5,5,10
    movd       m8, [r4]          ; tc0
    lea        r4, [r1*3]
    dec        r2d               ; alpha-1
    neg        r4
    dec        r3d               ; beta-1
    add        r4, r0            ; pix-3*stride

    mova       m0, [r4+r1]       ; p1
    mova       m1, [r4+2*r1]     ; p0
    mova       m2, [r0]          ; q0
    mova       m3, [r0+r1]       ; q1
    LOAD_MASK  r2d, r3d

    punpcklbw  m8, m8
    punpcklbw  m8, m8            ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb    m9, m9
    pcmpeqb    m9, m8
    pandn      m9, m7
    pand       m8, m9

    movdqa     m3, [r4]          ; p2
    DIFF_GT2   m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand       m6, m9
    mova       m7, m8
    psubb      m7, m6
    pand       m6, m8
    LUMA_Q1    m0, m3, [r4], [r4+r1], m6, m4

    movdqa     m4, [r0+2*r1]     ; q2
    DIFF_GT2   m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand       m6, m9
    pand       m8, m6
    psubb      m7, m6
    mova       m3, [r0+r1]
    LUMA_Q1    m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova       [r4+2*r1], m1
    mova       [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_sse2, 5,7
    movsxd     r10, r1d
    lea        r11, [r10+r10*2]
    lea        r6, [r0-4]
    lea        r5, [r0-4+r11]
%ifdef WIN64
    sub        rsp, 0x98
%define pix_tmp rsp+0x30
%else
    sub        rsp, 0x68
%define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
    lea        r6, [r6+r10*8]
    lea        r5, [r5+r10*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
    lea        r0, [pix_tmp+0x30]
    mov        r1d, 0x10
%ifdef WIN64
    mov        [rsp+0x20], r4
%endif
    call       x264_deblock_v_luma_sse2

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add        r6, 2
    add        r5, 2
    movq       m0, [pix_tmp+0x18]
    movq       m1, [pix_tmp+0x28]
    movq       m2, [pix_tmp+0x38]
    movq       m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)

    shl        r10, 3
    sub        r6, r10
    sub        r5, r10
    shr        r10, 3
    movq       m0, [pix_tmp+0x10]
    movq       m1, [pix_tmp+0x20]
    movq       m2, [pix_tmp+0x30]
    movq       m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)

%ifdef WIN64
    add        rsp, 0x98
%else
    add        rsp, 0x68
%endif
    RET

%else

%macro DEBLOCK_LUMA 3
;-----------------------------------------------------------------------------
; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_%1, 5,5
    lea        r4, [r1*3]
    dec        r2                ; alpha-1
    neg        r4
    dec        r3                ; beta-1
    add        r4, r0            ; pix-3*stride
%assign pad 2*%3+12-(stack_offset&15)
    SUB        esp, pad

    mova       m0, [r4+r1]       ; p1
    mova       m1, [r4+2*r1]     ; p0
    mova       m2, [r0]          ; q0
    mova       m3, [r0+r1]       ; q1
    LOAD_MASK  r2, r3

    mov        r3, r4mp
    movd       m4, [r3]          ; tc0
    punpcklbw  m4, m4
    punpcklbw  m4, m4            ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova       [esp+%3], m4      ; tc
    pcmpeqb    m3, m3
    pcmpgtb    m4, m3
    pand       m4, m7
    mova       [esp], m4         ; mask

    mova       m3, [r4]          ; p2
    DIFF_GT2   m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand       m6, m4
    pand       m4, [esp+%3]      ; tc
    mova       m7, m4
    psubb      m7, m6
    pand       m6, m4
    LUMA_Q1    m0, m3, [r4], [r4+r1], m6, m4

    mova       m4, [r0+2*r1]     ; q2
    DIFF_GT2   m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    mova       m5, [esp]         ; mask
    pand       m6, m5
    mova       m5, [esp+%3]      ; tc
    pand       m5, m6
    psubb      m7, m6
    mova       m3, [r0+r1]
    LUMA_Q1    m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova       [r4+2*r1], m1
    mova       [r0], m2
    ADD        esp, pad
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
    mov        r0, r0mp
    mov        r3, r1m
    lea        r4, [r3*3]
    sub        r0, 4
    lea        r1, [r0+r4]
%assign pad 0x78-(stack_offset&15)
    SUB        esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea        r0, [r0+r3*8]
    lea        r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea        r0, [pix_tmp+0x30]
    PUSH       dword r4m
    PUSH       dword r3m
    PUSH       dword r2m
    PUSH       dword 16
    PUSH       dword r0
    call       x264_deblock_%2_luma_%1
%ifidn %2, v8
    add        dword [esp   ], 8 ; pix_tmp+0x38
    add        dword [esp+16], 2 ; tc0+2
    call       x264_deblock_%2_luma_%1
%endif
    ADD        esp, 20

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov        r0, r0mp
    sub        r0, 2
    lea        r1, [r0+r4]

    movq       m0, [pix_tmp+0x10]
    movq       m1, [pix_tmp+0x20]
    movq       m2, [pix_tmp+0x30]
    movq       m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    lea        r0, [r0+r3*8]
    lea        r1, [r1+r3*8]
    movq       m0, [pix_tmp+0x18]
    movq       m1, [pix_tmp+0x28]
    movq       m2, [pix_tmp+0x38]
    movq       m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)

    ADD        esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH


%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova       t0, p2
    mova       t1, p0
    pavgb      t0, p1
    pavgb      t1, q0
    pavgb      t0, t1            ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova       t5, t1
    mova       t2, p2
    mova       t3, p0
    paddb      t2, p1
    paddb      t3, q0
    paddb      t2, t3
    mova       t3, t2
    mova       t4, t2
    psrlw      t2, 1
    pavgb      t2, mpb_00
    pxor       t2, t0
    pand       t2, mpb_01
    psubb      t0, t2            ; p1' = (p2+p1+p0+q0+2)/4

    mova       t1, p2
    mova       t2, p2
    pavgb      t1, q1
    psubb      t2, q1
    paddb      t3, t3
    psubb      t3, t2            ; p2+2*p1+2*p0+2*q0+q1
    pand       t2, mpb_01
    psubb      t1, t2
    pavgb      t1, p1
    pavgb      t1, t5            ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw      t3, 2
    pavgb      t3, mpb_00
    pxor       t3, t1
    pand       t3, mpb_01
    psubb      t1, t3            ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova       t3, p0
    mova       t2, p0
    pxor       t3, q1
    pavgb      t2, q1
    pand       t3, mpb_01
    psubb      t2, t3
    pavgb      t2, p1            ; p0'b = (2*p1+p0+q1+2)/4

    pxor       t1, t2
    pxor       t2, p0
    pand       t1, mask1p
    pand       t2, mask0
    pxor       t1, t2
    pxor       t1, p0
    mova       %1, t1            ; store p0

    mova       t1, %4            ; p3
    mova       t2, t1
    pavgb      t1, p2
    paddb      t2, p2
    pavgb      t1, t0            ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb      t2, t2
    paddb      t2, t4            ; 2*p3+3*p2+p1+p0+q0
    psrlw      t2, 2
    pavgb      t2, mpb_00
    pxor       t2, t1
    pand       t2, mpb_01
    psubb      t1, t2            ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor       t0, p1
    pxor       t1, p2
    pand       t0, mask1p
    pand       t1, mask1p
    pxor       t0, p1
    pxor       t1, p2
    mova       %2, t0            ; store p1
    mova       %3, t1            ; store p2
%endmacro
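; Strong (bS=4) intra filtering of the p side, per the inline formulas:
;   if the strong condition holds (mask1p): p0' = (p2+2*p1+2*p0+2*q0+q1+4)>>3,
;     p1' = (p2+p1+p0+q0+2)>>2, p2' = (2*p3+3*p2+p1+p0+q0+4)>>3
;   otherwise: p0' = (2*p1+p0+q1+2)>>2 and p1/p2 are left unchanged
; The exact sums come from pavgb (which rounds up) plus the pxor/pand mpb_01
; parity fixups; the final xor/and blends select p0'a vs p0'b per byte by
; mask1p and filtered vs unfiltered p0 by mask0, and write p1'/p2' only
; where mask1p is set.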

%macro LUMA_INTRA_SWAP_PQ 0
%define q1 m0
%define q0 m1
%define p0 m2
%define p1 m3
%define p2 q2
%define mask1p mask1q
%endmacro
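; Redefining the symbols swaps the p and q sides, so LUMA_INTRA_P012 can be
; reused unchanged to filter q0/q1/q2, with [r0+r5] acting as q3.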

%macro DEBLOCK_LUMA_INTRA 2
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%ifdef ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%define mask1q [rsp-24]
%define mpb_00 m14
%define mpb_01 m15
%else
%define spill(x) [esp+16*x+((stack_offset+4)&15)]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_00 [pb_00 GLOBAL]
%define mpb_01 [pb_01 GLOBAL]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub        esp, 0x60
%endif
    lea        r4, [r1*4]
    lea        r5, [r1*3]        ; 3*stride
    dec        r2d               ; alpha-1
    jl         .end
    neg        r4
    dec        r3d               ; beta-1
    jl         .end
    add        r4, r0            ; pix-4*stride
    mova       p1, [r4+2*r1]
    mova       p0, [r4+r5]
    mova       q0, [r0]
    mova       q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor       mpb_00, mpb_00
    mova       mpb_01, [pb_01 GLOBAL]
    LOAD_MASK  r2d, r3d, t5      ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP       7, 12             ; m12=mask0
    pavgb      t5, mpb_00
    pavgb      t5, mpb_01        ; alpha/4+1
    movdqa     p2, [r4+r1]
    movdqa     q2, [r0+2*r1]
    DIFF_GT2   p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2   p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2   q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand       t0, mask0
    pand       t4, t0
    pand       t2, t0
    mova       mask1q, t4
    mova       mask1p, t2
%else
    LOAD_MASK  r2d, r3d, t5      ; m5=beta-1, t5=alpha-1, m7=mask0
    mova       m4, t5
    mova       mask0, m7
    pavgb      m4, [pb_00 GLOBAL]
    pavgb      m4, [pb_01 GLOBAL] ; alpha/4+1
    DIFF_GT2   p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand       m6, mask0
    DIFF_GT2   p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand       m4, m6
    mova       mask1p, m4
    DIFF_GT2   q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand       m4, m6
    mova       mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add        esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd     r10, r1d
    lea        r11, [r10*3]
    lea        r6, [r0-4]
    lea        r5, [r0-4+r11]
    sub        rsp, 0x88
%define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea        r6, [r6+r10*8]
    lea        r5, [r5+r10*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea        r0, [pix_tmp+0x40]
    mov        r1, 0x10
    call       x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea        r5, [r6+r11]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl        r10, 3
    sub        r6, r10
    sub        r5, r10
    shr        r10, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add        rsp, 0x88
    RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea        r3, [r1*3]
    sub        r0, 4
    lea        r2, [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB        rsp, pad
%define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea        r0, [r0+r1*8]
    lea        r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea        r0, [pix_tmp+0x40]
    PUSH       dword r3m
    PUSH       dword r2m
    PUSH       dword 16
    PUSH       r0
    call       x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    add        dword [rsp], 8    ; pix_tmp+8
    call       x264_deblock_%2_luma_intra_%1
%endif
    ADD        esp, 16

    mov        r1, r1m
    mov        r0, r0mp
    lea        r3, [r1*3]
    sub        r0, 4
    lea        r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea        r0, [r0+r1*8]
    lea        r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD        rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif