2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; Fixed-point scale of the blur: the weight 128 equals 1 << 7.
; VP8_FILTER_WEIGHT is not referenced by any code visible in this excerpt.
14 %define VP8_FILTER_WEIGHT
128
; Right-shift count that the filters below pass to psraw after the taps
; have been summed and RD has been added.
15 %define VP8_FILTER_SHIFT
7
17 ;void vp8_post_proc_down_and_across_mmx
19 ; unsigned char *src_ptr,
20 ; unsigned char *dst_ptr,
21 ; int src_pixels_per_line,
22 ; int dst_pixels_per_line,
; Remaining arguments, as read by the code below (all accessed as dwords):
;   arg(4) = rows, arg(5) = cols, arg(6) = flimit
;
; Two filtering passes per row, four pixels at a time:
;   down   - weights rows addressed as [rsi + k*rax] with the taps stored at
;            Blur+0/16/32/48/64, adds RD and shifts right by VP8_FILTER_SHIFT;
;   across - repeats the same arithmetic on byte-shifted copies of the data at
;            [rdi+rdx], i.e. in place on the destination.
; In both passes mm1 keeps the unfiltered pixels, mm3 the weighted sum and mm7
; the OR of the per-neighbour difference terms used to choose between them.
; NOTE(review): the embedded original line numbers skip values, so some
; instructions (prologue/epilogue, labels, compares, stores, branches) are
; presumably not visible in this excerpt.
27 global sym
(vp8_post_proc_down_and_across_mmx
)
28 sym
(vp8_post_proc_down_and_across_mmx
):
; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it makes the
; 7 arguments reachable through arg(n) - confirm against the macro.
31 SHADOW_ARGS_TO_STACK
7
37 %if ABI_IS_32BIT
=1 && CONFIG_PIC
=1
38 ; move the global rd onto the stack, since we don't have enough registers
39 ; to do PIC addressing
40 movq mm0
, [GLOBAL(rd
)]
; NOTE(review): the store of mm0 to the stack and the %else/%endif that would
; separate this branch from the definition below are not visible here.
45 %define RD
[GLOBAL(rd
)]
; rbx = base of the Blur tap table; entries are read at +0, +16, +32, +48, +64
49 lea rbx
, [GLOBAL(Blur
)]
50 movd mm2
, dword ptr arg
(6) ;flimit -> low dword of mm2
54 mov rsi
, arg
(0) ;src_ptr
55 mov rdi
, arg
(1) ;dst_ptr
57 movsxd rcx
, DWORD PTR arg
(4) ;rows (row counter, decremented at the bottom of the row loop)
58 movsxd rax
, DWORD PTR arg
(2) ;src_pixels_per_line ; source pitch
59 pxor mm0
, mm0
; mm0 = 00000000 (zero source for byte->word unpacking)
63 xor rdx
, rdx
; clear out rdx for use as loop counter
; ---- down (vertical) filter --------------------------------------------
66 pxor mm7
, mm7
; mm7 = 00000000
67 movq mm6
, [rbx
+ 32 ] ; mm6 = kernel 2 taps
68 movq mm3
, [rsi
] ; mm3 = r0 p0..p7
69 punpcklbw mm3
, mm0
; mm3 = r0 p0..p3 as words
70 movq mm1
, mm3
; mm1 = r0 p0..p3 (kept unfiltered for the final select)
71 pmullw mm3
, mm6
; mm3 *= kernel 2 taps
73 movq mm6
, [rbx
+ 48] ; mm6 = kernel 3 taps
74 movq mm5
, [rsi
+ rax
] ; mm5 = r1 p0..p7
75 punpcklbw mm5
, mm0
; mm5 = r1 p0..p3
76 pmullw mm6
, mm5
; mm6 = kernel 3 taps * r1 p0..p3
77 paddusw mm3
, mm6
; mm3 += mm6
80 movq mm7
, mm1
; mm7 = r0 p0..p3
81 psubusw mm7
, mm5
; mm7 = r0 p0..p3 - r1 p0..p3 (saturating)
82 psubusw mm5
, mm1
; mm5 = r1 p0..p3 - r0 p0..p3 (saturating)
83 paddusw mm7
, mm5
; mm7 = abs(r0 p0..p3 - r1 p0..p3)
86 movq mm6
, [rbx
+ 64 ] ; mm6 = kernel 4 taps
87 movq mm5
, [rsi
+ 2*rax
] ; mm5 = r2 p0..p7
88 punpcklbw mm5
, mm0
; mm5 = r2 p0..p3
89 pmullw mm6
, mm5
; mm6 = kernel 4 taps * r2 p0..p3
90 paddusw mm3
, mm6
; mm3 += mm6
93 movq mm6
, mm1
; mm6 = r0 p0..p3
94 psubusw mm6
, mm5
; mm6 = r0 p0..p3 - r2 p0..p3
95 psubusw mm5
, mm1
; mm5 = r2 p0..p3 - r0 p0..p3
96 paddusw mm6
, mm5
; mm6 = abs(r0 p0..p3 - r2 p0..p3)
98 por mm7
, mm6
; accumulate thresholds
; NOTE(review): the two loads below are commented as rows r-2 / r-1, which
; requires rax to hold the negated pitch; that negation is not visible here.
102 movq mm6
, [rbx
] ; kernel 0 taps
103 movq mm5
, [rsi
+2*rax
] ; mm5 = r-2 p0..p7
104 punpcklbw mm5
, mm0
; mm5 = r-2 p0..p3
105 pmullw mm6
, mm5
; mm6 = kernel 0 taps * r-2 p0..p3
106 paddusw mm3
, mm6
; mm3 += mm6
109 movq mm6
, mm1
; mm6 = r0 p0..p3
110 psubusw mm6
, mm5
; mm6 = r0 p0..p3 - r-2 p0..p3
111 psubusw mm5
, mm1
; mm5 = r-2 p0..p3 - r0 p0..p3
112 paddusw mm6
, mm5
; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
114 por mm7
, mm6
; accumulate thresholds
116 movq mm6
, [rbx
+ 16] ; kernel 1 taps
117 movq mm4
, [rsi
+rax
] ; mm4 = r-1 p0..p7
118 punpcklbw mm4
, mm0
; mm4 = r-1 p0..p3
119 pmullw mm6
, mm4
; mm6 = kernel 1 taps * r-1 p0..p3
120 paddusw mm3
, mm6
; mm3 += mm6
123 movq mm6
, mm1
; mm6 = r0 p0..p3
124 psubusw mm6
, mm4
; mm6 = r0 p0..p3 - r-1 p0..p3
125 psubusw mm4
, mm1
; mm4 = r-1 p0..p3 - r0 p0..p3
126 paddusw mm6
, mm4
; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
128 por mm7
, mm6
; accumulate thresholds
131 paddusw mm3
, RD
; mm3 += round value
132 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. mm3 /= 128
; NOTE(review): the select below treats mm7 as a per-word all-ones/all-zeros
; mask; the compares against flimit (mm2) that would turn the accumulated
; differences into such a mask are not visible in this excerpt.
134 pand mm1
, mm7
; mm1 select vals > thresh from source
135 pandn mm7
, mm3
; mm7 select vals < thresh from blurred result
136 paddusw mm1
, mm7
; combination
138 packuswb mm1
, mm0
; pack the four words back to bytes (upper four bytes come from mm0 = 0)
141 neg rax
; rax = -rax (original note: pitch is positive again)
; compare the column counter with cols; the branch consuming these flags is
; not visible in this excerpt
148 cmp edx, dword ptr arg
(5) ;cols
150 ; done with all the cols, start the across filtering in place
; ---- across (horizontal) filter, reading and writing through rdi --------
160 pxor mm7
, mm7
; mm7 = 00000000
161 movq mm6
, [rbx
+ 32 ] ; mm6 = kernel 2 taps
162 movq mm4
, [rdi
+rdx
] ; mm4 = p0..p7
163 movq mm3
, mm4
; mm3 = p0..p7
164 punpcklbw mm3
, mm0
; mm3 = p0..p3
165 movq mm1
, mm3
; mm1 = p0..p3 (kept unfiltered for the final select)
166 pmullw mm3
, mm6
; mm3 *= kernel 2 taps
169 psrlq mm4
, 8 ; mm4 = p1..p7
170 movq mm5
, mm4
; mm5 = p1..p7
171 punpcklbw mm5
, mm0
; mm5 = p1..p4
172 pmullw mm6
, mm5
; mm6 *= p1..p4 (a kernel 3 tap load into mm6 is not visible in this excerpt)
173 paddusw mm3
, mm6
; mm3 += mm6
176 movq mm7
, mm1
; mm7 = p0..p3
177 psubusw mm7
, mm5
; mm7 = p0..p3 - p1..p4
178 psubusw mm5
, mm1
; mm5 = p1..p4 - p0..p3
179 paddusw mm7
, mm5
; mm7 = abs(p0..p3 - p1..p4)
182 movq mm6
, [rbx
+ 64 ] ; mm6 = kernel 4 taps
183 psrlq mm4
, 8 ; mm4 = p2..p7
184 movq mm5
, mm4
; mm5 = p2..p7
185 punpcklbw mm5
, mm0
; mm5 = p2..p5
186 pmullw mm6
, mm5
; mm6 = kernel 4 taps * p2..p5
187 paddusw mm3
, mm6
; mm3 += mm6
190 movq mm6
, mm1
; mm6 = p0..p3
191 psubusw mm6
, mm5
; mm6 = p0..p3 - p2..p5
192 psubusw mm5
, mm1
; mm5 = p2..p5 - p0..p3
193 paddusw mm6
, mm5
; mm6 = abs(p0..p3 - p2..p5)
195 por mm7
, mm6
; accumulate thresholds
199 movq mm4
, [rdi
+rdx
-2] ; mm4 = p-2..p5
200 movq mm5
, mm4
; mm5 = p-2..p5
201 punpcklbw mm5
, mm0
; mm5 = p-2..p1
202 pmullw mm6
, mm5
; mm6 *= p-2..p1 (mm6 expected to hold kernel 0 taps; that load is not visible here)
203 paddusw mm3
, mm6
; mm3 += mm6
206 movq mm6
, mm1
; mm6 = p0..p3
207 psubusw mm6
, mm5
; mm6 = p0..p3 - p-2..p1
208 psubusw mm5
, mm1
; mm5 = p-2..p1 - p0..p3
209 paddusw mm6
, mm5
; mm6 = abs(p0..p3 - p-2..p1)
211 por mm7
, mm6
; accumulate thresholds
214 psrlq mm4
, 8 ; mm4 = p-1..p5
215 punpcklbw mm4
, mm0
; mm4 = p-1..p2
216 pmullw mm6
, mm4
; mm6 *= p-1..p2 (mm6 expected to hold kernel 1 taps; that load is not visible here)
217 paddusw mm3
, mm6
; mm3 += mm6
220 movq mm6
, mm1
; mm6 = p0..p3
221 psubusw mm6
, mm4
; mm6 = p0..p3 - p-1..p2
222 psubusw mm4
, mm1
; mm4 = p-1..p2 - p0..p3
223 paddusw mm6
, mm4
; mm6 = abs(p0..p3 - p-1..p2)
225 por mm7
, mm6
; accumulate thresholds
227 paddusw mm3
, RD
; mm3 += round value
228 psraw mm3
, VP8_FILTER_SHIFT
; mm3 >>= 7, i.e. mm3 /= 128
; NOTE(review): as in the down pass, mm7 is used as a mask here but the
; compares that would produce it are not visible in this excerpt.
230 pand mm1
, mm7
; mm1 select vals > thresh from source
231 pandn mm7
, mm3
; mm7 select vals < thresh from blurred result
232 paddusw mm1
, mm7
; combination
234 packuswb mm1
, mm0
; pack to bytes
; NOTE(review): eax presumably carries the packed result of the previous
; group of four pixels; the instruction that fills it is not visible here.
235 mov DWORD PTR [rdi
+rdx
-4], eax ; store previous four bytes
239 cmp edx, dword ptr arg
(5) ;cols
; same 4-byte store after the column compare (presumably flushing the last
; pending group of the row - confirm against the full source)
242 mov DWORD PTR [rdi
+rdx
-4], eax
246 add rsi
,rax
; next line
247 movsxd rax
, dword ptr arg
(3) ;dst_pixels_per_line ; destination pitch
248 add rdi
,rax
; next destination
249 movsxd rax
, dword ptr arg
(2) ;src_pixels_per_line ; source pitch, reloaded for the next row
251 dec rcx
; decrement count
252 jnz nextrow
; next row
265 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
266 ; int pitch, int rows, int cols,int flimit)
; Works on the image in strips of four columns: each outer iteration ends
; with s += 4 and cols -= 4. Scratch data lives on the stack (a 16x8 byte
; array d at [rsp] and flimit2 at [rsp+128]) and the vp8_rv table is read
; once per stored result.
; NOTE(review): the embedded original line numbers skip values, so most of
; the body (prologue, stack reservation, loops, labels, epilogue) is
; presumably not visible in this excerpt.
268 global sym
(vp8_mbpost_proc_down_mmx
)
269 sym
(vp8_mbpost_proc_down_mmx
):
; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it makes the
; 5 arguments reachable through arg(n) - confirm against the macro.
272 SHADOW_ARGS_TO_STACK
5
281 ; unsigned char d[16][8] at [rsp]
282 ; create flimit2 at [rsp+128]
283 mov eax, dword ptr arg
(4) ;flimit
286 %define flimit2
[rsp
+128]
; r8 = address of vp8_rv. r8 only exists in 64-bit mode, so this line is
; presumably guarded by a conditional that is not visible in this excerpt.
289 lea r8
, [GLOBAL(sym
(vp8_rv
))]
; rows += 8, modifying the argument slot in place; the same slot is the bound
; compared against edx further down
293 add dword ptr arg
(2), 8
295 ;for(c=0; c<cols; c+=4)
300 movsxd rax
, dword ptr arg
(1) ;pitch ;
301 neg rax
; rax = -pitch
303 lea rsi
, [rsi
+ rax
*8]; ; rsi = rsi - pitch*8 (the original comment named rdi)
316 movd mm1
, DWORD PTR [rdi
]; load four bytes from [rdi]
333 ;save the var and sum
336 movd mm1
, DWORD PTR [rsi
] ; [s-pitch*8]
337 movd mm2
, DWORD PTR [rdi
] ; [s+pitch*7]
; rax = -pitch (set above), so this reads eight rows before rsi
396 movd mm1
, DWORD PTR [rsi
+rax
*8]
; Three alternative ways of loading 8 bytes at vp8_rv + rcx*2 follow: through
; a freshly computed base in rax (32-bit PIC), through r8, and through the
; absolute symbol address.
; NOTE(review): the %elif/%else/%endif lines that select exactly one of them
; are not visible in this excerpt.
405 %if ABI_IS_32BIT
=1 && CONFIG_PIC
=1
407 lea rax
, [GLOBAL(sym
(vp8_rv
))]
408 movq mm4
, [rax
+ rcx
*2] ;vp8_rv[rcx*2]
411 movq mm4
, [r8
+ rcx
*2] ;vp8_rv[rcx*2]
413 movq mm4
, [sym
(vp8_rv
) + rcx
*2]
; park four result bytes in the stack scratch array d
426 movd
DWORD PTR [rsp
+rcx
*4], mm1
;d[rcx*4]
; fetch four bytes back from the scratch array d
432 movd mm1
, DWORD PTR [rsp
+rcx
*4] ;d[rcx*4]
440 cmp edx, dword arg
(2) ;rows (this slot already includes the +8 added above)
; NOTE(review): arg(0) is the pointer argument dst, yet it is advanced with a
; dword-sized add; on a 64-bit build a carry out of the low 32 bits would be
; lost - confirm this is intended.
444 add dword arg
(0), 4 ; s += 4
445 sub dword arg
(3), 4 ; cols -= 4
462 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
463 ; unsigned char blackclamp[16],
464 ; unsigned char whiteclamp[16],
465 ; unsigned char bothclamp[16],
466 ; unsigned int Width, unsigned int Height, int Pitch)
; For each line: clamps 8 source bytes at a time with the three clamp
; vectors, adds 8 bytes of noise with wrapping byte adds, and writes the
; result back over the source. After a line, Start advances by Pitch and
; Height drops by one.
; NOTE(review): the embedded original line numbers skip values and the
; excerpt ends before any loop branch or epilogue, so the prologue, the
; labels, the setup of rsi/rdi/rax and the loop control are not visible here.
468 global sym
(vp8_plane_add_noise_mmx
)
469 sym
(vp8_plane_add_noise_mmx
):
; SHADOW_ARGS_TO_STACK comes from x86_abi_support.asm; presumably it makes the
; 8 arguments reachable through arg(n) - confirm against the macro.
472 SHADOW_ARGS_TO_STACK
8
; call rand(); how its return value is consumed is not visible in this excerpt
479 call sym
(rand
) WRT_PLT
480 mov rcx
, arg
(1) ;noise
484 ; we rely on the fact that the clamping vectors are stored contiguously
485 ; in black/white/both order. Note that we have to reload this here because
486 ; rdx could be trashed by rand()
487 mov rdx
, arg
(2) ; blackclamp (whiteclamp is then read at +16 and bothclamp at +32)
; NOTE(review): Width is declared unsigned int in the prototype above but is
; sign-extended here - confirm widths above INT_MAX cannot occur.
491 movsxd rcx
, dword arg
(5) ;[Width]
496 movq mm1
,[rsi
+rax
] ; get the source
498 psubusb mm1
, [rdx
] ;blackclamp ; clamp both sides so we don't outrange adding noise
499 paddusb mm1
, [rdx
+32] ;bothclamp
500 psubusb mm1
, [rdx
+16] ;whiteclamp
502 movq mm2
,[rdi
+rax
] ; get the noise for this line
503 paddb mm1
,mm2
; add it in (wrapping byte add; the clamping above leaves the headroom)
504 movq
[rsi
+rax
],mm1
; store the result
506 add rax
,8 ; advance to the next 8 bytes of the current line
511 movsxd rax
, dword arg
(7) ; Pitch
512 add arg
(0), rax
; Start += Pitch
513 sub dword arg
(6), 1 ; Height -= 1