2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void vp8_post_proc_down_and_across_xmm
15 ;(
16 ; unsigned char *src_ptr,
17 ; unsigned char *dst_ptr,
18 ; int src_pixels_per_line,
19 ; int dst_pixels_per_line,
20 ; int rows,
21 ; int cols,
22 ; int flimit
23 ;)
24 global sym
(vp8_post_proc_down_and_across_xmm
)
25 sym
(vp8_post_proc_down_and_across_xmm
):
; Per the comments in this routine: each row is first filtered vertically
; ("down") from src_ptr into dst_ptr, then filtered horizontally ("across")
; in place in dst_ptr.
; Arguments as read below: arg(0)=src_ptr, arg(1)=dst_ptr,
; arg(2)=src_pixels_per_line, arg(3)=dst_pixels_per_line, arg(4)=rows,
; arg(5)=cols, arg(6)=flimit.
; NOTE(review): this listing is not contiguous. The prologue/epilogue, the
; loop labels (e.g. the `nextrow` target of the final jnz) and several
; instructions are not visible here, so the comments describe only what the
; visible instructions do.
28 SHADOW_ARGS_TO_STACK
7
35 %if ABI_IS_32BIT
=1 && CONFIG_PIC
=1
37 ; move the global rd onto the stack, since we don't have enough registers
38 ; to do PIC addressing
39 movdqa xmm0
, [GLOBAL(rd42
)]
44 %define RD42
[GLOBAL(rd42
)]
48 movd xmm2
, dword ptr arg
(6) ; xmm2 (low dword) = flimit
53 mov rsi
, arg
(0) ; rsi = src_ptr
54 mov rdi
, arg
(1) ; rdi = dst_ptr
56 movsxd rcx
, DWORD PTR arg
(4) ; rcx = rows (row countdown, see dec rcx at the end)
57 movsxd rax
, DWORD PTR arg
(2) ; rax = src_pixels_per_line (source pitch)
58 pxor xmm0
, xmm0
; xmm0 = 0, used as the zero half when widening bytes to words
62 xor rdx
, rdx
; clear out rdx for use as loop counter
64 movq xmm3
, QWORD PTR [rsi
] ; xmm3 = r0 p0..p7 (8 bytes of the current row)
65 punpcklbw xmm3
, xmm0
; xmm3 = r0 p0..p7 widened to words
66 movdqa xmm1
, xmm3
; xmm1 = copy of r0 p0..p7 (kept as the unfiltered source)
69 movq xmm5
, QWORD PTR [rsi
+ rax
] ; xmm5 = r1 p0..p7
70 punpcklbw xmm5
, xmm0
; xmm5 = r1 p0..p7 widened to words
71 paddusw xmm3
, xmm5
; xmm3 += xmm5 (accumulate r1 into the sum)
74 movdqa xmm7
, xmm1
; xmm7 = r0
75 psubusw xmm7
, xmm5
; xmm7 = r0 - r1 (unsigned saturating)
76 psubusw xmm5
, xmm1
; xmm5 = r1 - r0 (unsigned saturating)
77 paddusw xmm7
, xmm5
; xmm7 = abs(r0 - r1)
80 movq xmm5
, QWORD PTR [rsi
+ 2*rax
] ; xmm5 = r2 p0..p7
81 punpcklbw xmm5
, xmm0
; xmm5 = r2 p0..p7 widened to words
82 paddusw xmm3
, xmm5
; xmm3 += xmm5 (accumulate r2 into the sum)
85 movdqa xmm6
, xmm1
; xmm6 = r0
86 psubusw xmm6
, xmm5
; xmm6 = r0 - r2 (unsigned saturating)
87 psubusw xmm5
, xmm1
; xmm5 = r2 - r0 (unsigned saturating)
88 paddusw xmm6
, xmm5
; xmm6 = abs(r0 - r2)
90 por xmm7
, xmm6
; accumulate thresholds
; NOTE(review): the next two loads are labelled rows -2 and -1 but use the
; same address expressions as the +2/+1 loads above. The later
; "neg rax ; pitch is positive" suggests rax is negated in a line not shown
; in this listing -- confirm.
94 movq xmm5
, QWORD PTR [rsi
+2*rax
] ; xmm5 = r-2 p0..p7
95 punpcklbw xmm5
, xmm0
; xmm5 = r-2 p0..p7 widened to words
96 paddusw xmm3
, xmm5
; xmm3 += xmm5 (accumulate r-2 into the sum)
99 movdqa xmm6
, xmm1
; xmm6 = r0
100 psubusw xmm6
, xmm5
; xmm6 = r0 - r-2 (unsigned saturating)
101 psubusw xmm5
, xmm1
; xmm5 = r-2 - r0 (unsigned saturating)
102 paddusw xmm6
, xmm5
; xmm6 = abs(r0 - r-2)
104 por xmm7
, xmm6
; accumulate thresholds
106 movq xmm4
, QWORD PTR [rsi
+rax
] ; xmm4 = r-1 p0..p7
107 punpcklbw xmm4
, xmm0
; xmm4 = r-1 p0..p7 widened to words
108 paddusw xmm3
, xmm4
; xmm3 += xmm4 (accumulate r-1 into the sum)
111 movdqa xmm6
, xmm1
; xmm6 = r0
112 psubusw xmm6
, xmm4
; xmm6 = r0 - r-1 (unsigned saturating)
113 psubusw xmm4
, xmm1
; xmm4 = r-1 - r0 (unsigned saturating)
114 paddusw xmm6
, xmm4
; xmm6 = abs(r0 - r-1)
116 por xmm7
, xmm6
; accumulate thresholds
119 paddusw xmm3
, RD42
; xmm3 += round value
120 psraw xmm3
, 3 ; xmm3 >>= 3 (divide the sum by 8)
; NOTE(review): xmm7 is used as a per-word select mask below; the compare
; against flimit that would turn the accumulated differences into a mask is
; not visible in this listing -- confirm.
122 pand xmm1
, xmm7
; xmm1 = source words where the mask is set (original note: vals > thresh)
123 pandn xmm7
, xmm3
; xmm7 = blurred words where the mask is clear (original note: vals < thresh)
124 paddusw xmm1
, xmm7
; combination
126 packuswb xmm1
, xmm0
; pack to bytes
127 movq
QWORD PTR [rdi
], xmm1
; store the 8 packed result bytes at [rdi]
129 neg rax
; pitch is positive
134 cmp edx, dword arg
(5) ; compare the column counter with cols
138 ; done with all the cols, start the across filtering in place
143 movq mm0
, QWORD PTR [rdi
-8]; mm0 = the 8 bytes at [rdi-8] (MMX register, written back by the stores below)
146 movq xmm7
, QWORD PTR [rdi
+rdx
-2] ; xmm7 = 8 bytes starting at [rdi+rdx-2]
147 movd xmm4
, DWORD PTR [rdi
+rdx
+6] ; xmm4 = 4 bytes starting at [rdi+rdx+6]
154 punpcklbw xmm3
, xmm0
; xmm3 = p0.. widened to words
155 movdqa xmm1
, xmm3
; xmm1 = copy of p0.. (kept as the unfiltered source)
161 punpcklbw xmm5
, xmm0
; xmm5 = p1.. widened to words
162 paddusw xmm3
, xmm5
; xmm3 += xmm5 (accumulate p1.. into the sum)
165 movdqa xmm7
, xmm1
; xmm7 = p0..
166 psubusw xmm7
, xmm5
; xmm7 = p0.. - p1.. (unsigned saturating)
167 psubusw xmm5
, xmm1
; xmm5 = p1.. - p0.. (unsigned saturating)
168 paddusw xmm7
, xmm5
; xmm7 = abs(p0.. - p1..)
173 punpcklbw xmm5
, xmm0
; xmm5 = p2.. widened to words
174 paddusw xmm3
, xmm5
; xmm3 += xmm5 (accumulate p2.. into the sum)
177 movdqa xmm6
, xmm1
; xmm6 = p0..
178 psubusw xmm6
, xmm5
; xmm6 = p0.. - p2.. (unsigned saturating)
179 psubusw xmm5
, xmm1
; xmm5 = p2.. - p0.. (unsigned saturating)
180 paddusw xmm6
, xmm5
; xmm6 = abs(p0.. - p2..)
182 por xmm7
, xmm6
; accumulate thresholds
185 movdqa xmm5
, xmm4
; xmm5 = p-2..p5
186 punpcklbw xmm5
, xmm0
; xmm5 = p-2.. widened to words
187 paddusw xmm3
, xmm5
; xmm3 += xmm5 (accumulate p-2.. into the sum)
190 movdqa xmm6
, xmm1
; xmm6 = p0..
191 psubusw xmm6
, xmm5
; xmm6 = p0.. - p-2.. (unsigned saturating)
192 psubusw xmm5
, xmm1
; xmm5 = p-2.. - p0.. (unsigned saturating)
193 paddusw xmm6
, xmm5
; xmm6 = abs(p0.. - p-2..)
195 por xmm7
, xmm6
; accumulate thresholds
197 psrldq xmm4
, 1 ; xmm4 = p-1..p5 (byte shift right by one)
198 punpcklbw xmm4
, xmm0
; xmm4 = p-1.. widened to words
199 paddusw xmm3
, xmm4
; xmm3 += xmm4 (accumulate p-1.. into the sum)
202 movdqa xmm6
, xmm1
; xmm6 = p0..
203 psubusw xmm6
, xmm4
; xmm6 = p0.. - p-1.. (unsigned saturating)
204 psubusw xmm4
, xmm1
; xmm4 = p-1.. - p0.. (unsigned saturating)
205 paddusw xmm6
, xmm4
; xmm6 = abs(p0.. - p-1..)
207 por xmm7
, xmm6
; accumulate thresholds
209 paddusw xmm3
, RD42
; xmm3 += round value
210 psraw xmm3
, 3 ; xmm3 >>= 3 (divide the sum by 8)
212 pand xmm1
, xmm7
; xmm1 = source words where the mask is set (original note: vals > thresh)
213 pandn xmm7
, xmm3
; xmm7 = blurred words where the mask is clear (original note: vals < thresh)
214 paddusw xmm1
, xmm7
; combination
216 packuswb xmm1
, xmm0
; pack to bytes
217 movq
QWORD PTR [rdi
+rdx
-8], mm0
; store the previous eight bytes (movq writes a full qword from mm0)
221 cmp edx, dword arg
(5) ; compare the column counter with cols
225 movq
QWORD PTR [rdi
+rdx
-8], mm0
; presumably flushes the last pending qword once the column loop exits -- confirm
228 add rsi
,rax
; next line
229 mov eax, dword arg
(3) ; eax = dst_pixels_per_line (destination pitch)
230 add rdi
,rax
; next destination
231 mov eax, dword arg
(2) ; eax = src_pixels_per_line (source pitch), reloaded for the next row
233 dec rcx
; decrement count
234 jnz nextrow
; next row
236 %if ABI_IS_32BIT
=1 && CONFIG_PIC
=1
251 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
252 ; int pitch, int rows, int cols,int flimit)
254 global sym
(vp8_mbpost_proc_down_xmm
)
255 sym
(vp8_mbpost_proc_down_xmm
):
; Arguments per the prototype comment above: arg(0)=dst, arg(1)=pitch,
; arg(2)=rows, arg(3)=cols, arg(4)=flimit.
; The visible code works on 8-byte-wide column strips (see the
; "for(c=0; c<cols; c+=8)" note and the s += 8 / cols -= 8 updates at the end).
; NOTE(review): this listing is not contiguous. The prologue, the stack
; allocation, loop labels and most of the arithmetic are not visible here.
258 SHADOW_ARGS_TO_STACK
5
268 ; unsigned char d[16][8] at [rsp]
269 ; create flimit4 at [rsp+128]
270 mov eax, dword ptr arg
(4) ; eax = flimit
274 mov [rsp
+128+12], eax
; write flimit to the fourth dword of the flimit4 slot defined just below
275 %define flimit4
[rsp
+128]
278 lea r8
, [GLOBAL(sym
(vp8_rv
))]
; r8 = address of the vp8_rv table (used by the r8-based load further down)
284 ;for(c=0; c<cols; c+=8)
289 movsxd rax
, dword ptr arg
(1) ; rax = pitch
290 neg rax
; rax = -pitch
292 lea rsi
, [rsi
+ rax
*8]; rsi -= 8*pitch (original note: s[-pitch*8])
305 movq xmm1
, QWORD PTR [rdi
]; xmm1 = 8 bytes at [rdi]
306 punpcklbw xmm1
, xmm0
; widen bytes to words (assumes xmm0 is zero here -- set in a line not shown)
312 punpcklwd xmm1
, xmm0
; widen the low four words to dwords
314 punpckhwd xmm2
, xmm0
; widen the high four words to dwords
322 ;save the var and sum
325 movq xmm1
, QWORD PTR [rsi
] ; [s-pitch*8]
326 movq xmm2
, QWORD PTR [rdi
] ; [s+pitch*7]
385 movq xmm1
, QWORD PTR [rsi
+rax
*8]
; NOTE(review): three alternative vp8_rv loads follow (via rax, via r8, and
; by direct symbol address); the %elif/%else/%endif lines separating them
; are not visible in this listing.
394 %if ABI_IS_32BIT
=1 && CONFIG_PIC
=1
396 lea rax
, [GLOBAL(sym
(vp8_rv
))]
397 movdqu xmm4
, [rax
+ rcx
*2] ;vp8_rv[rcx*2]
400 movdqu xmm4
, [r8
+ rcx
*2] ;vp8_rv[rcx*2]
402 movdqu xmm4
, [sym
(vp8_rv
) + rcx
*2]
416 movq
QWORD PTR [rsp
+ rcx
*8], xmm1
;d[rcx*8] -- save 8 result bytes into the on-stack d[] buffer
422 movq mm0
, [rsp
+ rcx
*8] ;d[rcx*8] -- reload a buffered entry into MMX register mm0
430 cmp edx, dword arg
(2) ; compare the row counter with rows
; NOTE(review): the next add is dword-sized although the prototype declares
; arg(0) as a pointer. On a 64-bit ABI that would not carry into the upper
; 32 bits (compare "add arg(0), rax" in vp8_plane_add_noise_wmt) -- confirm
; against the arg() macro definition.
433 add dword arg
(0), 8 ; s += 8
434 sub dword arg
(3), 8 ; cols -= 8
452 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
453 ; int pitch, int rows, int cols,int flimit)
454 global sym
(vp8_mbpost_proc_across_ip_xmm
)
455 sym
(vp8_mbpost_proc_across_ip_xmm
):
; Arguments per the prototype comment above: arg(0)=src, arg(1)=pitch,
; arg(2)=rows, arg(3)=cols, arg(4)=flimit.
; The visible code updates a row in place: it reads bytes at offsets -8 and
; +7 around [rsi+rcx] and stores a result back at [rsi+rcx-8].
; NOTE(review): this listing is not contiguous. The prologue, loop labels
; and much of the arithmetic are not visible here.
458 SHADOW_ARGS_TO_STACK
5
468 ; create flimit4 at [rsp]
469 mov eax, dword ptr arg
(4) ; eax = flimit
474 %define flimit4
[rsp
]
480 xor rdx
, rdx
;sumsq=0;
487 ; sumsq += s[i]*s[i];
490 movzx eax, byte [rsi
+rdi
]
; eax = zero-extended byte at [rsi+rdi]
510 movsxd rdx
, dword arg
(3) ; rdx = cols
518 movd xmm1
, DWORD PTR [rsi
+rcx
-8] ; -8 -7 -6 -5
519 movd xmm2
, DWORD PTR [rsi
+rcx
+7] ; +7 +8 +9 +10
521 punpcklbw xmm1
, xmm0
; expanding bytes to words (assumes xmm0 is zero here -- set in a line not shown)
522 punpcklbw xmm2
, xmm0
; expanding bytes to words
524 punpcklwd xmm1
, xmm0
; expanding to dwords
525 punpcklwd xmm2
, xmm0
; expanding to dwords
527 psubd xmm2
, xmm1
; 7--8 8--7 9--6 10--5
528 paddd xmm1
, xmm1
; -8*2 -7*2 -6*2 -5*2
530 paddd xmm1
, xmm2
; 7+-8 8+-7 9+-6 10+-5
531 pmaddwd xmm1
, xmm2
; per lane: (sum) * (difference) of the +7.. and -8.. taps, i.e. new^2 - old^2
536 pshufd xmm6
, xmm6
, 0 ; duplicate the last ones (broadcast dword 0 to all four lanes)
537 pshufd xmm7
, xmm7
, 0 ; duplicate the last ones (broadcast dword 0 to all four lanes)
539 psrldq xmm1
, 4 ; 8--7 9--6 10--5 0000
540 psrldq xmm2
, 4 ; 8--7 9--6 10--5 0000
542 pshufd xmm3
, xmm1
, 3 ; 0000 8--7 8--7 8--7 squared
543 pshufd xmm4
, xmm2
, 3 ; 0000 8--7 8--7 8--7
548 pshufd xmm3
, xmm1
, 01011111b ; 0000 0000 9--6 9--6 squared
549 pshufd xmm4
, xmm2
, 01011111b ; 0000 0000 9--6 9--6
; NOTE(review): 10111111b selects source lanes 3,3,3,2. If xmm1/xmm2 are
; unchanged since the psrldq above, the result is 0000 0000 0000 10--5,
; not "8--7 8--7" as the original comments said.
554 pshufd xmm3
, xmm1
, 10111111b ; 0000 0000 0000 10--5 squared
555 pshufd xmm4
, xmm2
, 10111111b ; 0000 0000 0000 10--5
575 movd xmm1
, DWORD PTR [rsi
+rcx
] ; xmm1 = 4 bytes at [rsi+rcx]
582 paddd xmm1
, [GLOBAL(four8s
)]
; add the four8s constant to each dword lane
; NOTE(review): the store below uses MMX register mm0 while the surrounding
; code is SSE2; where mm0 is loaded is not visible in this listing.
593 movd
[rsi
+rcx
-8], mm0
; write 4 bytes back in place at [rsi+rcx-8]
606 movsxd rax
, dword arg
(1) ; rax = pitch
609 sub dword arg
(2), 1 ;rows-=1
627 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
628 ; unsigned char blackclamp[16],
629 ; unsigned char whiteclamp[16],
630 ; unsigned char bothclamp[16],
631 ; unsigned int Width, unsigned int Height, int Pitch)
633 global sym
(vp8_plane_add_noise_wmt
)
634 sym
(vp8_plane_add_noise_wmt
):
; Arguments per the prototype comment above: arg(0)=Start, arg(1)=noise,
; arg(2)=blackclamp, arg(3)=whiteclamp, arg(4)=bothclamp, arg(5)=Width,
; arg(6)=Height, arg(7)=Pitch.
; For each line the visible code clamps 16 source bytes at a time, adds 16
; noise bytes with wrap-around (paddb) and stores the result in place.
; NOTE(review): this listing is not contiguous. The prologue, loop labels
; and the setup of rsi/rdi/rax are not visible here.
637 SHADOW_ARGS_TO_STACK
8
644 call sym
(rand
) WRT_PLT
; eax = rand() result (how it is used is not visible in this listing)
645 mov rcx
, arg
(1) ; rcx = noise
649 ; we rely on the fact that the clamping vectors are stored contiguously
650 ; in black/white/both order. Note that we have to reload this here because
651 ; rdx could be trashed by rand()
652 mov rdx
, arg
(2) ; rdx = blackclamp (whiteclamp at +16, bothclamp at +32)
656 movsxd rcx
, dword arg
(5) ; rcx = Width
661 movdqu xmm1
,[rsi
+rax
] ; get the source (16 bytes, unaligned load)
; The three clamp operands below are memory operands of legacy SSE
; instructions, so [rdx], [rdx+16] and [rdx+32] must be 16-byte aligned.
663 psubusb xmm1
, [rdx
] ;blackclamp ; clamp both sides so we don't outrange adding noise
664 paddusb xmm1
, [rdx
+32] ;bothclamp
665 psubusb xmm1
, [rdx
+16] ;whiteclamp
667 movdqu xmm2
,[rdi
+rax
] ; get the noise for this line
668 paddb xmm1
,xmm2
; add it in
669 movdqu
[rsi
+rax
],xmm1
; store the result
671 add rax
,16 ; advance to the next 16 bytes of this line
676 movsxd rax
, dword arg
(7) ; rax = Pitch
677 add arg
(0), rax
; Start += Pitch
678 sub dword arg
(6), 1 ; Height -= 1