clarify *_offsets.asm differences
[libvpx.git] / vp8 / common / x86 / postproc_sse2.asm
blob30b4bf53a3bfb3a9ff7c89c04bd1cb7d3f52ce12
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_xmm
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; Two-pass 5-tap smoothing, eight pixels per step.
;
;   down pass:   for every source pixel p, using the pixels one and two
;                rows above and below it,
;                    blur = (4*p + r-2 + r-1 + r+1 + r+2 + 4) >> 3
;                is written to dst, unless some |p - neighbour| is
;                greater than flimit, in which case p is copied as is.
;   across pass: the same filter is then run in place over the dst row
;                using the two pixels to the left and right.
;
; Register roles inside the loops:
;   rsi  = current source position      rdi = current destination row
;   rax  = source pitch (negated temporarily inside the down pass)
;   rcx  = rows still to do             rdx = column offset in the row
;   xmm0 = zero                         xmm2 = flimit in all 8 words
;   mm0  = across-pass output that has been computed but not stored yet
;
; Memory touched outside the nominal rows x cols rectangle (all visible
; in the addressing below): source rows -2..+2 around each row, the 8
; bytes in front of every dst row (read, then written back unchanged)
; and 2 bytes past the last processed dst column (read only).
; Columns are handled in groups of 8 with a do-while loop, so at least
; 8 columns are always processed and cols is effectively rounded up.
;
; NOTE(review): mm0 is used and no emms is issued here; presumably the
; caller clears the MMX state before any x87 code runs - confirm.
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_xmm)
sym(vp8_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0, [GLOBAL(rd42)]
    sub         rsp, 16
    movdqa      [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif

    ; broadcast the low 16 bits of flimit into all eight words of xmm2
    movd        xmm2, dword ptr arg(6)          ; flimit
    punpcklwd   xmm2, xmm2
    punpckldq   xmm2, xmm2
    punpcklqdq  xmm2, xmm2

    mov         rsi, arg(0)                     ; src_ptr
    mov         rdi, arg(1)                     ; dst_ptr

    movsxd      rcx, DWORD PTR arg(4)           ; rows
    movsxd      rax, DWORD PTR arg(2)           ; src_pixels_per_line
    pxor        xmm0, xmm0                      ; xmm0 = 0 (unpack/pack helper)

.nextrow:
    xor         rdx, rdx                        ; column offset = 0

    ;-------------------------------------------------------------------
    ; down pass: src -> dst, 8 pixels per iteration
    ;-------------------------------------------------------------------
.nextcol:
    movq        xmm3, QWORD PTR [rsi]           ; r0 p0..p7 (bytes)
    punpcklbw   xmm3, xmm0                      ; xmm3 = r0 as words
    movdqa      xmm1, xmm3                      ; xmm1 = r0 (kept for compare/select)
    psllw       xmm3, 2                         ; xmm3 = 4 * r0

    movq        xmm5, QWORD PTR [rsi + rax]     ; r+1 p0..p7
    punpcklbw   xmm5, xmm0                      ; xmm5 = r+1 as words
    paddusw     xmm3, xmm5                      ; sum += r+1

    ; thresholding
    movdqa      xmm7, xmm1                      ; xmm7 = r0
    psubusw     xmm7, xmm5                      ; xmm7 = sat(r0 - r+1)
    psubusw     xmm5, xmm1                      ; xmm5 = sat(r+1 - r0)
    paddusw     xmm7, xmm5                      ; xmm7 = abs(r0 - r+1)
    pcmpgtw     xmm7, xmm2                      ; mask = abs > flimit

    movq        xmm5, QWORD PTR [rsi + 2*rax]   ; r+2 p0..p7
    punpcklbw   xmm5, xmm0                      ; xmm5 = r+2 as words
    paddusw     xmm3, xmm5                      ; sum += r+2

    ; thresholding
    movdqa      xmm6, xmm1                      ; xmm6 = r0
    psubusw     xmm6, xmm5                      ; xmm6 = sat(r0 - r+2)
    psubusw     xmm5, xmm1                      ; xmm5 = sat(r+2 - r0)
    paddusw     xmm6, xmm5                      ; xmm6 = abs(r0 - r+2)
    pcmpgtw     xmm6, xmm2
    por         xmm7, xmm6                      ; accumulate thresholds

    neg         rax                             ; rax = -pitch: address rows above
    movq        xmm5, QWORD PTR [rsi+2*rax]     ; r-2 p0..p7
    punpcklbw   xmm5, xmm0                      ; xmm5 = r-2 as words
    paddusw     xmm3, xmm5                      ; sum += r-2

    ; thresholding
    movdqa      xmm6, xmm1                      ; xmm6 = r0
    psubusw     xmm6, xmm5                      ; xmm6 = sat(r0 - r-2)
    psubusw     xmm5, xmm1                      ; xmm5 = sat(r-2 - r0)
    paddusw     xmm6, xmm5                      ; xmm6 = abs(r0 - r-2)
    pcmpgtw     xmm6, xmm2
    por         xmm7, xmm6                      ; accumulate thresholds

    movq        xmm4, QWORD PTR [rsi+rax]       ; r-1 p0..p7
    punpcklbw   xmm4, xmm0                      ; xmm4 = r-1 as words
    paddusw     xmm3, xmm4                      ; sum += r-1

    ; thresholding
    movdqa      xmm6, xmm1                      ; xmm6 = r0
    psubusw     xmm6, xmm4                      ; xmm6 = sat(r0 - r-1)
    psubusw     xmm4, xmm1                      ; xmm4 = sat(r-1 - r0)
    paddusw     xmm6, xmm4                      ; xmm6 = abs(r0 - r-1)
    pcmpgtw     xmm6, xmm2
    por         xmm7, xmm6                      ; accumulate thresholds

    paddusw     xmm3, RD42                      ; sum += round value (4)
    psraw       xmm3, 3                         ; sum /= 8

    pand        xmm1, xmm7                      ; keep source where a threshold was exceeded
    pandn       xmm7, xmm3                      ; keep blurred value everywhere else
    paddusw     xmm1, xmm7                      ; combine (the two sets are disjoint)

    packuswb    xmm1, xmm0                      ; pack to bytes
    movq        QWORD PTR [rdi], xmm1

    neg         rax                             ; pitch is positive again
    add         rsi, 8
    add         rdi, 8

    add         rdx, 8
    cmp         edx, dword arg(5)               ; cols
    jl          .nextcol

    ;-------------------------------------------------------------------
    ; across pass: filter the dst row in place.
    ; Each result is held back in mm0 for one iteration so the next
    ; group still sees unfiltered left neighbours.
    ;-------------------------------------------------------------------
    sub         rsi, rdx                        ; back to the start of the row
    sub         rdi, rdx

    xor         rdx, rdx
    movq        mm0, QWORD PTR [rdi-8]          ; first deferred store rewrites these bytes unchanged

.acrossnextcol:
    movq        xmm7, QWORD PTR [rdi +rdx -2]   ; p-2..p5
    movd        xmm4, DWORD PTR [rdi +rdx +6]   ; p6..p9

    pslldq      xmm4, 8
    por         xmm4, xmm7                      ; xmm4 = p-2..p9 (12 bytes)

    movdqa      xmm3, xmm4
    psrldq      xmm3, 2
    punpcklbw   xmm3, xmm0                      ; xmm3 = p0..p7 as words
    movdqa      xmm1, xmm3                      ; xmm1 = p0..p7 (kept for compare/select)
    psllw       xmm3, 2                         ; xmm3 = 4 * p

    movdqa      xmm5, xmm4
    psrldq      xmm5, 3
    punpcklbw   xmm5, xmm0                      ; xmm5 = p1..p8
    paddusw     xmm3, xmm5                      ; sum += right neighbour

    ; thresholding
    movdqa      xmm7, xmm1                      ; xmm7 = p0..p7
    psubusw     xmm7, xmm5                      ; xmm7 = sat(p - p+1)
    psubusw     xmm5, xmm1                      ; xmm5 = sat(p+1 - p)
    paddusw     xmm7, xmm5                      ; xmm7 = abs(p - p+1)
    pcmpgtw     xmm7, xmm2

    movdqa      xmm5, xmm4
    psrldq      xmm5, 4
    punpcklbw   xmm5, xmm0                      ; xmm5 = p2..p9
    paddusw     xmm3, xmm5                      ; sum += second right neighbour

    ; thresholding
    movdqa      xmm6, xmm1                      ; xmm6 = p0..p7
    psubusw     xmm6, xmm5                      ; xmm6 = sat(p - p+2)
    psubusw     xmm5, xmm1                      ; xmm5 = sat(p+2 - p)
    paddusw     xmm6, xmm5                      ; xmm6 = abs(p - p+2)
    pcmpgtw     xmm6, xmm2
    por         xmm7, xmm6                      ; accumulate thresholds

    movdqa      xmm5, xmm4                      ; xmm5 = p-2..p9
    punpcklbw   xmm5, xmm0                      ; xmm5 = p-2..p5
    paddusw     xmm3, xmm5                      ; sum += second left neighbour

    ; thresholding
    movdqa      xmm6, xmm1                      ; xmm6 = p0..p7
    psubusw     xmm6, xmm5                      ; xmm6 = sat(p - p-2)
    psubusw     xmm5, xmm1                      ; xmm5 = sat(p-2 - p)
    paddusw     xmm6, xmm5                      ; xmm6 = abs(p - p-2)
    pcmpgtw     xmm6, xmm2
    por         xmm7, xmm6                      ; accumulate thresholds

    psrldq      xmm4, 1                         ; xmm4 = p-1..p9
    punpcklbw   xmm4, xmm0                      ; xmm4 = p-1..p6
    paddusw     xmm3, xmm4                      ; sum += left neighbour

    ; thresholding
    movdqa      xmm6, xmm1                      ; xmm6 = p0..p7
    psubusw     xmm6, xmm4                      ; xmm6 = sat(p - p-1)
    psubusw     xmm4, xmm1                      ; xmm4 = sat(p-1 - p)
    paddusw     xmm6, xmm4                      ; xmm6 = abs(p - p-1)
    pcmpgtw     xmm6, xmm2
    por         xmm7, xmm6                      ; accumulate thresholds

    paddusw     xmm3, RD42                      ; sum += round value (4)
    psraw       xmm3, 3                         ; sum /= 8

    pand        xmm1, xmm7                      ; keep source where a threshold was exceeded
    pandn       xmm7, xmm3                      ; keep blurred value everywhere else
    paddusw     xmm1, xmm7                      ; combine

    packuswb    xmm1, xmm0                      ; pack to bytes
    movq        QWORD PTR [rdi+rdx-8], mm0      ; store the previous group of 8 pixels
    movdq2q     mm0, xmm1                       ; defer this group until the next iteration

    add         rdx, 8
    cmp         edx, dword arg(5)               ; cols
    jl          .acrossnextcol

    ; last 8 pixels
    movq        QWORD PTR [rdi+rdx-8], mm0

    ; done with this row
    add         rsi, rax                        ; next source line
    ; Sign-extend both pitches, exactly like the initial load above, so
    ; that rax is a valid 64-bit offset even for negative strides.
    movsxd      rax, dword ptr arg(3)           ; dst_pixels_per_line
    add         rdi, rax                        ; next destination line
    movsxd      rax, dword ptr arg(2)           ; src_pixels_per_line

    dec         rcx                             ; decrement row count
    jnz         .nextrow                        ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp, 16                         ; drop the stack copy of rd42
    pop         rsp                             ; undo ALIGN_STACK
%endif
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42
;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                              int pitch, int rows, int cols,int flimit)
;
; Vertical, in-place, variance-gated smoothing; one strip of 8 columns
; at a time.  For every pixel a 15-row window (7 above .. 7 below) is
; tracked incrementally as
;       sum   (xmm5, 8 words)   and   sumsq (xmm6/xmm7, 2 x 4 dwords).
; Where  15*sumsq - sum*sum - flimit  is negative the pixel is replaced
; by  (sum + pixel + vp8_rv[row & 127 ...]) >> 4 ; otherwise it is kept.
;
; Results are parked in a 16-entry ring buffer of 8-byte rows on the
; stack (d[16][8] at [rsp]) and written back 8 rows late, so the window
; of later rows still reads unfiltered data.  rows is bumped by 8 so
; that the last 8 parked rows get flushed.
;
; Register roles in the row loop:
;   rsi = row r-8 (write-back target)   rdi = row r+7 (entering window)
;   rax = pitch                         rdx = row counter r
;   rcx = scratch index                 r8  = &vp8_rv (64-bit builds)
;
; NOTE(review): the window start-up and the 8-row delay read rows
; -8..-1 and up to rows+7+7 relative to dst, and the first 8 iterations
; store ring-buffer slots that have not been written yet into rows
; -8..-1.  Presumably the caller provides a border there - confirm.
; NOTE(review): mm0 is used and no emms is issued here; presumably the
; caller clears the MMX state - confirm.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm)
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 (flimit in four dwords) at [rsp+128]
    mov         eax, dword ptr arg(4)           ; flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp8_rv))]       ; table base kept in r8 for the whole call
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
    mov         rsi, arg(0)                     ; s
    pxor        xmm0, xmm0                      ; xmm0 = 0 (unpack/pack helper)

    movsxd      rax, dword ptr arg(1)           ; pitch
    neg         rax                             ; rax = -pitch

    lea         rsi, [rsi + rax*8]              ; rsi = s - 8*pitch
    neg         rax                             ; rax = +pitch again

    pxor        xmm5, xmm5                      ; sum   = 0
    pxor        xmm6, xmm6                      ; sumsq = 0 (columns 0..3)

    pxor        xmm7, xmm7                      ; sumsq = 0 (columns 4..7)
    mov         rdi, rsi

    mov         rcx, 15                         ; prime the window with rows -8..6

.loop_initvar:
    movq        xmm1, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm0                      ; 8 pixels as words

    paddw       xmm5, xmm1                      ; sum += pixel
    pmullw      xmm1, xmm1                      ; pixel^2 (fits in 16 bits)

    movdqa      xmm2, xmm1
    punpcklwd   xmm1, xmm0                      ; squares of columns 0..3 as dwords

    punpckhwd   xmm2, xmm0                      ; squares of columns 4..7 as dwords
    paddd       xmm6, xmm1

    paddd       xmm7, xmm2
    lea         rdi, [rdi+rax]                  ; next row

    dec         rcx
    jne         .loop_initvar
    ; here rsi = s - 8*pitch and rdi = s + 7*pitch
    xor         rdx, rdx                        ; r = 0

.loop_row:
    movq        xmm1, QWORD PTR [rsi]           ; row r-8: leaves the window
    movq        xmm2, QWORD PTR [rdi]           ; row r+7: enters the window

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    paddw       xmm5, xmm2                      ; sum += entering
    psubw       xmm5, xmm1                      ; sum -= leaving

    pmullw      xmm2, xmm2                      ; entering^2
    movdqa      xmm4, xmm2

    punpcklwd   xmm2, xmm0
    punpckhwd   xmm4, xmm0

    paddd       xmm6, xmm2                      ; sumsq += entering^2
    paddd       xmm7, xmm4

    pmullw      xmm1, xmm1                      ; leaving^2
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm0
    psubd       xmm6, xmm1                      ; sumsq -= leaving^2

    punpckhwd   xmm2, xmm0
    psubd       xmm7, xmm2

    movdqa      xmm3, xmm6
    pslld       xmm3, 4

    psubd       xmm3, xmm6                      ; xmm3 = 15*sumsq (columns 0..3)
    movdqa      xmm1, xmm5

    movdqa      xmm4, xmm5
    pmullw      xmm1, xmm1                      ; low  16 bits of sum*sum

    pmulhw      xmm4, xmm4                      ; high 16 bits of sum*sum
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm4                      ; sum*sum as dwords, columns 0..3
    punpckhwd   xmm2, xmm4                      ; sum*sum as dwords, columns 4..7

    movdqa      xmm4, xmm7
    pslld       xmm4, 4

    psubd       xmm4, xmm7                      ; xmm4 = 15*sumsq (columns 4..7)

    psubd       xmm3, xmm1                      ; 15*sumsq - sum*sum
    psubd       xmm4, xmm2

    psubd       xmm3, flimit4                   ; ... - flimit
    psubd       xmm4, flimit4

    psrad       xmm3, 31                        ; all ones where the result is negative
    psrad       xmm4, 31

    packssdw    xmm3, xmm4                      ; mask as 8 words
    packsswb    xmm3, xmm0                      ; mask as 8 bytes

    movq        xmm1, QWORD PTR [rsi+rax*8]     ; row r: the pixels being filtered

    movq        xmm2, xmm1                      ; keep the unfiltered bytes
    punpcklbw   xmm1, xmm0

    paddw       xmm1, xmm5                      ; pixel + sum
    mov         rcx, rdx

    and         rcx, 127                        ; table index = r & 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax                             ; no spare register: borrow rax
    lea         rax, [GLOBAL(sym(vp8_rv))]
    movdqu      xmm4, [rax + rcx*2]             ;vp8_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movdqu      xmm4, [r8 + rcx*2]              ;vp8_rv[rcx*2]
%else
    movdqu      xmm4, [sym(vp8_rv) + rcx*2]
%endif

    paddw       xmm1, xmm4                      ; + per-row value from vp8_rv
    ;paddw     xmm1, eight8s
    psraw       xmm1, 4

    packuswb    xmm1, xmm0
    pand        xmm1, xmm3                      ; filtered value where mask is set

    pandn       xmm3, xmm2                      ; original value elsewhere
    por         xmm1, xmm3

    and         rcx, 15
    movq        QWORD PTR [rsp + rcx*8], xmm1   ; d[r & 15] = result for row r

    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    movq        mm0, [rsp + rcx*8]              ; d[(r-8) & 15]

    movq        [rsi], mm0                      ; write back row r-8
    lea         rsi, [rsi+rax]

    lea         rdi, [rdi+rax]
    add         rdx, 1

    cmp         edx, dword arg(2)               ; rows (already +8)
    jl          .loop_row

    ; s += 8.  Done at full register width: arg(0) holds a pointer, and
    ; a dword-sized add would drop the carry into its upper half on
    ; 64-bit targets.
    mov         rcx, 8
    add         arg(0), rcx
    sub         dword arg(3), 8                 ; cols -= 8
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 128+16
    pop         rsp                             ; undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
;-----------------------------------------------------------------------
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                   int pitch, int rows, int cols,int flimit)
;
; Horizontal, in-place, variance-gated smoothing; four pixels per step.
; For every pixel a 15-pixel window (7 left .. 7 right) is tracked as
;       sum (xmm6) and sumsq (xmm7), one dword lane per pixel.
; Where  15*sumsq - sum*sum - flimit  is negative the pixel is replaced
; by  (sum + pixel + 8) >> 4 ; otherwise it is kept.
;
; Each group of four results travels through mm1 -> mm0 and is stored
; two iterations (8 pixels) late, so later windows still read
; unfiltered data.  The column loop therefore runs to cols+8.
;
; Register roles in the column loop:
;   rsi = start of the current row      rcx = column offset
;   rdx = cols + 8 (loop bound)         xmm0 = zero
;
; NOTE(review): mm0/mm1 start as zero, so the first two iterations
; store zeros to s[-8..-1]; the loop also reads s[-8..] and up to about
; 18 bytes past cols.  Presumably the caller provides a border there -
; confirm.
; NOTE(review): MMX registers are used and no emms is issued here;
; presumably the caller clears the MMX state - confirm.
;-----------------------------------------------------------------------
global sym(vp8_mbpost_proc_across_ip_xmm)
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 (flimit in four dwords) at [rsp]
    mov         eax, dword ptr arg(4)           ; flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]

    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rdx, rdx                        ; sumsq=0;
    xor         rcx, rcx                        ; sum=0;
    mov         rsi, arg(0)                     ; s
    mov         rdi, -8
.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;   sumsq += s[i]*s[i];
    ;   sum   += s[i];
    ;}
    movzx       eax, byte [rsi+rdi]
    add         ecx, eax
    mul         al                              ; ax = s[i]*s[i]; upper half of eax is still 0
    add         edx, eax
    add         rdi, 1
    cmp         rdi, 6
    jle         .ip_var_loop

    ;mov         rax,    sumsq
    ;movd        xmm7,   rax
    movd        xmm7, edx                       ; lane 0 = sumsq over s[-8..6]

    ;mov         rax,    sum
    ;movd        xmm6,   rax
    movd        xmm6, ecx                       ; lane 0 = sum over s[-8..6]

    mov         rsi, arg(0)                     ; s
    xor         rcx, rcx                        ; column offset = 0

    movsxd      rdx, dword arg(3)               ; cols
    add         rdx, 8                          ; +8 to flush the two deferred groups
    pxor        mm0, mm0
    pxor        mm1, mm1

    pxor        xmm0, xmm0
.nextcol4:

    movd        xmm1, DWORD PTR [rsi+rcx-8]     ; old: -8 -7 -6 -5 (leaving the windows)
    movd        xmm2, DWORD PTR [rsi+rcx+7]     ; new: +7 +8 +9 +10 (entering the windows)

    punpcklbw   xmm1, xmm0                      ; expanding
    punpcklbw   xmm2, xmm0                      ; expanding

    punpcklwd   xmm1, xmm0                      ; expanding to dwords
    punpcklwd   xmm2, xmm0                      ; expanding to dwords

    psubd       xmm2, xmm1                      ; d  = new - old
    paddd       xmm1, xmm1                      ; 2*old

    paddd       xmm1, xmm2                      ; new + old
    pmaddwd     xmm1, xmm2                      ; dq = (new+old)*(new-old) = new^2 - old^2

    paddd       xmm6, xmm2                      ; lane 0: sum   += d[0]
    paddd       xmm7, xmm1                      ; lane 0: sumsq += dq[0]

    pshufd      xmm6, xmm6, 0                   ; broadcast lane 0 to all four lanes
    pshufd      xmm7, xmm7, 0

    psrldq      xmm1, 4                         ; dq[1] dq[2] dq[3] 0
    psrldq      xmm2, 4                         ; d[1]  d[2]  d[3]  0

    ; build prefix sums: lane k must contain the deltas 0..k
    pshufd      xmm3, xmm1, 3                   ; 0 dq[1] dq[1] dq[1]
    pshufd      xmm4, xmm2, 3                   ; 0 d[1]  d[1]  d[1]

    paddd       xmm6, xmm4
    paddd       xmm7, xmm3

    pshufd      xmm3, xmm1, 01011111b           ; 0 0 dq[2] dq[2]
    pshufd      xmm4, xmm2, 01011111b           ; 0 0 d[2]  d[2]

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    pshufd      xmm3, xmm1, 10111111b           ; 0 0 0 dq[3]
    pshufd      xmm4, xmm2, 10111111b           ; 0 0 0 d[3]

    paddd       xmm7, xmm3
    paddd       xmm6, xmm4

    movdqa      xmm3, xmm6
    pmaddwd     xmm3, xmm3                      ; sum*sum

    movdqa      xmm5, xmm7
    pslld       xmm5, 4

    psubd       xmm5, xmm7                      ; 15*sumsq
    psubd       xmm5, xmm3                      ; - sum*sum

    psubd       xmm5, flimit4                   ; - flimit
    psrad       xmm5, 31                        ; all ones where the result is negative

    packssdw    xmm5, xmm0
    packsswb    xmm5, xmm0                      ; mask as 4 bytes

    movd        xmm1, DWORD PTR [rsi+rcx]       ; the four pixels being filtered
    movq        xmm2, xmm1                      ; keep the unfiltered bytes

    punpcklbw   xmm1, xmm0
    punpcklwd   xmm1, xmm0

    paddd       xmm1, xmm6                      ; pixel + sum
    paddd       xmm1, [GLOBAL(four8s)]          ; + 8 (rounding)

    psrad       xmm1, 4
    packssdw    xmm1, xmm0

    packuswb    xmm1, xmm0
    pand        xmm1, xmm5                      ; filtered value where mask is set

    pandn       xmm5, xmm2                      ; original value elsewhere
    por         xmm5, xmm1

    movd        [rsi+rcx-8], mm0                ; store the group from two iterations ago
    movq        mm0, mm1

    movdq2q     mm1, xmm5                       ; queue the current group
    psrldq      xmm7, 12                        ; lane 3 becomes the next lane 0

    psrldq      xmm6, 12
    add         rcx, 4

    cmp         rcx, rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax, dword arg(1)
    add         arg(0), rax

    sub         dword arg(2), 1                 ; rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    add         rsp, 16
    pop         rsp                             ; undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
;-----------------------------------------------------------------------
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; Adds noise to a plane in place.  For every row a start offset of
; rand() & 0xff into the noise buffer is chosen; then, 16 bytes at a
; time, each pixel is clamped with
;       p = sat_sub(p, blackclamp); p = sat_add(p, bothclamp);
;       p = sat_sub(p, whiteclamp)
; and the noise byte is added with wrap-around arithmetic (paddb).
;
; Only arg(2) (blackclamp) is dereferenced: the three clamp vectors are
; addressed as [rdx], [rdx+16] and [rdx+32], so they must be laid out
; contiguously in black/white/both order; whiteclamp/bothclamp pointer
; arguments are not read.
;
; NOTE(review): the clamp vectors are used as direct memory operands of
; SSE2 instructions, which requires 16-byte alignment - confirm at the
; call sites.
; NOTE(review): Width is effectively rounded up to a multiple of 16 and
; the noise buffer is read up to 255 + that many bytes - confirm the
; buffers are large enough.
; NOTE(review): rand() is called without any explicit stack alignment
; or home-space setup in this function - confirm this is acceptable for
; every target ABI.
;-----------------------------------------------------------------------
extern sym(rand)
global sym(vp8_plane_add_noise_wmt)
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call        sym(rand) WRT_PLT
    mov         rcx, arg(1)                     ; noise
    and         rax, 0xff                       ; random offset 0..255
    add         rcx, rax                        ; rcx = noise + offset

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov         rdx, arg(2)                     ; blackclamp

    mov         rdi, rcx                        ; rdi = noise for this row
    movsxd      rcx, dword arg(5)               ; [Width]
    mov         rsi, arg(0)                     ; Pos
    xor         rax, rax                        ; byte offset within the row

.addnoise_nextset:
    movdqu      xmm1, [rsi+rax]                 ; get the source

    psubusb     xmm1, [rdx]                     ; blackclamp: clamp both sides so we don't outrange adding noise
    paddusb     xmm1, [rdx+32]                  ; bothclamp
    psubusb     xmm1, [rdx+16]                  ; whiteclamp

    movdqu      xmm2, [rdi+rax]                 ; get the noise for these 16 pixels
    paddb       xmm1, xmm2                      ; add it in
    movdqu      [rsi+rax], xmm1                 ; store the result

    add         rax, 16                         ; move to the next 16 pixels

    cmp         rax, rcx
    jl          .addnoise_nextset

    movsxd      rax, dword arg(7)               ; Pitch
    add         arg(0), rax                     ; Start += Pitch
    sub         dword arg(6), 1                 ; Height -= 1
    jg          .addnoise_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
; Read-only constants.  Both are used as 16-byte memory operands of
; SSE2 instructions, so they must stay 16-byte aligned: rd42 occupies
; exactly 16 bytes, which keeps four8s aligned as well.
SECTION_RODATA
align 16
rd42:                                           ; rounding term (4) for the ">> 3" filters, 8 words
    times 8 dw 0x04
four8s:                                         ; rounding term (8) for the ">> 4" filter, 4 dwords
    times 4 dd 8