Add save/restore xmm registers in x86 assembly code
[libvpx.git] / vp8 / common / x86 / postproc_mmx.asm
blob787e832687bef0dd4544239d6067acbad0875dbf
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; Fixed-point scale of the blur kernel: filtered sums are shifted right by
; VP8_FILTER_SHIFT bits, i.e. divided by VP8_FILTER_WEIGHT (128).
%define VP8_FILTER_SHIFT    7
%define VP8_FILTER_WEIGHT   (1 << VP8_FILTER_SHIFT)
;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For each of `rows` rows:
;   1. "down" pass  - 5-tap vertical filter over source rows -2..+2,
;                     written to dst_ptr four pixels at a time;
;   2. "across" pass - 5-tap horizontal filter over pixels -2..+2 of the
;                     row just written, applied in place in dst_ptr.
; The taps come from the Blur table (one tap every 16 bytes); the sum is
; rounded with RD and shifted right by VP8_FILTER_SHIFT.  A pixel only takes
; the filtered value when its absolute difference to each of the four
; neighbours is <= flimit; otherwise the unfiltered pixel is kept.
;
; Register roles inside the loops:
;   rsi = source row          rdi = destination row
;   rax = source pitch (temporarily negated in the down pass; eax is reused
;         as the pending-store scratch in the across pass)
;   rbx = Blur table base     rcx = rows remaining     rdx = column offset
;   mm0 = 0                   mm2 = flimit replicated into four words
;
; Invariant: between the prolog and the epilog rsp must not move, because
; in the 32-bit PIC build RD is addressed as [rsp].
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_mmx)
sym(vp8_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; rbx is repurposed as the Blur table base below, so save it first.
    ; This has to happen before the rd copy is placed on the stack so that
    ; [rsp] keeps pointing at that copy.
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movq        mm0, [GLOBAL(rd)]
    sub         rsp, 8
    movq        [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

    lea         rbx, [GLOBAL(Blur)]
    movd        mm2, dword ptr arg(6)       ; flimit
    punpcklwd   mm2, mm2
    punpckldq   mm2, mm2                    ; mm2 = flimit in all four words

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(1)                 ; dst_ptr

    movsxd      rcx, DWORD PTR arg(4)       ; rows
    movsxd      rax, DWORD PTR arg(2)       ; src_pixels_per_line
    pxor        mm0, mm0                    ; mm0 = 00000000

nextrow:
    xor         rdx, rdx                    ; rdx = column offset in this row

    ;-------------------------------------------------------------------
    ; down pass: four pixels per iteration
    ;-------------------------------------------------------------------
nextcol:
    pxor        mm7, mm7                    ; mm7 = 00000000
    movq        mm6, [rbx + 32]             ; mm6 = kernel 2 (centre) taps
    movq        mm3, [rsi]                  ; mm3 = r0 p0..p7
    punpcklbw   mm3, mm0                    ; mm3 = r0 p0..p3 as words
    movq        mm1, mm3                    ; mm1 = unfiltered r0 p0..p3
    pmullw      mm3, mm6                    ; mm3 = r0 * kernel 2

    movq        mm6, [rbx + 48]             ; mm6 = kernel 3 taps
    movq        mm5, [rsi + rax]            ; mm5 = r1 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r1 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r1 * kernel 3
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm7, mm1                    ; mm7 = r0 p0..p3
    psubusw     mm7, mm5                    ; mm7 = r0 - r1 (saturated)
    psubusw     mm5, mm1                    ; mm5 = r1 - r0 (saturated)
    paddusw     mm7, mm5                    ; mm7 = abs(r0 - r1)
    pcmpgtw     mm7, mm2                    ; mm7 = mask of words > flimit

    movq        mm6, [rbx + 64]             ; mm6 = kernel 4 taps
    movq        mm5, [rsi + 2*rax]          ; mm5 = r2 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r2 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r2 * kernel 4
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0 p0..p3
    psubusw     mm6, mm5                    ; mm6 = r0 - r2 (saturated)
    psubusw     mm5, mm1                    ; mm5 = r2 - r0 (saturated)
    paddusw     mm6, mm5                    ; mm6 = abs(r0 - r2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    neg         rax                         ; rax = -pitch for the rows above
    movq        mm6, [rbx]                  ; mm6 = kernel 0 taps
    movq        mm5, [rsi + 2*rax]          ; mm5 = r-2 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r-2 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r-2 * kernel 0
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0 p0..p3
    psubusw     mm6, mm5                    ; mm6 = r0 - r-2 (saturated)
    psubusw     mm5, mm1                    ; mm5 = r-2 - r0 (saturated)
    paddusw     mm6, mm5                    ; mm6 = abs(r0 - r-2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    movq        mm6, [rbx + 16]             ; mm6 = kernel 1 taps
    movq        mm4, [rsi + rax]            ; mm4 = r-1 p0..p7
    punpcklbw   mm4, mm0                    ; mm4 = r-1 p0..p3
    pmullw      mm6, mm4                    ; mm6 = r-1 * kernel 1
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0 p0..p3
    psubusw     mm6, mm4                    ; mm6 = r0 - r-1 (saturated)
    psubusw     mm4, mm1                    ; mm4 = r-1 - r0 (saturated)
    paddusw     mm6, mm4                    ; mm6 = abs(r0 - r-1)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    paddusw     mm3, RD                     ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 /= 128

    pand        mm1, mm7                    ; keep source where a diff > flimit
    pandn       mm7, mm3                    ; keep blurred result elsewhere
    paddusw     mm1, mm7                    ; combination

    packuswb    mm1, mm0                    ; pack to bytes

    movd        [rdi], mm1                  ; store four filtered pixels
    neg         rax                         ; pitch is positive again

    add         rsi, 4
    add         rdi, 4
    add         rdx, 4

    cmp         edx, dword ptr arg(5)       ; cols
    jl          nextcol

    ; done with all the cols, start the across filtering in place
    sub         rsi, rdx                    ; back to the start of the row
    sub         rdi, rdx

    ;-------------------------------------------------------------------
    ; across pass: the store of each four-pixel group is delayed by one
    ; iteration (held in eax) so that the next group still reads the
    ; unfiltered p-2/p-1 neighbours.
    ;-------------------------------------------------------------------
    xor         rdx, rdx
    mov         eax, DWORD PTR [rdi-4]      ; first "pending" group = the
                                            ; four bytes left of the row,
                                            ; rewritten unchanged below

acrossnextcol:
    pxor        mm7, mm7                    ; mm7 = 00000000
    movq        mm6, [rbx + 32]             ; mm6 = kernel 2 (centre) taps
    movq        mm4, [rdi + rdx]            ; mm4 = p0..p7
    movq        mm3, mm4                    ; mm3 = p0..p7
    punpcklbw   mm3, mm0                    ; mm3 = p0..p3
    movq        mm1, mm3                    ; mm1 = unfiltered p0..p3
    pmullw      mm3, mm6                    ; mm3 = p0..p3 * kernel 2

    movq        mm6, [rbx + 48]             ; mm6 = kernel 3 taps
    psrlq       mm4, 8                      ; mm4 = p1..p7
    movq        mm5, mm4                    ; mm5 = p1..p7
    punpcklbw   mm5, mm0                    ; mm5 = p1..p4
    pmullw      mm6, mm5                    ; mm6 = p1..p4 * kernel 3
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm7, mm1                    ; mm7 = p0..p3
    psubusw     mm7, mm5                    ; mm7 = p0..p3 - p1..p4
    psubusw     mm5, mm1                    ; mm5 = p1..p4 - p0..p3
    paddusw     mm7, mm5                    ; mm7 = abs(p0..p3 - p1..p4)
    pcmpgtw     mm7, mm2

    movq        mm6, [rbx + 64]             ; mm6 = kernel 4 taps
    psrlq       mm4, 8                      ; mm4 = p2..p7
    movq        mm5, mm4                    ; mm5 = p2..p7
    punpcklbw   mm5, mm0                    ; mm5 = p2..p5
    pmullw      mm6, mm5                    ; mm6 = p2..p5 * kernel 4
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm5                    ; mm6 = p0..p3 - p2..p5
    psubusw     mm5, mm1                    ; mm5 = p2..p5 - p0..p3
    paddusw     mm6, mm5                    ; mm6 = abs(p0..p3 - p2..p5)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    movq        mm6, [rbx]                  ; mm6 = kernel 0 taps
    movq        mm4, [rdi + rdx - 2]        ; mm4 = p-2..p5
    movq        mm5, mm4                    ; mm5 = p-2..p5
    punpcklbw   mm5, mm0                    ; mm5 = p-2..p1
    pmullw      mm6, mm5                    ; mm6 = p-2..p1 * kernel 0
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm5                    ; mm6 = p0..p3 - p-2..p1
    psubusw     mm5, mm1                    ; mm5 = p-2..p1 - p0..p3
    paddusw     mm6, mm5                    ; mm6 = abs(p0..p3 - p-2..p1)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    movq        mm6, [rbx + 16]             ; mm6 = kernel 1 taps
    psrlq       mm4, 8                      ; mm4 = p-1..p5
    punpcklbw   mm4, mm0                    ; mm4 = p-1..p2
    pmullw      mm6, mm4                    ; mm6 = p-1..p2 * kernel 1
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm4                    ; mm6 = p0..p3 - p-1..p2
    psubusw     mm4, mm1                    ; mm4 = p-1..p2 - p0..p3
    paddusw     mm6, mm4                    ; mm6 = abs(p0..p3 - p-1..p2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    paddusw     mm3, RD                     ; mm3 += round value
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 /= 128

    pand        mm1, mm7                    ; keep source where a diff > flimit
    pandn       mm7, mm3                    ; keep blurred result elsewhere
    paddusw     mm1, mm7                    ; combination

    packuswb    mm1, mm0                    ; pack to bytes
    mov         DWORD PTR [rdi+rdx-4], eax  ; store previous four bytes
    movd        eax, mm1                    ; current group becomes pending

    add         rdx, 4
    cmp         edx, dword ptr arg(5)       ; cols
    jl          acrossnextcol

    mov         DWORD PTR [rdi+rdx-4], eax  ; flush the last pending group

    ; done with this row
    movsxd      rax, dword ptr arg(2)       ; src_pixels_per_line (eax was
                                            ; scratch in the across pass)
    add         rsi, rax                    ; next source line
    movsxd      rax, dword ptr arg(3)       ; dst_pixels_per_line
    add         rdi, rax                    ; next destination line
    movsxd      rax, dword ptr arg(2)       ; rax = source pitch for nextrow

    dec         rcx                         ; decrement row count
    jnz         nextrow                     ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp, 8                      ; release the stack copy of rd
%endif
    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD
;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Works on strips of four columns.  For each strip a running sum (mm5, four
; words) and running sum of squares (mm6/mm7, four dwords) are kept over a
; 15-row vertical window.  For the pixel in the middle of the window:
;     if (15 * sumsq - sum * sum < flimit)
;         pixel = (sum + pixel + vp8_rv[row & 127 ...]) >> 4
; otherwise the pixel is left as it is.  Results go through a 16-entry
; ring buffer on the stack and are written back eight rows late, so the
; window always reads unfiltered rows.
;
; The arguments `dst` (advanced by 4 per strip), `rows` (+8 once) and `cols`
; (-4 per strip) are modified in their stack slots.
;
; NOTE(review): during the first eight iterations of loop_row the ring
; buffer slots that are read have not been written yet and are stored to
; the eight rows above dst - confirm the caller treats those as scratch.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx)
sym(vp8_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; ring buffer d (16 entries of 4 bytes are used) at [rsp]
    ; create flimit2 at [rsp+128]: flimit in both dwords
    mov         eax, dword ptr arg(4)       ; flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp8_rv))]   ; r8 = vp8_rv table base
%endif

    ; rows += 8;
    add         dword ptr arg(2), 8

    ; for (c = 0; c < cols; c += 4)
loop_col:
    mov         rsi, arg(0)                 ; s
    pxor        mm0, mm0                    ; mm0 = 0

    movsxd      rax, dword ptr arg(1)       ; pitch
    neg         rax                         ; rax = -pitch

    lea         rsi, [rsi + rax*8]          ; rsi = s - 8 * pitch
    neg         rax                         ; rax = pitch

    pxor        mm5, mm5                    ; mm5 = running sum (4 words)
    pxor        mm6, mm6                    ; mm6 = sum of squares, cols 0-1
    pxor        mm7, mm7                    ; mm7 = sum of squares, cols 2-3
    mov         rdi, rsi

    mov         rcx, 15                     ; prime the window with 15 rows

loop_initvar:
    movd        mm1, DWORD PTR [rdi]
    punpcklbw   mm1, mm0                    ; four pixels as words

    paddw       mm5, mm1                    ; sum += x
    pmullw      mm1, mm1                    ; x * x

    movq        mm2, mm1
    punpcklwd   mm1, mm0                    ; squares of cols 0-1 as dwords
    punpckhwd   mm2, mm0                    ; squares of cols 2-3 as dwords

    paddd       mm6, mm1                    ; sumsq += x * x
    paddd       mm7, mm2

    lea         rdi, [rdi + rax]            ; next row

    dec         rcx
    jne         loop_initvar

    ; here rsi = s - 8 * pitch and rdi = s + 7 * pitch
    xor         rdx, rdx                    ; rdx = row counter

loop_row:
    movd        mm1, DWORD PTR [rsi]        ; row leaving the window
    movd        mm2, DWORD PTR [rdi]        ; row entering the window

    punpcklbw   mm1, mm0
    punpcklbw   mm2, mm0

    paddw       mm5, mm2                    ; sum += entering
    psubw       mm5, mm1                    ; sum -= leaving

    pmullw      mm2, mm2                    ; entering^2
    movq        mm4, mm2

    punpcklwd   mm2, mm0
    punpckhwd   mm4, mm0

    paddd       mm6, mm2                    ; sumsq += entering^2
    paddd       mm7, mm4

    pmullw      mm1, mm1                    ; leaving^2
    movq        mm2, mm1

    punpcklwd   mm1, mm0
    psubd       mm6, mm1                    ; sumsq -= leaving^2

    punpckhwd   mm2, mm0
    psubd       mm7, mm2

    movq        mm3, mm6
    pslld       mm3, 4
    psubd       mm3, mm6                    ; mm3 = 15 * sumsq (cols 0-1)

    movq        mm1, mm5
    movq        mm4, mm5
    pmullw      mm1, mm1                    ; low  16 bits of sum * sum
    pmulhw      mm4, mm4                    ; high 16 bits of sum * sum
    movq        mm2, mm1

    punpcklwd   mm1, mm4                    ; sum * sum as dwords, cols 0-1
    punpckhwd   mm2, mm4                    ; sum * sum as dwords, cols 2-3

    movq        mm4, mm7
    pslld       mm4, 4
    psubd       mm4, mm7                    ; mm4 = 15 * sumsq (cols 2-3)

    psubd       mm3, mm1                    ; 15 * sumsq - sum * sum
    psubd       mm4, mm2

    psubd       mm3, flimit2                ; ... - flimit
    psubd       mm4, flimit2

    psrad       mm3, 31                     ; all ones where the result < 0
    psrad       mm4, 31

    packssdw    mm3, mm4
    packsswb    mm3, mm0                    ; mm3 = per-pixel byte mask

    movd        mm1, DWORD PTR [rsi+rax*8]  ; pixels at the window centre

    movq        mm2, mm1                    ; mm2 = unfiltered bytes
    punpcklbw   mm1, mm0

    paddw       mm1, mm5                    ; pixel + sum
    mov         rcx, rdx

    and         rcx, 127                    ; index into vp8_rv
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax, [GLOBAL(sym(vp8_rv))]
    movq        mm4, [rax + rcx*2]          ; four words from vp8_rv[rcx]
    pop         rax
%elif ABI_IS_32BIT=0
    movq        mm4, [r8 + rcx*2]           ; four words from vp8_rv[rcx]
%else
    movq        mm4, [sym(vp8_rv) + rcx*2]
%endif
    paddw       mm1, mm4                    ; + rounding/dither words
    psraw       mm1, 4                      ; / 16

    packuswb    mm1, mm0
    pand        mm1, mm3                    ; filtered value where mask set

    pandn       mm3, mm2                    ; original value elsewhere
    por         mm1, mm3

    and         rcx, 15
    movd        DWORD PTR [rsp+rcx*4], mm1  ; d[row & 15] = result

    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    movd        mm1, DWORD PTR [rsp+rcx*4]  ; d[(row - 8) & 15]

    movd        [rsi], mm1                  ; write back eight rows late
    lea         rsi, [rsi+rax]

    lea         rdi, [rdi+rax]
    add         rdx, 1

    cmp         edx, dword arg(2)           ; rows (already += 8)
    jl          loop_row

    ; s += 4, done at full register width: arg(0) holds a pointer, so a
    ; dword add would drop the carry on 64-bit targets.  rax is reloaded at
    ; loop_col, so it is free here.
    mov         rax, 4
    add         arg(0), rax
    sub         dword arg(3), 4             ; cols -= 4
    cmp         dword arg(3), 0
    jg          loop_col

    add         rsp, 136
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2
;-----------------------------------------------------------------------
;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height rows: pick a starting offset of (rand() & 0xff) bytes
; into `noise`, then, eight bytes at a time, clamp the pixels with the
; black/both/white clamp vectors and add the noise bytes (wrapping add).
; Pixels are processed in groups of eight until the byte offset reaches
; Width, so a Width that is not a multiple of 8 is rounded up.
;
; Only the blackclamp pointer is dereferenced; whiteclamp and bothclamp are
; read at blackclamp+16 and blackclamp+32.
;
; The `Start` and `Height` arguments are updated in their stack slots.
;-----------------------------------------------------------------------
extern sym(rand)
global sym(vp8_plane_add_noise_mmx)
sym(vp8_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

addnoise_loop:
    call        sym(rand) WRT_PLT
    mov         rcx, arg(1)                 ; noise
    and         rax, 0xff                   ; random offset 0..255
    add         rcx, rax                    ; rcx = noise + offset

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov         rdx, arg(2)                 ; blackclamp

    mov         rdi, rcx                    ; rdi = noise for this row
    movsxd      rcx, dword arg(5)           ; Width
    mov         rsi, arg(0)                 ; rsi = current row (Start)
    xor         rax, rax                    ; rax = byte offset in the row

addnoise_nextset:
    movq        mm1, [rsi+rax]              ; get the source

    ; clamp both sides so we don't outrange adding noise
    psubusb     mm1, [rdx]                  ; blackclamp
    paddusb     mm1, [rdx+32]               ; bothclamp
    psubusb     mm1, [rdx+16]               ; whiteclamp

    movq        mm2, [rdi+rax]              ; get the noise for these pixels
    paddb       mm1, mm2                    ; add it in
    movq        [rsi+rax], mm1              ; store the result

    add         rax, 8                      ; move to the next eight pixels

    cmp         rax, rcx
    jl          addnoise_nextset

    movsxd      rax, dword arg(7)           ; Pitch
    add         arg(0), rax                 ; Start += Pitch
    sub         dword arg(6), 1             ; Height -= 1
    jg          addnoise_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16
; 5-tap blur kernel, one tap per 16 bytes (the filter code loads 8 bytes at
; Blur+0, +16, +32, +48 and +64): taps 16, 16, 64, 16, 16.
Blur:
    times 16 dw 16                          ; kernel 0 and kernel 1 taps
    times  8 dw 64                          ; kernel 2 (centre) taps
    times 16 dw 16                          ; kernel 3 and kernel 4 taps
    times  8 dw  0                          ; zero words after the last tap

; Rounding term added to the filtered sum before the right shift by
; VP8_FILTER_SHIFT; referenced as GLOBAL(rd) by the down/across filter.
rd:
    times 4 dw 0x40