; Merge "Skip computation of distortion in vp8_pick_inter_mode if active_map is used"
; [libvpx.git] / vp8 / common / x86 / loopfilter_mmx.asm
; blob 697a5dee6004c87f5009231852153aa57dcd0e6a
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
15 ;void vp8_loop_filter_horizontal_edge_mmx
17 ; unsigned char *src_ptr,
18 ; int src_pixel_step,
19 ; const char *blimit,
20 ; const char *limit,
21 ; const char *thresh,
22 ; int count
24 global sym(vp8_loop_filter_horizontal_edge_mmx)
25 sym(vp8_loop_filter_horizontal_edge_mmx):
26 push rbp
27 mov rbp, rsp
28 SHADOW_ARGS_TO_STACK 6
29 GET_GOT rbx
30 push rsi
31 push rdi
32 ; end prolog
34 ALIGN_STACK 16, rax
35 sub rsp, 32 ; reserve 32 bytes
36 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
37 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
39 mov rsi, arg(0) ;src_ptr
40 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
42 movsxd rcx, dword ptr arg(5) ;count
43 .next8_h:
44 mov rdx, arg(3) ;limit
45 movq mm7, [rdx]
46 mov rdi, rsi ; rdi points to row +1 for indirect addressing
47 add rdi, rax
49 ; calculate breakout conditions
50 movq mm2, [rdi+2*rax] ; q3
51 movq mm1, [rsi+2*rax] ; q2
52 movq mm6, mm1 ; q2
53 psubusb mm1, mm2 ; q2-=q3
54 psubusb mm2, mm6 ; q3-=q2
55 por mm1, mm2 ; abs(q3-q2)
56 psubusb mm1, mm7 ;
59 movq mm4, [rsi+rax] ; q1
60 movq mm3, mm4 ; q1
61 psubusb mm4, mm6 ; q1-=q2
62 psubusb mm6, mm3 ; q2-=q1
63 por mm4, mm6 ; abs(q2-q1)
65 psubusb mm4, mm7
66 por mm1, mm4
68 movq mm4, [rsi] ; q0
69 movq mm0, mm4 ; q0
70 psubusb mm4, mm3 ; q0-=q1
71 psubusb mm3, mm0 ; q1-=q0
72 por mm4, mm3 ; abs(q0-q1)
73 movq t0, mm4 ; save to t0
74 psubusb mm4, mm7
75 por mm1, mm4
78 neg rax ; negate pitch to deal with above border
80 movq mm2, [rsi+4*rax] ; p3
81 movq mm4, [rdi+4*rax] ; p2
82 movq mm5, mm4 ; p2
83 psubusb mm4, mm2 ; p2-=p3
84 psubusb mm2, mm5 ; p3-=p2
85 por mm4, mm2 ; abs(p3 - p2)
86 psubusb mm4, mm7
87 por mm1, mm4
90 movq mm4, [rsi+2*rax] ; p1
91 movq mm3, mm4 ; p1
92 psubusb mm4, mm5 ; p1-=p2
93 psubusb mm5, mm3 ; p2-=p1
94 por mm4, mm5 ; abs(p2 - p1)
95 psubusb mm4, mm7
96 por mm1, mm4
98 movq mm2, mm3 ; p1
100 movq mm4, [rsi+rax] ; p0
101 movq mm5, mm4 ; p0
102 psubusb mm4, mm3 ; p0-=p1
103 psubusb mm3, mm5 ; p1-=p0
104 por mm4, mm3 ; abs(p1 - p0)
105 movq t1, mm4 ; save to t1
106 psubusb mm4, mm7
107 por mm1, mm4
109 movq mm3, [rdi] ; q1
110 movq mm4, mm3 ; q1
111 psubusb mm3, mm2 ; q1-=p1
112 psubusb mm2, mm4 ; p1-=q1
113 por mm2, mm3 ; abs(p1-q1)
114 pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
115 psrlw mm2, 1 ; abs(p1-q1)/2
117 movq mm6, mm5 ; p0
118 movq mm3, [rsi] ; q0
119 psubusb mm5, mm3 ; p0-=q0
120 psubusb mm3, mm6 ; q0-=p0
121 por mm5, mm3 ; abs(p0 - q0)
122 paddusb mm5, mm5 ; abs(p0-q0)*2
123 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
125 mov rdx, arg(2) ;blimit ; get blimit
126 movq mm7, [rdx] ; blimit
128 psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
129 por mm1, mm5
130 pxor mm5, mm5
131 pcmpeqb mm1, mm5 ; mask mm1
133 ; calculate high edge variance
134 mov rdx, arg(4) ;thresh ; get thresh
135 movq mm7, [rdx] ;
136 movq mm4, t0 ; get abs (q1 - q0)
137 psubusb mm4, mm7
138 movq mm3, t1 ; get abs (p1 - p0)
139 psubusb mm3, mm7
140 paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
142 pcmpeqb mm4, mm5
144 pcmpeqb mm5, mm5
145 pxor mm4, mm5
148 ; start work on filters
149 movq mm2, [rsi+2*rax] ; p1
150 movq mm7, [rdi] ; q1
151 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
152 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
153 psubsb mm2, mm7 ; p1 - q1
154 pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
155 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
156 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
157 movq mm3, mm0 ; q0
158 psubsb mm0, mm6 ; q0 - p0
159 paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
160 paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
161 paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
162 pand mm1, mm2 ; mask filter values we don't care about
163 movq mm2, mm1
164 paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
165 paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
167 pxor mm0, mm0 ;
168 pxor mm5, mm5
169 punpcklbw mm0, mm2 ;
170 punpckhbw mm5, mm2 ;
171 psraw mm0, 11 ;
172 psraw mm5, 11
173 packsswb mm0, mm5
174 movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
176 pxor mm0, mm0 ; 0
177 movq mm5, mm1 ; abcdefgh
178 punpcklbw mm0, mm1 ; e0f0g0h0
179 psraw mm0, 11 ; sign extended shift right by 3
180 pxor mm1, mm1 ; 0
181 punpckhbw mm1, mm5 ; a0b0c0d0
182 psraw mm1, 11 ; sign extended shift right by 3
183 movq mm5, mm0 ; save results
185 packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
186 paddsw mm5, [GLOBAL(ones)]
187 paddsw mm1, [GLOBAL(ones)]
188 psraw mm5, 1 ; partial shifted one more time for 2nd tap
189 psraw mm1, 1 ; partial shifted one more time for 2nd tap
190 packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
191 pandn mm4, mm5 ; high edge variance additive
193 paddsb mm6, mm2 ; p0+= p0 add
194 pxor mm6, [GLOBAL(t80)] ; unoffset
195 movq [rsi+rax], mm6 ; write back
197 movq mm6, [rsi+2*rax] ; p1
198 pxor mm6, [GLOBAL(t80)] ; reoffset
199 paddsb mm6, mm4 ; p1+= p1 add
200 pxor mm6, [GLOBAL(t80)] ; unoffset
201 movq [rsi+2*rax], mm6 ; write back
203 psubsb mm3, mm0 ; q0-= q0 add
204 pxor mm3, [GLOBAL(t80)] ; unoffset
205 movq [rsi], mm3 ; write back
207 psubsb mm7, mm4 ; q1-= q1 add
208 pxor mm7, [GLOBAL(t80)] ; unoffset
209 movq [rdi], mm7 ; write back
211 add rsi,8
212 neg rax
213 dec rcx
214 jnz .next8_h
216 add rsp, 32
217 pop rsp
218 ; begin epilog
219 pop rdi
220 pop rsi
221 RESTORE_GOT
222 UNSHADOW_ARGS
223 pop rbp
227 ;void vp8_loop_filter_vertical_edge_mmx
229 ; unsigned char *src_ptr,
230 ; int src_pixel_step,
231 ; const char *blimit,
232 ; const char *limit,
233 ; const char *thresh,
234 ; int count
236 global sym(vp8_loop_filter_vertical_edge_mmx)
237 sym(vp8_loop_filter_vertical_edge_mmx):
238 push rbp
239 mov rbp, rsp
240 SHADOW_ARGS_TO_STACK 6
241 GET_GOT rbx
242 push rsi
243 push rdi
244 ; end prolog
246 ALIGN_STACK 16, rax
247 sub rsp, 64 ; reserve 64 bytes
248 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
249 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
250 %define srct [rsp + 32] ;__declspec(align(16)) char srct[32];
252 mov rsi, arg(0) ;src_ptr
253 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
255 lea rsi, [rsi + rax*4 - 4]
257 movsxd rcx, dword ptr arg(5) ;count
258 .next8_v:
259 mov rdi, rsi ; rdi points to row +1 for indirect addressing
260 add rdi, rax
263 ;transpose
264 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
265 movq mm7, mm6 ; 77 76 75 74 73 72 71 70
267 punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64
268 punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60
270 movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
271 movq mm5, mm4 ; 47 46 45 44 43 42 41 40
273 punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44
274 punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40
276 movq mm3, mm5 ; 57 47 56 46 55 45 54 44
277 punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
279 punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
280 movq mm2, mm4 ; 53 43 52 42 51 41 50 40
282 punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
283 punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
285 neg rax
286 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
288 movq mm1, mm6 ; 27 26 25 24 23 22 21 20
289 punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24
291 punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20
292 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
294 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
295 movq mm0, mm7 ; 17 07 16 06 15 05 14 04
297 punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
298 punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
300 movq mm6, mm7 ; 37 27 17 07 36 26 16 06
301 punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
303 punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
305 movq mm5, mm6 ; 76 66 56 46 36 26 16 06
306 psubusb mm5, mm7 ; q2-q3
308 psubusb mm7, mm6 ; q3-q2
309 por mm7, mm5; ; mm7=abs (q3-q2)
311 movq mm5, mm0 ; 35 25 15 05 34 24 14 04
312 punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
314 punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
315 movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
317 psubusb mm3, mm6 ; q1-q2
318 psubusb mm6, mm5 ; q2-q1
320 por mm6, mm3 ; mm6=abs(q2-q1)
321 lea rdx, srct
323 movq [rdx+24], mm5 ; save q1
324 movq [rdx+16], mm0 ; save q0
326 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
327 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
329 movq mm0, mm3 ; 13 03 12 02 11 01 10 00
330 punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
332 punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
333 movq mm1, mm0 ; 31 21 11 01 30 20 10 00
335 punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
336 punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
338 movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
339 psubusb mm2, mm0 ; p2-p3
341 psubusb mm0, mm1 ; p3-p2
342 por mm0, mm2 ; mm0=abs(p3-p2)
344 movq mm2, mm3 ; 33 23 13 03 32 22 12 02
345 punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
347 punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
348 movq [rdx+8], mm3 ; save p0
350 movq [rdx], mm2 ; save p1
351 movq mm5, mm2 ; mm5 = p1
353 psubusb mm2, mm1 ; p1-p2
354 psubusb mm1, mm5 ; p2-p1
356 por mm1, mm2 ; mm1=abs(p2-p1)
357 mov rdx, arg(3) ;limit
359 movq mm4, [rdx] ; mm4 = limit
360 psubusb mm7, mm4
362 psubusb mm0, mm4
363 psubusb mm1, mm4
365 psubusb mm6, mm4
366 por mm7, mm6
368 por mm0, mm1
369 por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
371 movq mm1, mm5 ; p1
373 movq mm7, mm3 ; mm3=mm7=p0
374 psubusb mm7, mm5 ; p0 - p1
376 psubusb mm5, mm3 ; p1 - p0
377 por mm5, mm7 ; abs(p1-p0)
379 movq t0, mm5 ; save abs(p1-p0)
380 lea rdx, srct
382 psubusb mm5, mm4
383 por mm0, mm5 ; mm0=mask
385 movq mm5, [rdx+16] ; mm5=q0
386 movq mm7, [rdx+24] ; mm7=q1
388 movq mm6, mm5 ; mm6=q0
389 movq mm2, mm7 ; q1
390 psubusb mm5, mm7 ; q0-q1
392 psubusb mm7, mm6 ; q1-q0
393 por mm7, mm5 ; abs(q1-q0)
395 movq t1, mm7 ; save abs(q1-q0)
396 psubusb mm7, mm4
398 por mm0, mm7 ; mask
400 movq mm5, mm2 ; q1
401 psubusb mm5, mm1 ; q1-=p1
402 psubusb mm1, mm2 ; p1-=q1
403 por mm5, mm1 ; abs(p1-q1)
404 pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
405 psrlw mm5, 1 ; abs(p1-q1)/2
407 mov rdx, arg(2) ;blimit ;
409 movq mm4, [rdx] ;blimit
410 movq mm1, mm3 ; mm1=mm3=p0
412 movq mm7, mm6 ; mm7=mm6=q0
413 psubusb mm1, mm7 ; p0-q0
415 psubusb mm7, mm3 ; q0-p0
416 por mm1, mm7 ; abs(q0-p0)
417 paddusb mm1, mm1 ; abs(q0-p0)*2
418 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
420 psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
421 por mm1, mm0; ; mask
423 pxor mm0, mm0
424 pcmpeqb mm1, mm0
426 ; calculate high edge variance
427 mov rdx, arg(4) ;thresh ; get thresh
428 movq mm7, [rdx]
430 movq mm4, t0 ; get abs (q1 - q0)
431 psubusb mm4, mm7
433 movq mm3, t1 ; get abs (p1 - p0)
434 psubusb mm3, mm7
436 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
437 pcmpeqb mm4, mm0
439 pcmpeqb mm0, mm0
440 pxor mm4, mm0
444 ; start work on filters
445 lea rdx, srct
447 movq mm2, [rdx] ; p1
448 movq mm7, [rdx+24] ; q1
450 movq mm6, [rdx+8] ; p0
451 movq mm0, [rdx+16] ; q0
453 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
454 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
456 psubsb mm2, mm7 ; p1 - q1
457 pand mm2, mm4 ; high var mask (hvm)(p1 - q1)
459 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
460 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
462 movq mm3, mm0 ; q0
463 psubsb mm0, mm6 ; q0 - p0
465 paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
466 paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
468 paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
469 pand mm1, mm2 ; mask filter values we don't care about
471 movq mm2, mm1
472 paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
474 paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
475 pxor mm0, mm0 ;
477 pxor mm5, mm5
478 punpcklbw mm0, mm2 ;
480 punpckhbw mm5, mm2 ;
481 psraw mm0, 11 ;
483 psraw mm5, 11
484 packsswb mm0, mm5
486 movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
488 pxor mm0, mm0 ; 0
489 movq mm5, mm1 ; abcdefgh
491 punpcklbw mm0, mm1 ; e0f0g0h0
492 psraw mm0, 11 ; sign extended shift right by 3
494 pxor mm1, mm1 ; 0
495 punpckhbw mm1, mm5 ; a0b0c0d0
497 psraw mm1, 11 ; sign extended shift right by 3
498 movq mm5, mm0 ; save results
500 packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
501 paddsw mm5, [GLOBAL(ones)]
503 paddsw mm1, [GLOBAL(ones)]
504 psraw mm5, 1 ; partial shifted one more time for 2nd tap
506 psraw mm1, 1 ; partial shifted one more time for 2nd tap
507 packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
509 pandn mm4, mm5 ; high edge variance additive
511 paddsb mm6, mm2 ; p0+= p0 add
512 pxor mm6, [GLOBAL(t80)] ; unoffset
514 ; mm6=p0 ;
515 movq mm1, [rdx] ; p1
516 pxor mm1, [GLOBAL(t80)] ; reoffset
518 paddsb mm1, mm4 ; p1+= p1 add
519 pxor mm1, [GLOBAL(t80)] ; unoffset
520 ; mm6 = p0 mm1 = p1
522 psubsb mm3, mm0 ; q0-= q0 add
523 pxor mm3, [GLOBAL(t80)] ; unoffset
525 ; mm3 = q0
526 psubsb mm7, mm4 ; q1-= q1 add
527 pxor mm7, [GLOBAL(t80)] ; unoffset
528 ; mm7 = q1
530 ; tranpose and write back
531 ; mm1 = 72 62 52 42 32 22 12 02
532 ; mm6 = 73 63 53 43 33 23 13 03
533 ; mm3 = 74 64 54 44 34 24 14 04
534 ; mm7 = 75 65 55 45 35 25 15 05
536 movq mm2, mm1 ; 72 62 52 42 32 22 12 02
537 punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02
539 movq mm4, mm3 ; 74 64 54 44 34 24 14 04
540 punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42
542 punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04
543 punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44
545 movq mm6, mm2 ; 33 32 23 22 13 12 03 02
546 punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02
548 punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22
549 movq mm5, mm1 ; 73 72 63 62 53 52 43 42
551 punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42
552 punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62
555 ; mm2 = 15 14 13 12 05 04 03 02
556 ; mm6 = 35 34 33 32 25 24 23 22
557 ; mm5 = 55 54 53 52 45 44 43 42
558 ; mm1 = 75 74 73 72 65 64 63 62
562 movd [rsi+rax*4+2], mm2
563 psrlq mm2, 32
565 movd [rdi+rax*4+2], mm2
566 movd [rsi+rax*2+2], mm6
568 psrlq mm6, 32
569 movd [rsi+rax+2],mm6
571 movd [rsi+2], mm1
572 psrlq mm1, 32
574 movd [rdi+2], mm1
575 neg rax
577 movd [rdi+rax+2],mm5
578 psrlq mm5, 32
580 movd [rdi+rax*2+2], mm5
582 lea rsi, [rsi+rax*8]
583 dec rcx
584 jnz .next8_v
586 add rsp, 64
587 pop rsp
588 ; begin epilog
589 pop rdi
590 pop rsi
591 RESTORE_GOT
592 UNSHADOW_ARGS
593 pop rbp
597 ;void vp8_mbloop_filter_horizontal_edge_mmx
599 ; unsigned char *src_ptr,
600 ; int src_pixel_step,
601 ; const char *blimit,
602 ; const char *limit,
603 ; const char *thresh,
604 ; int count
606 global sym(vp8_mbloop_filter_horizontal_edge_mmx)
607 sym(vp8_mbloop_filter_horizontal_edge_mmx):
608 push rbp
609 mov rbp, rsp
610 SHADOW_ARGS_TO_STACK 6
611 GET_GOT rbx
612 push rsi
613 push rdi
614 ; end prolog
616 ALIGN_STACK 16, rax
617 sub rsp, 32 ; reserve 32 bytes
618 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
619 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
621 mov rsi, arg(0) ;src_ptr
622 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
624 movsxd rcx, dword ptr arg(5) ;count
625 .next8_mbh:
626 mov rdx, arg(3) ;limit
627 movq mm7, [rdx]
628 mov rdi, rsi ; rdi points to row +1 for indirect addressing
629 add rdi, rax
631 ; calculate breakout conditions
632 movq mm2, [rdi+2*rax] ; q3
634 movq mm1, [rsi+2*rax] ; q2
635 movq mm6, mm1 ; q2
636 psubusb mm1, mm2 ; q2-=q3
637 psubusb mm2, mm6 ; q3-=q2
638 por mm1, mm2 ; abs(q3-q2)
639 psubusb mm1, mm7
642 ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit
643 movq mm4, [rsi+rax] ; q1
644 movq mm3, mm4 ; q1
645 psubusb mm4, mm6 ; q1-=q2
646 psubusb mm6, mm3 ; q2-=q1
647 por mm4, mm6 ; abs(q2-q1)
648 psubusb mm4, mm7
649 por mm1, mm4
652 ; mm1 = mask, mm3=q1, mm7 = limit
654 movq mm4, [rsi] ; q0
655 movq mm0, mm4 ; q0
656 psubusb mm4, mm3 ; q0-=q1
657 psubusb mm3, mm0 ; q1-=q0
658 por mm4, mm3 ; abs(q0-q1)
659 movq t0, mm4 ; save to t0
660 psubusb mm4, mm7
661 por mm1, mm4
664 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
666 neg rax ; negate pitch to deal with above border
668 movq mm2, [rsi+4*rax] ; p3
669 movq mm4, [rdi+4*rax] ; p2
670 movq mm5, mm4 ; p2
671 psubusb mm4, mm2 ; p2-=p3
672 psubusb mm2, mm5 ; p3-=p2
673 por mm4, mm2 ; abs(p3 - p2)
674 psubusb mm4, mm7
675 por mm1, mm4
676 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
678 movq mm4, [rsi+2*rax] ; p1
679 movq mm3, mm4 ; p1
680 psubusb mm4, mm5 ; p1-=p2
681 psubusb mm5, mm3 ; p2-=p1
682 por mm4, mm5 ; abs(p2 - p1)
683 psubusb mm4, mm7
684 por mm1, mm4
686 movq mm2, mm3 ; p1
689 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)
691 movq mm4, [rsi+rax] ; p0
692 movq mm5, mm4 ; p0
693 psubusb mm4, mm3 ; p0-=p1
694 psubusb mm3, mm5 ; p1-=p0
695 por mm4, mm3 ; abs(p1 - p0)
696 movq t1, mm4 ; save to t1
697 psubusb mm4, mm7
698 por mm1, mm4
699 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
700 ; mm5 = p0
701 movq mm3, [rdi] ; q1
702 movq mm4, mm3 ; q1
703 psubusb mm3, mm2 ; q1-=p1
704 psubusb mm2, mm4 ; p1-=q1
705 por mm2, mm3 ; abs(p1-q1)
706 pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
707 psrlw mm2, 1 ; abs(p1-q1)/2
709 movq mm6, mm5 ; p0
710 movq mm3, mm0 ; q0
711 psubusb mm5, mm3 ; p0-=q0
712 psubusb mm3, mm6 ; q0-=p0
713 por mm5, mm3 ; abs(p0 - q0)
714 paddusb mm5, mm5 ; abs(p0-q0)*2
715 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
717 mov rdx, arg(2) ;blimit ; get blimit
718 movq mm7, [rdx] ; blimit
720 psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
721 por mm1, mm5
722 pxor mm5, mm5
723 pcmpeqb mm1, mm5 ; mask mm1
725 ; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
726 ; mm6 = p0,
728 ; calculate high edge variance
729 mov rdx, arg(4) ;thresh ; get thresh
730 movq mm7, [rdx] ;
731 movq mm4, t0 ; get abs (q1 - q0)
732 psubusb mm4, mm7
733 movq mm3, t1 ; get abs (p1 - p0)
734 psubusb mm3, mm7
735 paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
737 pcmpeqb mm4, mm5
739 pcmpeqb mm5, mm5
740 pxor mm4, mm5
744 ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0)
745 ; mm6 = p0, mm4=hev
746 ; start work on filters
747 movq mm2, [rsi+2*rax] ; p1
748 movq mm7, [rdi] ; q1
749 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
750 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
751 psubsb mm2, mm7 ; p1 - q1
753 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
754 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
755 movq mm3, mm0 ; q0
756 psubsb mm0, mm6 ; q0 - p0
757 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
758 paddsb mm2, mm0 ; 2 * (q0 - p0)
759 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
760 pand mm1, mm2 ; mask filter values we don't care about
763 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
764 movq mm2, mm1 ; vp8_filter
765 pand mm2, mm4; ; Filter2 = vp8_filter & hev
767 movq mm5, mm2 ;
768 paddsb mm5, [GLOBAL(t3)];
770 pxor mm0, mm0 ; 0
771 pxor mm7, mm7 ; 0
773 punpcklbw mm0, mm5 ; e0f0g0h0
774 psraw mm0, 11 ; sign extended shift right by 3
775 punpckhbw mm7, mm5 ; a0b0c0d0
776 psraw mm7, 11 ; sign extended shift right by 3
777 packsswb mm0, mm7 ; Filter2 >>=3;
779 movq mm5, mm0 ; Filter2
781 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
782 pxor mm0, mm0 ; 0
783 pxor mm7, mm7 ; 0
785 punpcklbw mm0, mm2 ; e0f0g0h0
786 psraw mm0, 11 ; sign extended shift right by 3
787 punpckhbw mm7, mm2 ; a0b0c0d0
788 psraw mm7, 11 ; sign extended shift right by 3
789 packsswb mm0, mm7 ; Filter2 >>=3;
791 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
792 psubsb mm3, mm0 ; qs0 =qs0 - filter1
793 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
795 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
796 ; vp8_filter &= ~hev;
797 ; Filter2 = vp8_filter;
798 pandn mm4, mm1 ; vp8_filter&=~hev
801 ; mm3=qs0, mm4=filter2, mm6=ps0
803 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
804 ; s = vp8_signed_char_clamp(qs0 - u);
805 ; *oq0 = s^0x80;
806 ; s = vp8_signed_char_clamp(ps0 + u);
807 ; *op0 = s^0x80;
808 pxor mm0, mm0
810 pxor mm1, mm1
811 pxor mm2, mm2
812 punpcklbw mm1, mm4
813 punpckhbw mm2, mm4
814 pmulhw mm1, [GLOBAL(s27)]
815 pmulhw mm2, [GLOBAL(s27)]
816 paddw mm1, [GLOBAL(s63)]
817 paddw mm2, [GLOBAL(s63)]
818 psraw mm1, 7
819 psraw mm2, 7
820 packsswb mm1, mm2
822 psubsb mm3, mm1
823 paddsb mm6, mm1
825 pxor mm3, [GLOBAL(t80)]
826 pxor mm6, [GLOBAL(t80)]
827 movq [rsi+rax], mm6
828 movq [rsi], mm3
830 ; roughly 2/7th difference across boundary
831 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
832 ; s = vp8_signed_char_clamp(qs1 - u);
833 ; *oq1 = s^0x80;
834 ; s = vp8_signed_char_clamp(ps1 + u);
835 ; *op1 = s^0x80;
836 pxor mm1, mm1
837 pxor mm2, mm2
838 punpcklbw mm1, mm4
839 punpckhbw mm2, mm4
840 pmulhw mm1, [GLOBAL(s18)]
841 pmulhw mm2, [GLOBAL(s18)]
842 paddw mm1, [GLOBAL(s63)]
843 paddw mm2, [GLOBAL(s63)]
844 psraw mm1, 7
845 psraw mm2, 7
846 packsswb mm1, mm2
848 movq mm3, [rdi]
849 movq mm6, [rsi+rax*2] ; p1
851 pxor mm3, [GLOBAL(t80)]
852 pxor mm6, [GLOBAL(t80)]
854 paddsb mm6, mm1
855 psubsb mm3, mm1
857 pxor mm6, [GLOBAL(t80)]
858 pxor mm3, [GLOBAL(t80)]
859 movq [rdi], mm3
860 movq [rsi+rax*2], mm6
862 ; roughly 1/7th difference across boundary
863 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
864 ; s = vp8_signed_char_clamp(qs2 - u);
865 ; *oq2 = s^0x80;
866 ; s = vp8_signed_char_clamp(ps2 + u);
867 ; *op2 = s^0x80;
868 pxor mm1, mm1
869 pxor mm2, mm2
870 punpcklbw mm1, mm4
871 punpckhbw mm2, mm4
872 pmulhw mm1, [GLOBAL(s9)]
873 pmulhw mm2, [GLOBAL(s9)]
874 paddw mm1, [GLOBAL(s63)]
875 paddw mm2, [GLOBAL(s63)]
876 psraw mm1, 7
877 psraw mm2, 7
878 packsswb mm1, mm2
881 movq mm6, [rdi+rax*4]
882 neg rax
883 movq mm3, [rdi+rax ]
885 pxor mm6, [GLOBAL(t80)]
886 pxor mm3, [GLOBAL(t80)]
888 paddsb mm6, mm1
889 psubsb mm3, mm1
891 pxor mm6, [GLOBAL(t80)]
892 pxor mm3, [GLOBAL(t80)]
893 movq [rdi+rax ], mm3
894 neg rax
895 movq [rdi+rax*4], mm6
897 ;EARLY_BREAK_OUT:
898 neg rax
899 add rsi,8
900 dec rcx
901 jnz .next8_mbh
903 add rsp, 32
904 pop rsp
905 ; begin epilog
906 pop rdi
907 pop rsi
908 RESTORE_GOT
909 UNSHADOW_ARGS
910 pop rbp
914 ;void vp8_mbloop_filter_vertical_edge_mmx
916 ; unsigned char *src_ptr,
917 ; int src_pixel_step,
918 ; const char *blimit,
919 ; const char *limit,
920 ; const char *thresh,
921 ; int count
923 global sym(vp8_mbloop_filter_vertical_edge_mmx)
924 sym(vp8_mbloop_filter_vertical_edge_mmx):
925 push rbp
926 mov rbp, rsp
927 SHADOW_ARGS_TO_STACK 6
928 GET_GOT rbx
929 push rsi
930 push rdi
931 ; end prolog
933 ALIGN_STACK 16, rax
934 sub rsp, 96 ; reserve 96 bytes
935 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
936 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
937 %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
939 mov rsi, arg(0) ;src_ptr
940 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
942 lea rsi, [rsi + rax*4 - 4]
944 movsxd rcx, dword ptr arg(5) ;count
945 .next8_mbv:
946 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
948 ;transpose
949 movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
950 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
952 movq mm7, mm6 ; 77 76 75 74 73 72 71 70
953 punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
955 punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
956 movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
958 movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
959 movq mm5, mm4 ; 47 46 45 44 43 42 41 40
961 punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
962 punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
964 movq mm3, mm5 ; 57 47 56 46 55 45 54 44
965 punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
967 punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
968 movq mm2, mm4 ; 53 43 52 42 51 41 50 40
970 punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
971 punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
973 neg rax
975 movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
976 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
978 movq mm1, mm6 ; 27 26 25 24 23 22 21 20
979 punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24
981 punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
983 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
984 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
986 movq mm0, mm7 ; 17 07 16 06 15 05 14 04
987 punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
989 punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
990 movq mm6, mm7 ; 37 27 17 07 36 26 16 06
992 punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
993 punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
995 lea rdx, srct
996 movq mm5, mm6 ; 76 66 56 46 36 26 16 06
998 movq [rdx+56], mm7
999 psubusb mm5, mm7 ; q2-q3
1002 movq [rdx+48], mm6
1003 psubusb mm7, mm6 ; q3-q2
1005 por mm7, mm5; ; mm7=abs (q3-q2)
1006 movq mm5, mm0 ; 35 25 15 05 34 24 14 04
1008 punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
1009 punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
1011 movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
1012 psubusb mm3, mm6 ; q1-q2
1014 psubusb mm6, mm5 ; q2-q1
1015 por mm6, mm3 ; mm6=abs(q2-q1)
1017 movq [rdx+40], mm5 ; save q1
1018 movq [rdx+32], mm0 ; save q0
1020 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
1021 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
1023 movq mm0, mm3 ; 13 03 12 02 11 01 10 00
1024 punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
1026 punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
1027 movq mm1, mm0 ; 31 21 11 01 30 20 10 00
1029 punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
1030 punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
1032 movq [rdx], mm0 ; save p3
1033 movq [rdx+8], mm1 ; save p2
1035 movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
1036 psubusb mm2, mm0 ; p2-p3
1038 psubusb mm0, mm1 ; p3-p2
1039 por mm0, mm2 ; mm0=abs(p3-p2)
1041 movq mm2, mm3 ; 33 23 13 03 32 22 12 02
1042 punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
1044 punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
1045 movq [rdx+24], mm3 ; save p0
1047 movq [rdx+16], mm2 ; save p1
1048 movq mm5, mm2 ; mm5 = p1
1050 psubusb mm2, mm1 ; p1-p2
1051 psubusb mm1, mm5 ; p2-p1
1053 por mm1, mm2 ; mm1=abs(p2-p1)
1054 mov rdx, arg(3) ;limit
1056 movq mm4, [rdx] ; mm4 = limit
1057 psubusb mm7, mm4 ; abs(q3-q2) > limit
1059 psubusb mm0, mm4 ; abs(p3-p2) > limit
1060 psubusb mm1, mm4 ; abs(p2-p1) > limit
1062 psubusb mm6, mm4 ; abs(q2-q1) > limit
1063 por mm7, mm6 ; or
1065 por mm0, mm1 ;
1066 por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
1068 movq mm1, mm5 ; p1
1070 movq mm7, mm3 ; mm3=mm7=p0
1071 psubusb mm7, mm5 ; p0 - p1
1073 psubusb mm5, mm3 ; p1 - p0
1074 por mm5, mm7 ; abs(p1-p0)
1076 movq t0, mm5 ; save abs(p1-p0)
1077 lea rdx, srct
1079 psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
1080 por mm0, mm5 ; mm0=mask
1082 movq mm5, [rdx+32] ; mm5=q0
1083 movq mm7, [rdx+40] ; mm7=q1
1085 movq mm6, mm5 ; mm6=q0
1086 movq mm2, mm7 ; q1
1087 psubusb mm5, mm7 ; q0-q1
1089 psubusb mm7, mm6 ; q1-q0
1090 por mm7, mm5 ; abs(q1-q0)
1092 movq t1, mm7 ; save abs(q1-q0)
1093 psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
1095 por mm0, mm7 ; mask
1097 movq mm5, mm2 ; q1
1098 psubusb mm5, mm1 ; q1-=p1
1099 psubusb mm1, mm2 ; p1-=q1
1100 por mm5, mm1 ; abs(p1-q1)
1101 pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
1102 psrlw mm5, 1 ; abs(p1-q1)/2
1104 mov rdx, arg(2) ;blimit ;
1106 movq mm4, [rdx] ;blimit
1107 movq mm1, mm3 ; mm1=mm3=p0
1109 movq mm7, mm6 ; mm7=mm6=q0
1110 psubusb mm1, mm7 ; p0-q0
1112 psubusb mm7, mm3 ; q0-p0
1113 por mm1, mm7 ; abs(q0-p0)
1114 paddusb mm1, mm1 ; abs(q0-p0)*2
1115 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
1117 psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
1118 por mm1, mm0; ; mask
1120 pxor mm0, mm0
1121 pcmpeqb mm1, mm0
1123 ; calculate high edge variance
1124 mov rdx, arg(4) ;thresh ; get thresh
1125 movq mm7, [rdx]
1127 movq mm4, t0 ; get abs (q1 - q0)
1128 psubusb mm4, mm7 ; abs(q1 - q0) > thresh
1130 movq mm3, t1 ; get abs (p1 - p0)
1131 psubusb mm3, mm7 ; abs(p1 - p0)> thresh
1133 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
1134 pcmpeqb mm4, mm0
1136 pcmpeqb mm0, mm0
1137 pxor mm4, mm0
1142 ; start work on filters
1143 lea rdx, srct
1145 ; start work on filters
1146 movq mm2, [rdx+16] ; p1
1147 movq mm7, [rdx+40] ; q1
1148 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
1149 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
1150 psubsb mm2, mm7 ; p1 - q1
1152 movq mm6, [rdx+24] ; p0
1153 movq mm0, [rdx+32] ; q0
1154 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
1155 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
1157 movq mm3, mm0 ; q0
1158 psubsb mm0, mm6 ; q0 - p0
1159 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
1160 paddsb mm2, mm0 ; 2 * (q0 - p0)
1161 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
1162 pand mm1, mm2 ; mask filter values we don't care about
1164 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
1165 movq mm2, mm1 ; vp8_filter
1166 pand mm2, mm4; ; Filter2 = vp8_filter & hev
1168 movq mm5, mm2 ;
1169 paddsb mm5, [GLOBAL(t3)];
1171 pxor mm0, mm0 ; 0
1172 pxor mm7, mm7 ; 0
1174 punpcklbw mm0, mm5 ; e0f0g0h0
1175 psraw mm0, 11 ; sign extended shift right by 3
1176 punpckhbw mm7, mm5 ; a0b0c0d0
1177 psraw mm7, 11 ; sign extended shift right by 3
1178 packsswb mm0, mm7 ; Filter2 >>=3;
1180 movq mm5, mm0 ; Filter2
1182 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
1183 pxor mm0, mm0 ; 0
1184 pxor mm7, mm7 ; 0
1186 punpcklbw mm0, mm2 ; e0f0g0h0
1187 psraw mm0, 11 ; sign extended shift right by 3
1188 punpckhbw mm7, mm2 ; a0b0c0d0
1189 psraw mm7, 11 ; sign extended shift right by 3
1190 packsswb mm0, mm7 ; Filter2 >>=3;
1192 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
1193 psubsb mm3, mm0 ; qs0 =qs0 - filter1
1194 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
1196 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
1197 ; vp8_filter &= ~hev;
1198 ; Filter2 = vp8_filter;
1199 pandn mm4, mm1 ; vp8_filter&=~hev
1202 ; mm3=qs0, mm4=filter2, mm6=ps0
1204 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
1205 ; s = vp8_signed_char_clamp(qs0 - u);
1206 ; *oq0 = s^0x80;
1207 ; s = vp8_signed_char_clamp(ps0 + u);
1208 ; *op0 = s^0x80;
1209 pxor mm0, mm0
1211 pxor mm1, mm1
1212 pxor mm2, mm2
1213 punpcklbw mm1, mm4
1214 punpckhbw mm2, mm4
1215 pmulhw mm1, [GLOBAL(s27)]
1216 pmulhw mm2, [GLOBAL(s27)]
1217 paddw mm1, [GLOBAL(s63)]
1218 paddw mm2, [GLOBAL(s63)]
1219 psraw mm1, 7
1220 psraw mm2, 7
1221 packsswb mm1, mm2
1223 psubsb mm3, mm1
1224 paddsb mm6, mm1
1226 pxor mm3, [GLOBAL(t80)]
1227 pxor mm6, [GLOBAL(t80)]
1228 movq [rdx+24], mm6
1229 movq [rdx+32], mm3
1231 ; roughly 2/7th difference across boundary
1232 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
1233 ; s = vp8_signed_char_clamp(qs1 - u);
1234 ; *oq1 = s^0x80;
1235 ; s = vp8_signed_char_clamp(ps1 + u);
1236 ; *op1 = s^0x80;
1237 pxor mm1, mm1
1238 pxor mm2, mm2
1239 punpcklbw mm1, mm4
1240 punpckhbw mm2, mm4
1241 pmulhw mm1, [GLOBAL(s18)]
1242 pmulhw mm2, [GLOBAL(s18)]
1243 paddw mm1, [GLOBAL(s63)]
1244 paddw mm2, [GLOBAL(s63)]
1245 psraw mm1, 7
1246 psraw mm2, 7
1247 packsswb mm1, mm2
1249 movq mm3, [rdx + 40]
1250 movq mm6, [rdx + 16] ; p1
1251 pxor mm3, [GLOBAL(t80)]
1252 pxor mm6, [GLOBAL(t80)]
1254 paddsb mm6, mm1
1255 psubsb mm3, mm1
1257 pxor mm6, [GLOBAL(t80)]
1258 pxor mm3, [GLOBAL(t80)]
1259 movq [rdx + 40], mm3
1260 movq [rdx + 16], mm6
1262 ; roughly 1/7th difference across boundary
1263 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
1264 ; s = vp8_signed_char_clamp(qs2 - u);
1265 ; *oq2 = s^0x80;
1266 ; s = vp8_signed_char_clamp(ps2 + u);
1267 ; *op2 = s^0x80;
1268 pxor mm1, mm1
1269 pxor mm2, mm2
1270 punpcklbw mm1, mm4
1271 punpckhbw mm2, mm4
1272 pmulhw mm1, [GLOBAL(s9)]
1273 pmulhw mm2, [GLOBAL(s9)]
1274 paddw mm1, [GLOBAL(s63)]
1275 paddw mm2, [GLOBAL(s63)]
1276 psraw mm1, 7
1277 psraw mm2, 7
1278 packsswb mm1, mm2
1280 movq mm6, [rdx+ 8]
1281 movq mm3, [rdx+48]
1283 pxor mm6, [GLOBAL(t80)]
1284 pxor mm3, [GLOBAL(t80)]
1286 paddsb mm6, mm1
1287 psubsb mm3, mm1
1289 pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
1290 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
1292 ; transpose and write back
1293 movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
1294 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
1296 punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
1297 punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40
1299 movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
1300 movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02
1302 punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
1303 punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42
1305 movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
1306 punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
1308 punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
1309 movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
1311 punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
1312 punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
1314 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
1315 punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
1317 movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06
1318 punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
1320 movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
1321 punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
1323 punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
1324 movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
1326 punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
1327 punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
1329 movq [rsi+rax*4], mm0 ; write out
1330 movq [rdi+rax*4], mm6 ; write out
1332 movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
1333 punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 21 20
1335 punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
1336 movq [rsi+rax*2], mm0 ; write out
1338 movq [rdi+rax*2], mm5 ; write out
1339 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
1341 punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44
1342 punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
1344 movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44
1345 punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
1347 punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
1348 movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
1350 movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60
1351 punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
1353 punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
1354 movq [rsi], mm0 ; write out
1356 movq [rdi], mm1 ; write out
1357 neg rax
1359 punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
1360 punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 70
1362 movq [rsi+rax*2], mm3
1363 movq [rdi+rax*2], mm4
1365 lea rsi, [rsi+rax*8]
1366 dec rcx
1368 jnz .next8_mbv
1370 add rsp, 96
1371 pop rsp
1372 ; begin epilog
1373 pop rdi
1374 pop rsi
1375 RESTORE_GOT
1376 UNSHADOW_ARGS
1377 pop rbp
1381 ;void vp8_loop_filter_simple_horizontal_edge_mmx
1383 ; unsigned char *src_ptr,
1384 ; int src_pixel_step,
1385 ; const char *blimit
1387 global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
1388 sym(vp8_loop_filter_simple_horizontal_edge_mmx):
1389 push rbp
1390 mov rbp, rsp
1391 SHADOW_ARGS_TO_STACK 3
1392 GET_GOT rbx
1393 push rsi
1394 push rdi
1395 ; end prolog
1397 mov rsi, arg(0) ;src_ptr
1398 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
1400 mov rcx, 2 ; count
1401 .nexts8_h:
1402 mov rdx, arg(2) ;blimit ; get blimit
1403 movq mm3, [rdx] ;
1405 mov rdi, rsi ; rdi points to row +1 for indirect addressing
1406 add rdi, rax
1407 neg rax
1409 ; calculate mask
1410 movq mm1, [rsi+2*rax] ; p1
1411 movq mm0, [rdi] ; q1
1412 movq mm2, mm1
1413 movq mm7, mm0
1414 movq mm4, mm0
1415 psubusb mm0, mm1 ; q1-=p1
1416 psubusb mm1, mm4 ; p1-=q1
1417 por mm1, mm0 ; abs(p1-q1)
1418 pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
1419 psrlw mm1, 1 ; abs(p1-q1)/2
1421 movq mm5, [rsi+rax] ; p0
1422 movq mm4, [rsi] ; q0
1423 movq mm0, mm4 ; q0
1424 movq mm6, mm5 ; p0
1425 psubusb mm5, mm4 ; p0-=q0
1426 psubusb mm4, mm6 ; q0-=p0
1427 por mm5, mm4 ; abs(p0 - q0)
1428 paddusb mm5, mm5 ; abs(p0-q0)*2
1429 paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
1431 psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
1432 pxor mm3, mm3
1433 pcmpeqb mm5, mm3
1435 ; start work on filters
1436 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
1437 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
1438 psubsb mm2, mm7 ; p1 - q1
1440 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
1441 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
1442 movq mm3, mm0 ; q0
1443 psubsb mm0, mm6 ; q0 - p0
1444 paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
1445 paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
1446 paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
1447 pand mm5, mm2 ; mask filter values we don't care about
1449 ; do + 4 side
1450 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
1452 movq mm0, mm5 ; get a copy of filters
1453 psllw mm0, 8 ; shift left 8
1454 psraw mm0, 3 ; arithmetic shift right 11
1455 psrlw mm0, 8
1456 movq mm1, mm5 ; get a copy of filters
1457 psraw mm1, 11 ; arithmetic shift right 11
1458 psllw mm1, 8 ; shift left 8 to put it back
1460 por mm0, mm1 ; put the two together to get result
1462 psubsb mm3, mm0 ; q0-= q0 add
1463 pxor mm3, [GLOBAL(t80)] ; unoffset
1464 movq [rsi], mm3 ; write back
1467 ; now do +3 side
1468 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
1470 movq mm0, mm5 ; get a copy of filters
1471 psllw mm0, 8 ; shift left 8
1472 psraw mm0, 3 ; arithmetic shift right 11
1473 psrlw mm0, 8
1474 psraw mm5, 11 ; arithmetic shift right 11
1475 psllw mm5, 8 ; shift left 8 to put it back
1476 por mm0, mm5 ; put the two together to get result
1479 paddsb mm6, mm0 ; p0+= p0 add
1480 pxor mm6, [GLOBAL(t80)] ; unoffset
1481 movq [rsi+rax], mm6 ; write back
1483 add rsi,8
1484 neg rax
1485 dec rcx
1486 jnz .nexts8_h
1488 ; begin epilog
1489 pop rdi
1490 pop rsi
1491 RESTORE_GOT
1492 UNSHADOW_ARGS
1493 pop rbp
1497 ;void vp8_loop_filter_simple_vertical_edge_mmx
1499 ; unsigned char *src_ptr,
1500 ; int src_pixel_step,
1501 ; const char *blimit
1503 global sym(vp8_loop_filter_simple_vertical_edge_mmx)
1504 sym(vp8_loop_filter_simple_vertical_edge_mmx):
1505 push rbp
1506 mov rbp, rsp
1507 SHADOW_ARGS_TO_STACK 3
1508 GET_GOT rbx
1509 push rsi
1510 push rdi
1511 ; end prolog
1513 ALIGN_STACK 16, rax
1514 sub rsp, 32 ; reserve 32 bytes
1515 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
1516 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
1518 mov rsi, arg(0) ;src_ptr
1519 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
1521 lea rsi, [rsi + rax*4- 2]; ;
1522 mov rcx, 2 ; count
1523 .nexts8_v:
1525 lea rdi, [rsi + rax];
1526 movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
1528 movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
1529 punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
1531 movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
1532 movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
1534 punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
1535 movq mm5, mm4 ; 53 43 52 42 51 41 50 40
1537 punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
1538 punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
1540 neg rax
1542 movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
1543 movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
1545 punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
1546 movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
1548 movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
1549 punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
1551 movq mm2, mm0 ; 13 03 12 02 11 01 10 00
1552 punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
1554 punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
1555 movq mm1, mm0 ; 13 03 12 02 11 01 10 00
1557 punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
1558 movq mm3, mm2 ; 33 23 13 03 32 22 12 02
1560 punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
1561 punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
1563 punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
1566 ; calculate mask
1567 movq mm6, mm0 ; p1
1568 movq mm7, mm3 ; q1
1569 psubusb mm7, mm6 ; q1-=p1
1570 psubusb mm6, mm3 ; p1-=q1
1571 por mm6, mm7 ; abs(p1-q1)
1572 pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
1573 psrlw mm6, 1 ; abs(p1-q1)/2
1575 movq mm5, mm1 ; p0
1576 movq mm4, mm2 ; q0
1578 psubusb mm5, mm2 ; p0-=q0
1579 psubusb mm4, mm1 ; q0-=p0
1581 por mm5, mm4 ; abs(p0 - q0)
1582 paddusb mm5, mm5 ; abs(p0-q0)*2
1583 paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
1585 mov rdx, arg(2) ;blimit ; get blimit
1586 movq mm7, [rdx]
1588 psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
1589 pxor mm7, mm7
1590 pcmpeqb mm5, mm7 ; mm5 = mask
1592 ; start work on filters
1593 movq t0, mm0
1594 movq t1, mm3
1596 pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
1597 pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
1599 psubsb mm0, mm3 ; p1 - q1
1600 movq mm6, mm1 ; p0
1602 movq mm7, mm2 ; q0
1603 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
1605 pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
1606 movq mm3, mm7 ; offseted ; q0
1608 psubsb mm7, mm6 ; q0 - p0
1609 paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
1611 paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
1612 paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
1614 pand mm5, mm0 ; mask filter values we don't care about
1616 paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
1618 movq mm0, mm5 ; get a copy of filters
1619 psllw mm0, 8 ; shift left 8
1620 psraw mm0, 3 ; arithmetic shift right 11
1621 psrlw mm0, 8
1623 movq mm7, mm5 ; get a copy of filters
1624 psraw mm7, 11 ; arithmetic shift right 11
1625 psllw mm7, 8 ; shift left 8 to put it back
1627 por mm0, mm7 ; put the two together to get result
1629 psubsb mm3, mm0 ; q0-= q0sz add
1630 pxor mm3, [GLOBAL(t80)] ; unoffset
1632 ; now do +3 side
1633 psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
1635 movq mm0, mm5 ; get a copy of filters
1636 psllw mm0, 8 ; shift left 8
1637 psraw mm0, 3 ; arithmetic shift right 11
1638 psrlw mm0, 8
1640 psraw mm5, 11 ; arithmetic shift right 11
1641 psllw mm5, 8 ; shift left 8 to put it back
1642 por mm0, mm5 ; put the two together to get result
1644 paddsb mm6, mm0 ; p0+= p0 add
1645 pxor mm6, [GLOBAL(t80)] ; unoffset
1648 movq mm0, t0
1649 movq mm4, t1
1651 ; mm0 = 70 60 50 40 30 20 10 00
1652 ; mm6 = 71 61 51 41 31 21 11 01
1653 ; mm3 = 72 62 52 42 32 22 12 02
1654 ; mm4 = 73 63 53 43 33 23 13 03
1655 ; transpose back to write out
1657 movq mm1, mm0 ;
1658 punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
1660 punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
1661 movq mm2, mm3 ;
1663 punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
1664 movq mm5, mm1 ; 71 70 61 60 51 50 41 40
1666 punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
1667 movq mm6, mm0 ; 31 30 21 20 11 10 01 00
1669 punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
1670 punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
1672 movd [rsi+rax*4], mm0 ; write 03 02 01 00
1673 punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
1675 psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
1676 punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
1678 movd [rdi+rax*4], mm0 ; write 13 12 11 10
1679 movd [rsi+rax*2], mm6 ; write 23 22 21 20
1681 psrlq mm6, 32 ; 33 32 31 30
1682 movd [rsi], mm1 ; write 43 42 41 40
1684 movd [rsi + rax], mm6 ; write 33 32 31 30
1685 neg rax
1687 movd [rsi + rax*2], mm5 ; write 63 62 61 60
1688 psrlq mm1, 32 ; 53 52 51 50
1690 movd [rdi], mm1 ; write out 53 52 51 50
1691 psrlq mm5, 32 ; 73 72 71 70
1693 movd [rdi + rax*2], mm5 ; write 73 72 71 70
1695 lea rsi, [rsi+rax*8] ; next 8
1697 dec rcx
1698 jnz .nexts8_v
1700 add rsp, 32
1701 pop rsp
1702 ; begin epilog
1703 pop rdi
1704 pop rsi
1705 RESTORE_GOT
1706 UNSHADOW_ARGS
1707 pop rbp
1712 ;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
1713 ; int y_stride,
1714 ; loop_filter_info *lfi)
1718 ;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4,  y_stride, lfi->flim);
1719 ;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8,  y_stride, lfi->flim);
1720 ;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim);
SECTION_RODATA
; Read-only byte/word constants used by the filters above.
; Byte tables are 8 wide (one MMX register); word tables hold the
; multiplier in the high byte so pmulhw yields (x * k) >> 8 per lane.
align 16
tfe:
    times 8 db 0xfe                     ; mask to clear each byte's lsb
align 16
t80:
    times 8 db 0x80                     ; signed/unsigned offset
align 16
t1s:
    times 8 db 0x01
align 16
t3:                                     ; restored label: data was left unnamed
    times 8 db 0x03
align 16
t4:                                     ; restored label: referenced as GLOBAL(t4)
    times 8 db 0x04
align 16
ones:
    times 4 dw 0x0001
align 16
s27:
    times 4 dw 0x1b00                   ; 27 << 8
align 16
s18:
    times 4 dw 0x1200                   ; 18 << 8
align 16
s9:                                     ; restored label: referenced as GLOBAL(s9)
    times 4 dw 0x0900                   ; 9 << 8
align 16
s63:
    times 4 dw 0x003f                   ; rounding constant 63