; libvpx — vp8/common/x86/loopfilter_mmx.asm
; (recovered from gitweb blob c6c215c3c6fcaef13c4328534d58fec0927aa16c,
;  commit "clarify *_offsets.asm differences")
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; VP8 "normal" loop filter applied across a horizontal edge.
; Each loop iteration filters an 8-pixel-wide column group straddling the
; edge at src_ptr (p3..p0 above, q0..q3 below); 'count' groups of 8 are
; processed left to right.  MMX registers only; t0/t1 are 16-byte-aligned
; stack spill slots.  NOTE(review): no emms here — assumes caller/ABI
; macros manage MMX/x87 state; confirm against x86_abi_support.asm.
;-----------------------------------------------------------------------
;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_loop_filter_horizontal_edge_mmx)
sym(vp8_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step (pitch)

    movsxd      rcx, dword ptr arg(5)           ;count
next8_h:
    mov         rdx, arg(3)                     ;limit
    movq        mm7, [rdx]
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax

    ; calculate breakout conditions: mm1 accumulates (x > limit) flags
    movq        mm2, [rdi+2*rax]                ; q3
    movq        mm1, [rsi+2*rax]                ; q2
    movq        mm6, mm1                        ; q2
    psubusb     mm1, mm2                        ; q2-=q3
    psubusb     mm2, mm6                        ; q3-=q2
    por         mm1, mm2                        ; abs(q3-q2)
    psubusb     mm1, mm7                        ; nonzero iff abs(q3-q2) > limit

    movq        mm4, [rsi+rax]                  ; q1
    movq        mm3, mm4                        ; q1
    psubusb     mm4, mm6                        ; q1-=q2
    psubusb     mm6, mm3                        ; q2-=q1
    por         mm4, mm6                        ; abs(q2-q1)

    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm4, [rsi]                      ; q0
    movq        mm0, mm4                        ; q0
    psubusb     mm4, mm3                        ; q0-=q1
    psubusb     mm3, mm0                        ; q1-=q0
    por         mm4, mm3                        ; abs(q0-q1)
    movq        t0, mm4                         ; save to t0 (reused for hev test)
    psubusb     mm4, mm7
    por         mm1, mm4

    neg         rax                             ; negate pitch to deal with above border

    movq        mm2, [rsi+4*rax]                ; p3
    movq        mm4, [rdi+4*rax]                ; p2
    movq        mm5, mm4                        ; p2
    psubusb     mm4, mm2                        ; p2-=p3
    psubusb     mm2, mm5                        ; p3-=p2
    por         mm4, mm2                        ; abs(p3 - p2)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm4, [rsi+2*rax]                ; p1
    movq        mm3, mm4                        ; p1
    psubusb     mm4, mm5                        ; p1-=p2
    psubusb     mm5, mm3                        ; p2-=p1
    por         mm4, mm5                        ; abs(p2 - p1)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm2, mm3                        ; p1

    movq        mm4, [rsi+rax]                  ; p0
    movq        mm5, mm4                        ; p0
    psubusb     mm4, mm3                        ; p0-=p1
    psubusb     mm3, mm5                        ; p1-=p0
    por         mm4, mm3                        ; abs(p1 - p0)
    movq        t1, mm4                         ; save to t1 (reused for hev test)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm3, [rdi]                      ; q1
    movq        mm4, mm3                        ; q1
    psubusb     mm3, mm2                        ; q1-=p1
    psubusb     mm2, mm4                        ; p1-=q1
    por         mm2, mm3                        ; abs(p1-q1)
    pand        mm2, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm2, 1                          ; abs(p1-q1)/2

    movq        mm6, mm5                        ; p0
    movq        mm3, [rsi]                      ; q0
    psubusb     mm5, mm3                        ; p0-=q0
    psubusb     mm3, mm6                        ; q0-=p0
    por         mm5, mm3                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm2                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit
    movq        mm2, [rdx]                      ; flimit mm2
    paddb       mm2, mm2                        ; flimit*2 (less than 255)
    paddb       mm7, mm2                        ; flimit*2 + limit (less than 255)

    psubusb     mm5, mm7                        ; abs(p0-q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
    por         mm1, mm5
    pxor        mm5, mm5
    pcmpeqb     mm1, mm5                        ; mask mm1: 0xff where edge should be filtered

    ; calculate high edge variance (hev)
    mov         rdx, arg(4)                     ;thresh
    movq        mm7, [rdx]
    movq        mm4, t0                         ; get abs(q1 - q0)
    psubusb     mm4, mm7
    movq        mm3, t1                         ; get abs(p1 - p0)
    psubusb     mm3, mm7
    paddb       mm4, mm3                        ; abs(q1-q0) > thresh || abs(p1-p0) > thresh
    pcmpeqb     mm4, mm5

    pcmpeqb     mm5, mm5
    pxor        mm4, mm5                        ; mm4 = hev mask

    ; start work on filters
    movq        mm2, [rsi+2*rax]                ; p1
    movq        mm7, [rdi]                      ; q1
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1
    pand        mm2, mm4                        ; high var mask (hvm)(p1 - q1)
    pxor        mm6, [GLOBAL(t80)]              ; p0: offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; q0: offset to convert to signed values
    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about
    movq        mm2, mm1
    paddsb      mm1, [GLOBAL(t4)]               ; 3*(q0 - p0) + hvm(p1 - q1) + 4  (Filter1)
    paddsb      mm2, [GLOBAL(t3)]               ; 3*(q0 - p0) + hvm(p1 - q1) + 3  (Filter2)

    ; arithmetic >>3 of Filter2, byte-wise, via widen/psraw/repack
    pxor        mm0, mm0
    pxor        mm5, mm5
    punpcklbw   mm0, mm2
    punpckhbw   mm5, mm2
    psraw       mm0, 11
    psraw       mm5, 11
    packsswb    mm0, mm5
    movq        mm2, mm0                        ; (3*(q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor        mm0, mm0                        ; 0
    movq        mm5, mm1                        ; abcdefgh
    punpcklbw   mm0, mm1                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    pxor        mm1, mm1                        ; 0
    punpckhbw   mm1, mm5                        ; a0b0c0d0
    psraw       mm1, 11                         ; sign extended shift right by 3
    movq        mm5, mm0                        ; save results

    packsswb    mm0, mm1                        ; (3*(q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw      mm5, [GLOBAL(ones)]
    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                          ; partial shifted one more time for 2nd tap
    psraw       mm1, 1                          ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                        ; (3*(q0 - p0) + hvm(p1 - q1) + 4) >>4
    pandn       mm4, mm5                        ; high edge variance additive

    paddsb      mm6, mm2                        ; p0 += p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+rax], mm6                  ; write back

    movq        mm6, [rsi+2*rax]                ; p1
    pxor        mm6, [GLOBAL(t80)]              ; reoffset
    paddsb      mm6, mm4                        ; p1 += p1 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+2*rax], mm6                ; write back

    psubsb      mm3, mm0                        ; q0 -= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset
    movq        [rsi], mm3                      ; write back

    psubsb      mm7, mm4                        ; q1 -= q1 add
    pxor        mm7, [GLOBAL(t80)]              ; unoffset
    movq        [rdi], mm7                      ; write back

    add         rsi, 8                          ; next 8-pixel group
    neg         rax                             ; restore positive pitch for next iteration
    dec         rcx
    jnz         next8_h

    add         rsp, 32
    pop         rsp                             ; undo ALIGN_STACK (original rsp was pushed)
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; VP8 "normal" loop filter applied across a vertical edge.
; Transposes an 8x8 pixel block (4 columns either side of the edge) into
; MMX registers, runs the same filter math as the horizontal version,
; then transposes the four modified columns (p1,p0,q0,q1) back.  srct is
; a 32-byte stack scratch area holding p1/p0/q0/q1 during filtering.
;-----------------------------------------------------------------------
;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_loop_filter_vertical_edge_mmx)
sym(vp8_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 64                         ; reserve 64 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[32];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step (pitch)

    lea         rsi, [rsi + rax*4 - 4]          ; point at row 4, 4 columns left of edge

    movsxd      rcx, dword ptr arg(5)           ;count
next8_v:
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax

    ;transpose 8x8 block; comments show byte layout "RC" = row R, column C
    movq        mm6, [rsi+2*rax]                ; 67 66 65 64 63 62 61 60
    movq        mm7, mm6                        ; 67 66 65 64 63 62 61 60

    punpckhbw   mm7, [rdi+2*rax]                ; 77 67 76 66 75 65 74 64
    punpcklbw   mm6, [rdi+2*rax]                ; 73 63 72 62 71 61 70 60

    movq        mm4, [rsi]                      ; 47 46 45 44 43 42 41 40
    movq        mm5, mm4                        ; 47 46 45 44 43 42 41 40

    punpckhbw   mm5, [rsi+rax]                  ; 57 47 56 46 55 45 54 44
    punpcklbw   mm4, [rsi+rax]                  ; 53 43 52 42 51 41 50 40

    movq        mm3, mm5                        ; 57 47 56 46 55 45 54 44
    punpckhwd   mm5, mm7                        ; 77 67 57 47 76 66 56 46

    punpcklwd   mm3, mm7                        ; 75 65 55 45 74 64 54 44
    movq        mm2, mm4                        ; 53 43 52 42 51 41 50 40

    punpckhwd   mm4, mm6                        ; 73 63 53 43 72 62 52 42
    punpcklwd   mm2, mm6                        ; 71 61 51 41 70 60 50 40

    neg         rax
    movq        mm6, [rsi+rax*2]                ; 27 26 25 24 23 22 21 20

    movq        mm1, mm6                        ; 27 26 25 24 23 22 21 20
    punpckhbw   mm6, [rsi+rax]                  ; 37 27 36 26 35 25 34 24

    punpcklbw   mm1, [rsi+rax]                  ; 33 23 32 22 31 21 30 20
    movq        mm7, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00

    punpckhbw   mm7, [rdi+rax*4]                ; 17 07 16 06 15 05 14 04
    movq        mm0, mm7                        ; 17 07 16 06 15 05 14 04

    punpckhwd   mm7, mm6                        ; 37 27 17 07 36 26 16 06
    punpcklwd   mm0, mm6                        ; 35 25 15 05 34 24 14 04

    movq        mm6, mm7                        ; 37 27 17 07 36 26 16 06
    punpckhdq   mm7, mm5                        ; 77 67 57 47 37 27 17 07 = q3

    punpckldq   mm6, mm5                        ; 76 66 56 46 36 26 16 06 = q2

    movq        mm5, mm6                        ; 76 66 56 46 36 26 16 06
    psubusb     mm5, mm7                        ; q2-q3

    psubusb     mm7, mm6                        ; q3-q2
    por         mm7, mm5                        ; mm7=abs(q3-q2)

    movq        mm5, mm0                        ; 35 25 15 05 34 24 14 04
    punpckhdq   mm5, mm3                        ; 75 65 55 45 35 25 15 05 = q1

    punpckldq   mm0, mm3                        ; 74 64 54 44 34 24 14 04 = q0
    movq        mm3, mm5                        ; 75 65 55 45 35 25 15 05 = q1

    psubusb     mm3, mm6                        ; q1-q2
    psubusb     mm6, mm5                        ; q2-q1

    por         mm6, mm3                        ; mm6=abs(q2-q1)
    lea         rdx, srct

    movq        [rdx+24], mm5                   ; save q1
    movq        [rdx+16], mm0                   ; save q0

    movq        mm3, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00
    punpcklbw   mm3, [rdi+rax*4]                ; 13 03 12 02 11 01 10 00

    movq        mm0, mm3                        ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm1                        ; 31 21 11 01 30 20 10 00

    punpckhwd   mm3, mm1                        ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                        ; 31 21 11 01 30 20 10 00

    punpckldq   mm0, mm2                        ; 70 60 50 40 30 20 10 00 = p3
    punpckhdq   mm1, mm2                        ; 71 61 51 41 31 21 11 01 = p2

    movq        mm2, mm1                        ; 71 61 51 41 31 21 11 01 = p2
    psubusb     mm2, mm0                        ; p2-p3

    psubusb     mm0, mm1                        ; p3-p2
    por         mm0, mm2                        ; mm0=abs(p3-p2)

    movq        mm2, mm3                        ; 33 23 13 03 32 22 12 02
    punpckldq   mm2, mm4                        ; 72 62 52 42 32 22 12 02 = p1

    punpckhdq   mm3, mm4                        ; 73 63 53 43 33 23 13 03 = p0
    movq        [rdx+8], mm3                    ; save p0

    movq        [rdx], mm2                      ; save p1
    movq        mm5, mm2                        ; mm5 = p1

    psubusb     mm2, mm1                        ; p1-p2
    psubusb     mm1, mm5                        ; p2-p1

    por         mm1, mm2                        ; mm1=abs(p2-p1)
    mov         rdx, arg(3)                     ;limit

    movq        mm4, [rdx]                      ; mm4 = limit
    psubusb     mm7, mm4

    psubusb     mm0, mm4
    psubusb     mm1, mm4

    psubusb     mm6, mm4
    por         mm7, mm6

    por         mm0, mm1
    por         mm0, mm7                        ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit

    movq        mm1, mm5                        ; p1

    movq        mm7, mm3                        ; mm3=mm7=p0
    psubusb     mm7, mm5                        ; p0 - p1

    psubusb     mm5, mm3                        ; p1 - p0
    por         mm5, mm7                        ; abs(p1-p0)

    movq        t0, mm5                         ; save abs(p1-p0)
    lea         rdx, srct

    psubusb     mm5, mm4
    por         mm0, mm5                        ; mm0=mask

    movq        mm5, [rdx+16]                   ; mm5=q0
    movq        mm7, [rdx+24]                   ; mm7=q1

    movq        mm6, mm5                        ; mm6=q0
    movq        mm2, mm7                        ; q1
    psubusb     mm5, mm7                        ; q0-q1

    psubusb     mm7, mm6                        ; q1-q0
    por         mm7, mm5                        ; abs(q1-q0)

    movq        t1, mm7                         ; save abs(q1-q0)
    psubusb     mm7, mm4

    por         mm0, mm7                        ; mask

    movq        mm5, mm2                        ; q1
    psubusb     mm5, mm1                        ; q1-=p1
    psubusb     mm1, mm2                        ; p1-=q1
    por         mm5, mm1                        ; abs(p1-q1)
    pand        mm5, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm5, 1                          ; abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit

    movq        mm2, [rdx]                      ;flimit mm2
    movq        mm1, mm3                        ; mm1=mm3=p0

    movq        mm7, mm6                        ; mm7=mm6=q0
    psubusb     mm1, mm7                        ; p0-q0

    psubusb     mm7, mm3                        ; q0-p0
    por         mm1, mm7                        ; abs(q0-p0)
    paddusb     mm1, mm1                        ; abs(q0-p0)*2
    paddusb     mm1, mm5                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    paddb       mm2, mm2                        ; flimit*2 (less than 255)
    paddb       mm4, mm2                        ; flimit*2 + limit (less than 255)

    psubusb     mm1, mm4                        ; abs(p0-q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
    por         mm1, mm0                        ; mask

    pxor        mm0, mm0
    pcmpeqb     mm1, mm0                        ; mask mm1: 0xff where edge should be filtered

    ; calculate high edge variance (hev)
    mov         rdx, arg(4)                     ;thresh
    movq        mm7, [rdx]

    movq        mm4, t0                         ; get abs(p1 - p0)
    psubusb     mm4, mm7

    movq        mm3, t1                         ; get abs(q1 - q0)
    psubusb     mm3, mm7

    por         mm4, mm3                        ; abs(q1-q0) > thresh || abs(p1-p0) > thresh
    pcmpeqb     mm4, mm0

    pcmpeqb     mm0, mm0
    pxor        mm4, mm0                        ; mm4 = hev mask

    ; start work on filters
    lea         rdx, srct

    movq        mm2, [rdx]                      ; p1
    movq        mm7, [rdx+24]                   ; q1

    movq        mm6, [rdx+8]                    ; p0
    movq        mm0, [rdx+16]                   ; q0

    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values

    psubsb      mm2, mm7                        ; p1 - q1
    pand        mm2, mm4                        ; high var mask (hvm)(p1 - q1)

    pxor        mm6, [GLOBAL(t80)]              ; p0: offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; q0: offset to convert to signed values

    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0

    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + hvm(p1 - q1)

    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    movq        mm2, mm1
    paddsb      mm1, [GLOBAL(t4)]               ; 3*(q0 - p0) + hvm(p1 - q1) + 4  (Filter1)

    paddsb      mm2, [GLOBAL(t3)]               ; 3*(q0 - p0) + hvm(p1 - q1) + 3  (Filter2)
    pxor        mm0, mm0

    pxor        mm5, mm5
    punpcklbw   mm0, mm2

    punpckhbw   mm5, mm2
    psraw       mm0, 11

    psraw       mm5, 11
    packsswb    mm0, mm5

    movq        mm2, mm0                        ; (3*(q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor        mm0, mm0                        ; 0
    movq        mm5, mm1                        ; abcdefgh

    punpcklbw   mm0, mm1                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3

    pxor        mm1, mm1                        ; 0
    punpckhbw   mm1, mm5                        ; a0b0c0d0

    psraw       mm1, 11                         ; sign extended shift right by 3
    movq        mm5, mm0                        ; save results

    packsswb    mm0, mm1                        ; (3*(q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw      mm5, [GLOBAL(ones)]

    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                          ; partial shifted one more time for 2nd tap

    psraw       mm1, 1                          ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                        ; (3*(q0 - p0) + hvm(p1 - q1) + 4) >>4

    pandn       mm4, mm5                        ; high edge variance additive

    paddsb      mm6, mm2                        ; p0 += p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset

    ; mm6=p0
    movq        mm1, [rdx]                      ; p1
    pxor        mm1, [GLOBAL(t80)]              ; reoffset

    paddsb      mm1, mm4                        ; p1 += p1 add
    pxor        mm1, [GLOBAL(t80)]              ; unoffset
    ; mm6 = p0 mm1 = p1

    psubsb      mm3, mm0                        ; q0 -= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset

    ; mm3 = q0
    psubsb      mm7, mm4                        ; q1 -= q1 add
    pxor        mm7, [GLOBAL(t80)]              ; unoffset
    ; mm7 = q1

    ; transpose the 4 filtered columns back and write them out
    ; mm1 = 72 62 52 42 32 22 12 02  (p1)
    ; mm6 = 73 63 53 43 33 23 13 03  (p0)
    ; mm3 = 74 64 54 44 34 24 14 04  (q0)
    ; mm7 = 75 65 55 45 35 25 15 05  (q1)

    movq        mm2, mm1                        ; 72 62 52 42 32 22 12 02
    punpcklbw   mm2, mm6                        ; 33 32 23 22 13 12 03 02

    movq        mm4, mm3                        ; 74 64 54 44 34 24 14 04
    punpckhbw   mm1, mm6                        ; 73 72 63 62 53 52 43 42

    punpcklbw   mm4, mm7                        ; 35 34 25 24 15 14 05 04
    punpckhbw   mm3, mm7                        ; 75 74 65 64 55 54 45 44

    movq        mm6, mm2                        ; 33 32 23 22 13 12 03 02
    punpcklwd   mm2, mm4                        ; 15 14 13 12 05 04 03 02

    punpckhwd   mm6, mm4                        ; 35 34 33 32 25 24 23 22
    movq        mm5, mm1                        ; 73 72 63 62 53 52 43 42

    punpcklwd   mm1, mm3                        ; 55 54 53 52 45 44 43 42
    punpckhwd   mm5, mm3                        ; 75 74 73 72 65 64 63 62

    ; mm2 = 15 14 13 12 05 04 03 02  (rows 0-1)
    ; mm6 = 35 34 33 32 25 24 23 22  (rows 2-3)
    ; mm1 = 55 54 53 52 45 44 43 42  (rows 4-5)
    ; mm5 = 75 74 73 72 65 64 63 62  (rows 6-7)

    movd        [rsi+rax*4+2], mm2              ; row 0, columns 2-5 (rax still negative)
    psrlq       mm2, 32

    movd        [rdi+rax*4+2], mm2              ; row 1
    movd        [rsi+rax*2+2], mm6              ; row 2

    psrlq       mm6, 32
    movd        [rsi+rax+2], mm6                ; row 3

    movd        [rsi+2], mm1                    ; row 4
    psrlq       mm1, 32

    movd        [rdi+2], mm1                    ; row 5
    neg         rax                             ; back to positive pitch

    movd        [rdi+rax+2], mm5                ; row 6
    psrlq       mm5, 32

    movd        [rdi+rax*2+2], mm5              ; row 7

    lea         rsi, [rsi+rax*8]                ; advance 8 rows to next group
    dec         rcx
    jnz         next8_v

    add         rsp, 64
    pop         rsp                             ; undo ALIGN_STACK (original rsp was pushed)
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; VP8 macroblock ("MB") loop filter applied across a horizontal edge.
; Same breakout/hev computation as the normal filter, but the filter
; taps are stronger and reach three pixels each side of the edge:
; p2/p1/p0 and q0/q1/q2 are all modified, using the 27/18/9 (approx
; 3/7, 2/7, 1/7) fixed-point taps via pmulhw with s27/s18/s9.
;-----------------------------------------------------------------------
;void vp8_mbloop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
global sym(vp8_mbloop_filter_horizontal_edge_mmx)
sym(vp8_mbloop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step (pitch)

    movsxd      rcx, dword ptr arg(5)           ;count
next8_mbh:
    mov         rdx, arg(3)                     ;limit
    movq        mm7, [rdx]
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax

    ; calculate breakout conditions
    movq        mm2, [rdi+2*rax]                ; q3

    movq        mm1, [rsi+2*rax]                ; q2
    movq        mm6, mm1                        ; q2
    psubusb     mm1, mm2                        ; q2-=q3
    psubusb     mm2, mm6                        ; q3-=q2
    por         mm1, mm2                        ; abs(q3-q2)
    psubusb     mm1, mm7

    ; mm1 = abs(q3-q2), mm6 = q2, mm7 = limit
    movq        mm4, [rsi+rax]                  ; q1
    movq        mm3, mm4                        ; q1
    psubusb     mm4, mm6                        ; q1-=q2
    psubusb     mm6, mm3                        ; q2-=q1
    por         mm4, mm6                        ; abs(q2-q1)
    psubusb     mm4, mm7
    por         mm1, mm4

    ; mm1 = mask, mm3=q1, mm7 = limit

    movq        mm4, [rsi]                      ; q0
    movq        mm0, mm4                        ; q0
    psubusb     mm4, mm3                        ; q0-=q1
    psubusb     mm3, mm0                        ; q1-=q0
    por         mm4, mm3                        ; abs(q0-q1)
    movq        t0, mm4                         ; save to t0 (reused for hev test)
    psubusb     mm4, mm7
    por         mm1, mm4

    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)

    neg         rax                             ; negate pitch to deal with above border

    movq        mm2, [rsi+4*rax]                ; p3
    movq        mm4, [rdi+4*rax]                ; p2
    movq        mm5, mm4                        ; p2
    psubusb     mm4, mm2                        ; p2-=p3
    psubusb     mm2, mm5                        ; p3-=p2
    por         mm4, mm2                        ; abs(p3 - p2)
    psubusb     mm4, mm7
    por         mm1, mm4
    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)

    movq        mm4, [rsi+2*rax]                ; p1
    movq        mm3, mm4                        ; p1
    psubusb     mm4, mm5                        ; p1-=p2
    psubusb     mm5, mm3                        ; p2-=p1
    por         mm4, mm5                        ; abs(p2 - p1)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm2, mm3                        ; p1

    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1)

    movq        mm4, [rsi+rax]                  ; p0
    movq        mm5, mm4                        ; p0
    psubusb     mm4, mm3                        ; p0-=p1
    psubusb     mm3, mm5                        ; p1-=p0
    por         mm4, mm3                        ; abs(p1 - p0)
    movq        t1, mm4                         ; save to t1 (reused for hev test)
    psubusb     mm4, mm7
    por         mm1, mm4
    ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm5 = p0
    movq        mm3, [rdi]                      ; q1
    movq        mm4, mm3                        ; q1
    psubusb     mm3, mm2                        ; q1-=p1
    psubusb     mm2, mm4                        ; p1-=q1
    por         mm2, mm3                        ; abs(p1-q1)
    pand        mm2, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm2, 1                          ; abs(p1-q1)/2

    movq        mm6, mm5                        ; p0
    movq        mm3, mm0                        ; q0
    psubusb     mm5, mm3                        ; p0-=q0
    psubusb     mm3, mm6                        ; q0-=p0
    por         mm5, mm3                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm2                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit
    movq        mm2, [rdx]                      ; flimit mm2
    paddb       mm2, mm2                        ; flimit*2 (less than 255)
    paddb       mm7, mm2                        ; flimit*2 + limit (less than 255)

    psubusb     mm5, mm7                        ; abs(p0-q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
    por         mm1, mm5
    pxor        mm5, mm5
    pcmpeqb     mm1, mm5                        ; mask mm1
    ; mm1 = mask, mm0=q0, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm6 = p0

    ; calculate high edge variance
    mov         rdx, arg(4)                     ;thresh
    movq        mm7, [rdx]
    movq        mm4, t0                         ; get abs(q1 - q0)
    psubusb     mm4, mm7
    movq        mm3, t1                         ; get abs(p1 - p0)
    psubusb     mm3, mm7
    paddb       mm4, mm3                        ; abs(q1-q0) > thresh || abs(p1-p0) > thresh
    pcmpeqb     mm4, mm5

    pcmpeqb     mm5, mm5
    pxor        mm4, mm5

    ; mm1 = mask, mm0=q0, t0 = abs(q0-q1) t1 = abs(p1-p0)
    ; mm6 = p0, mm4=hev
    ; start work on filters
    movq        mm2, [rsi+2*rax]                ; p1
    movq        mm7, [rdi]                      ; q1
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1

    pxor        mm6, [GLOBAL(t80)]              ; p0: offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; q0: offset to convert to signed values
    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + (p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
    movq        mm2, mm1                        ; vp8_filter
    pand        mm2, mm4                        ; Filter2 = vp8_filter & hev

    movq        mm5, mm2
    paddsb      mm5, [GLOBAL(t3)]               ; Filter2 + 3

    ; arithmetic >>3, byte-wise, via widen/psraw/repack
    pxor        mm0, mm0                        ; 0
    pxor        mm7, mm7                        ; 0

    punpcklbw   mm0, mm5                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm5                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; (Filter2 + 3) >> 3

    movq        mm5, mm0                        ; Filter2

    paddsb      mm2, [GLOBAL(t4)]               ; vp8_signed_char_clamp(Filter2 + 4)
    pxor        mm0, mm0                        ; 0
    pxor        mm7, mm7                        ; 0

    punpcklbw   mm0, mm2                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm2                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; (Filter2 + 4) >> 3  = Filter1

    ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
    psubsb      mm3, mm0                        ; qs0 = qs0 - Filter1
    paddsb      mm6, mm5                        ; ps0 = ps0 + Filter2

    ; mm1=vp8_filter, mm3=qs0, mm4=hev, mm6=ps0
    ; vp8_filter &= ~hev;
    ; Filter2 = vp8_filter;
    pandn       mm4, mm1                        ; vp8_filter &= ~hev

    ; mm3=qs0, mm4=filter2, mm6=ps0

    ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
    ; s = vp8_signed_char_clamp(qs0 - u);
    ; *oq0 = s^0x80;
    ; s = vp8_signed_char_clamp(ps0 + u);
    ; *op0 = s^0x80;
    pxor        mm0, mm0

    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s27)]
    pmulhw      mm2, [GLOBAL(s27)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    psubsb      mm3, mm1
    paddsb      mm6, mm1

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]
    movq        [rsi+rax], mm6                  ; write back p0
    movq        [rsi], mm3                      ; write back q0

    ; roughly 2/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
    ; s = vp8_signed_char_clamp(qs1 - u);
    ; *oq1 = s^0x80;
    ; s = vp8_signed_char_clamp(ps1 + u);
    ; *op1 = s^0x80;
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s18)]
    pmulhw      mm2, [GLOBAL(s18)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    movq        mm3, [rdi]                      ; q1
    movq        mm6, [rsi+rax*2]                ; p1

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]

    paddsb      mm6, mm1
    psubsb      mm3, mm1

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]
    movq        [rdi], mm3                      ; write back q1
    movq        [rsi+rax*2], mm6                ; write back p1

    ; roughly 1/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
    ; s = vp8_signed_char_clamp(qs2 - u);
    ; *oq2 = s^0x80;
    ; s = vp8_signed_char_clamp(ps2 + u);
    ; *op2 = s^0x80;
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpcklbw   mm1, mm4
    punpckhbw   mm2, mm4
    pmulhw      mm1, [GLOBAL(s9)]
    pmulhw      mm2, [GLOBAL(s9)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]
    psraw       mm1, 7
    psraw       mm2, 7
    packsswb    mm1, mm2

    movq        mm6, [rdi+rax*4]                ; p2 (rax is negative here)
    neg         rax
    movq        mm3, [rdi+rax]                  ; q2

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]

    paddsb      mm6, mm1
    psubsb      mm3, mm1

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]
    movq        [rdi+rax], mm3                  ; write back q2
    neg         rax
    movq        [rdi+rax*4], mm6                ; write back p2

;EARLY_BREAK_OUT:
    neg         rax                             ; restore positive pitch for next iteration
    add         rsi, 8                          ; next 8-pixel group
    dec         rcx
    jnz         next8_mbh

    add         rsp, 32
    pop         rsp                             ; undo ALIGN_STACK (original rsp was pushed)
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
921 ;void vp8_mbloop_filter_vertical_edge_mmx
923 ; unsigned char *src_ptr,
924 ; int src_pixel_step,
925 ; const char *flimit,
926 ; const char *limit,
927 ; const char *thresh,
928 ; int count
930 global sym(vp8_mbloop_filter_vertical_edge_mmx)
931 sym(vp8_mbloop_filter_vertical_edge_mmx):
932 push rbp
933 mov rbp, rsp
934 SHADOW_ARGS_TO_STACK 6
935 GET_GOT rbx
936 push rsi
937 push rdi
938 ; end prolog
940 ALIGN_STACK 16, rax
941 sub rsp, 96 ; reserve 96 bytes
942 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
943 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
944 %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
946 mov rsi, arg(0) ;src_ptr
947 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
949 lea rsi, [rsi + rax*4 - 4]
951 movsxd rcx, dword ptr arg(5) ;count
952 next8_mbv:
953 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
955 ;transpose
956 movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70
957 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60
959 movq mm7, mm6 ; 77 76 75 74 73 72 71 70
960 punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64
962 punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
963 movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50
965 movq mm4, [rsi] ; 47 46 45 44 43 42 41 40
966 movq mm5, mm4 ; 47 46 45 44 43 42 41 40
968 punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44
969 punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
971 movq mm3, mm5 ; 57 47 56 46 55 45 54 44
972 punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46
974 punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44
975 movq mm2, mm4 ; 53 43 52 42 51 41 50 40
977 punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42
978 punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40
980 neg rax
982 movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30
983 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20
985 movq mm1, mm6 ; 27 26 25 24 23 22 21 20
986 punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24
988 punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20
990 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00
991 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04
993 movq mm0, mm7 ; 17 07 16 06 15 05 14 04
994 punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06
996 punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04
997 movq mm6, mm7 ; 37 27 17 07 36 26 16 06
999 punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3
1000 punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2
1002 lea rdx, srct
1003 movq mm5, mm6 ; 76 66 56 46 36 26 16 06
1005 movq [rdx+56], mm7
1006 psubusb mm5, mm7 ; q2-q3
1009 movq [rdx+48], mm6
1010 psubusb mm7, mm6 ; q3-q2
1012 por mm7, mm5; ; mm7=abs (q3-q2)
1013 movq mm5, mm0 ; 35 25 15 05 34 24 14 04
1015 punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1
1016 punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0
1018 movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1
1019 psubusb mm3, mm6 ; q1-q2
1021 psubusb mm6, mm5 ; q2-q1
1022 por mm6, mm3 ; mm6=abs(q2-q1)
1024 movq [rdx+40], mm5 ; save q1
1025 movq [rdx+32], mm0 ; save q0
1027 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00
1028 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00
1030 movq mm0, mm3 ; 13 03 12 02 11 01 10 00
1031 punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00
1033 punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02
1034 movq mm1, mm0 ; 31 21 11 01 30 20 10 00
1036 punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3
1037 punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2
1039 movq [rdx], mm0 ; save p3
1040 movq [rdx+8], mm1 ; save p2
1042 movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2
1043 psubusb mm2, mm0 ; p2-p3
1045 psubusb mm0, mm1 ; p3-p2
1046 por mm0, mm2 ; mm0=abs(p3-p2)
1048 movq mm2, mm3 ; 33 23 13 03 32 22 12 02
1049 punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1
1051 punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0
1052 movq [rdx+24], mm3 ; save p0
1054 movq [rdx+16], mm2 ; save p1
1055 movq mm5, mm2 ; mm5 = p1
1057 psubusb mm2, mm1 ; p1-p2
1058 psubusb mm1, mm5 ; p2-p1
1060 por mm1, mm2 ; mm1=abs(p2-p1)
1061 mov rdx, arg(3) ;limit
1063 movq mm4, [rdx] ; mm4 = limit
1064 psubusb mm7, mm4 ; abs(q3-q2) > limit
1066 psubusb mm0, mm4 ; abs(p3-p2) > limit
1067 psubusb mm1, mm4 ; abs(p2-p1) > limit
1069 psubusb mm6, mm4 ; abs(q2-q1) > limit
1070 por mm7, mm6 ; or
1072 por mm0, mm1 ;
1073 por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
1075 movq mm1, mm5 ; p1
1077 movq mm7, mm3 ; mm3=mm7=p0
1078 psubusb mm7, mm5 ; p0 - p1
1080 psubusb mm5, mm3 ; p1 - p0
1081 por mm5, mm7 ; abs(p1-p0)
1083 movq t0, mm5 ; save abs(p1-p0)
1084 lea rdx, srct
1086 psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit
1087 por mm0, mm5 ; mm0=mask
1089 movq mm5, [rdx+32] ; mm5=q0
1090 movq mm7, [rdx+40] ; mm7=q1
1092 movq mm6, mm5 ; mm6=q0
1093 movq mm2, mm7 ; q1
1094 psubusb mm5, mm7 ; q0-q1
1096 psubusb mm7, mm6 ; q1-q0
1097 por mm7, mm5 ; abs(q1-q0)
1099 movq t1, mm7 ; save abs(q1-q0)
1100 psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit
1102 por mm0, mm7 ; mask
1104 movq mm5, mm2 ; q1
1105 psubusb mm5, mm1 ; q1-=p1
1106 psubusb mm1, mm2 ; p1-=q1
1107 por mm5, mm1 ; abs(p1-q1)
1108 pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
1109 psrlw mm5, 1 ; abs(p1-q1)/2
1111 mov rdx, arg(2) ;flimit ;
1113 movq mm2, [rdx] ;flimit mm2
1114 movq mm1, mm3 ; mm1=mm3=p0
1116 movq mm7, mm6 ; mm7=mm6=q0
1117 psubusb mm1, mm7 ; p0-q0
1119 psubusb mm7, mm3 ; q0-p0
1120 por mm1, mm7 ; abs(q0-p0)
1121 paddusb mm1, mm1 ; abs(q0-p0)*2
1122 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
1124 paddb mm2, mm2 ; flimit*2 (less than 255)
1125 paddb mm4, mm2 ; flimit * 2 + limit (less than 255)
1127 psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
1128 por mm1, mm0; ; mask
1130 pxor mm0, mm0
1131 pcmpeqb mm1, mm0
1133 ; calculate high edge variance
1134 mov rdx, arg(4) ;thresh ; get thresh
1135 movq mm7, [rdx]
1137 movq mm4, t0 ; get abs (q1 - q0)
1138 psubusb mm4, mm7 ; abs(q1 - q0) > thresh
1140 movq mm3, t1 ; get abs (p1 - p0)
1141 psubusb mm3, mm7 ; abs(p1 - p0)> thresh
1143 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
1144 pcmpeqb mm4, mm0
1146 pcmpeqb mm0, mm0
1147 pxor mm4, mm0
1152 ; start work on filters
1153 lea rdx, srct
1155 ; start work on filters
1156 movq mm2, [rdx+16] ; p1
1157 movq mm7, [rdx+40] ; q1
1158 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
1159 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
1160 psubsb mm2, mm7 ; p1 - q1
1162 movq mm6, [rdx+24] ; p0
1163 movq mm0, [rdx+32] ; q0
1164 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
1165 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
1167 movq mm3, mm0 ; q0
1168 psubsb mm0, mm6 ; q0 - p0
1169 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1)
1170 paddsb mm2, mm0 ; 2 * (q0 - p0)
1171 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1)
1172 pand mm1, mm2 ; mask filter values we don't care about
1174 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0
1175 movq mm2, mm1 ; vp8_filter
1176 pand mm2, mm4; ; Filter2 = vp8_filter & hev
1178 movq mm5, mm2 ;
1179 paddsb mm5, [GLOBAL(t3)];
1181 pxor mm0, mm0 ; 0
1182 pxor mm7, mm7 ; 0
1184 punpcklbw mm0, mm5 ; e0f0g0h0
1185 psraw mm0, 11 ; sign extended shift right by 3
1186 punpckhbw mm7, mm5 ; a0b0c0d0
1187 psraw mm7, 11 ; sign extended shift right by 3
1188 packsswb mm0, mm7 ; Filter2 >>=3;
1190 movq mm5, mm0 ; Filter2
1192 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
1193 pxor mm0, mm0 ; 0
1194 pxor mm7, mm7 ; 0
1196 punpcklbw mm0, mm2 ; e0f0g0h0
1197 psraw mm0, 11 ; sign extended shift right by 3
1198 punpckhbw mm7, mm2 ; a0b0c0d0
1199 psraw mm7, 11 ; sign extended shift right by 3
1200 packsswb mm0, mm7 ; Filter2 >>=3;
1202 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0
1203 psubsb mm3, mm0 ; qs0 =qs0 - filter1
1204 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2
1206 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0
1207 ; vp8_filter &= ~hev;
1208 ; Filter2 = vp8_filter;
1209 pandn mm4, mm1 ; vp8_filter&=~hev
1212 ; mm3=qs0, mm4=filter2, mm6=ps0
1214 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7);
1215 ; s = vp8_signed_char_clamp(qs0 - u);
1216 ; *oq0 = s^0x80;
1217 ; s = vp8_signed_char_clamp(ps0 + u);
1218 ; *op0 = s^0x80;
1219 pxor mm0, mm0
1221 pxor mm1, mm1
1222 pxor mm2, mm2
1223 punpcklbw mm1, mm4
1224 punpckhbw mm2, mm4
1225 pmulhw mm1, [GLOBAL(s27)]
1226 pmulhw mm2, [GLOBAL(s27)]
1227 paddw mm1, [GLOBAL(s63)]
1228 paddw mm2, [GLOBAL(s63)]
1229 psraw mm1, 7
1230 psraw mm2, 7
1231 packsswb mm1, mm2
1233 psubsb mm3, mm1
1234 paddsb mm6, mm1
1236 pxor mm3, [GLOBAL(t80)]
1237 pxor mm6, [GLOBAL(t80)]
1238 movq [rdx+24], mm6
1239 movq [rdx+32], mm3
1241 ; roughly 2/7th difference across boundary
1242 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7);
1243 ; s = vp8_signed_char_clamp(qs1 - u);
1244 ; *oq1 = s^0x80;
1245 ; s = vp8_signed_char_clamp(ps1 + u);
1246 ; *op1 = s^0x80;
1247 pxor mm1, mm1
1248 pxor mm2, mm2
1249 punpcklbw mm1, mm4
1250 punpckhbw mm2, mm4
1251 pmulhw mm1, [GLOBAL(s18)]
1252 pmulhw mm2, [GLOBAL(s18)]
1253 paddw mm1, [GLOBAL(s63)]
1254 paddw mm2, [GLOBAL(s63)]
1255 psraw mm1, 7
1256 psraw mm2, 7
1257 packsswb mm1, mm2
1259 movq mm3, [rdx + 40]
1260 movq mm6, [rdx + 16] ; p1
1261 pxor mm3, [GLOBAL(t80)]
1262 pxor mm6, [GLOBAL(t80)]
1264 paddsb mm6, mm1
1265 psubsb mm3, mm1
1267 pxor mm6, [GLOBAL(t80)]
1268 pxor mm3, [GLOBAL(t80)]
1269 movq [rdx + 40], mm3
1270 movq [rdx + 16], mm6
1272 ; roughly 1/7th difference across boundary
1273 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7);
1274 ; s = vp8_signed_char_clamp(qs2 - u);
1275 ; *oq2 = s^0x80;
1276 ; s = vp8_signed_char_clamp(ps2 + u);
1277 ; *op2 = s^0x80;
1278 pxor mm1, mm1
1279 pxor mm2, mm2
1280 punpcklbw mm1, mm4
1281 punpckhbw mm2, mm4
1282 pmulhw mm1, [GLOBAL(s9)]
1283 pmulhw mm2, [GLOBAL(s9)]
1284 paddw mm1, [GLOBAL(s63)]
1285 paddw mm2, [GLOBAL(s63)]
1286 psraw mm1, 7
1287 psraw mm2, 7
1288 packsswb mm1, mm2
1290 movq mm6, [rdx+ 8]
1291 movq mm3, [rdx+48]
1293 pxor mm6, [GLOBAL(t80)]
1294 pxor mm3, [GLOBAL(t80)]
1296 paddsb mm6, mm1
1297 psubsb mm3, mm1
1299 pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
1300 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
1302 ; tranpose and write back
1303 movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
1304 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
1306 punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00
1307 punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40
1309 movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02
1310 movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02
1312 punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02
1313 punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42
1315 movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00
1316 punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00
1318 punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20
1319 movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40
1321 punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40
1322 punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60
1324 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
1325 punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04
1327 movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06
1328 punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06
1330 movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04
1331 punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04
1333 punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24
1334 movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00
1336 punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00
1337 punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10
1339 movq [rsi+rax*4], mm0 ; write out
1340 movq [rdi+rax*4], mm6 ; write out
1342 movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20
1343 punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20
1345 punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30
1346 movq [rsi+rax*2], mm0 ; write out
1348 movq [rdi+rax*2], mm5 ; write out
1349 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04
1351 punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44
1352 punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46
1354 movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44
1355 punpcklwd mm2, mm3 ; mm2 = 57 56 55 54 47 46 45 44
1357 punpckhwd mm5, mm3 ; mm5 = 77 76 75 74 67 66 65 64
1358 movq mm0, mm1 ; mm0= 53 52 51 50 43 42 41 40
1360 movq mm3, mm4 ; mm4 = 73 72 71 70 63 62 61 60
1361 punpckldq mm0, mm2 ; mm0 = 47 46 45 44 43 42 41 40
1363 punpckhdq mm1, mm2 ; mm1 = 57 56 55 54 53 52 51 50
1364 movq [rsi], mm0 ; write out
1366 movq [rdi], mm1 ; write out
1367 neg rax
1369 punpckldq mm3, mm5 ; mm3 = 67 66 65 64 63 62 61 60
1370 punpckhdq mm4, mm5 ; mm4 = 77 76 75 74 73 72 71 60
1372 movq [rsi+rax*2], mm3
1373 movq [rdi+rax*2], mm4
1375 lea rsi, [rsi+rax*8]
1376 dec rcx
1378 jnz next8_mbv
1380 add rsp, 96
1381 pop rsp
1382 ; begin epilog
1383 pop rdi
1384 pop rsi
1385 RESTORE_GOT
1386 UNSHADOW_ARGS
1387 pop rbp
1391 ;void vp8_loop_filter_simple_horizontal_edge_mmx
1393 ; unsigned char *src_ptr,
1394 ; int src_pixel_step,
1395 ; const char *flimit,
1396 ; const char *limit,
1397 ; const char *thresh,
1398 ; int count
;-----------------------------------------------------------------------------
; void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *src_ptr,
;                                                 int src_pixel_step,
;                                                 const char *flimit,
;                                                 const char *limit,
;                                                 const char *thresh,
;                                                 int count)
; VP8 "simple" loop filter across a horizontal edge: for `count` groups of
; eight pixels it adjusts only the two rows straddling the edge (p0 above,
; q0 below).  thresh (arg 4) is not read by the simple filter.
; Clobbers: rax, rcx, rdx, mm0-mm7 (rsi/rdi saved in prolog).
;-----------------------------------------------------------------------------
global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixel_step ; destination pitch?

    movsxd      rcx, dword ptr arg(5)   ;count
nexts8_h:
    mov         rdx, arg(3)             ;limit
    movq        mm7, [rdx]
    mov         rdx, arg(2)             ;flimit         ; get flimit
    movq        mm3, [rdx]              ;
    paddb       mm3, mm3                ; flimit*2 (less than 255)
    paddb       mm3, mm7                ; flimit * 2 + limit (less than 255)

    mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
    add         rdi, rax
    neg         rax                     ; rax now negative: rows above via rsi+rax

    ; calculate mask
    movq        mm1, [rsi+2*rax]        ; p1
    movq        mm0, [rdi]              ; q1
    movq        mm2, mm1
    movq        mm7, mm0
    movq        mm4, mm0
    psubusb     mm0, mm1                ; q1-=p1
    psubusb     mm1, mm4                ; p1-=q1
    por         mm1, mm0                ; abs(p1-q1)
    pand        mm1, [GLOBAL(tfe)]      ; set lsb of each byte to zero
    psrlw       mm1, 1                  ; abs(p1-q1)/2

    movq        mm5, [rsi+rax]          ; p0
    movq        mm4, [rsi]              ; q0
    movq        mm0, mm4                ; q0
    movq        mm6, mm5                ; p0
    psubusb     mm5, mm4                ; p0-=q0
    psubusb     mm4, mm6                ; q0-=p0
    por         mm5, mm4                ; abs(p0 - q0)
    paddusb     mm5, mm5                ; abs(p0-q0)*2
    paddusb     mm5, mm1                ; abs (p0 - q0) *2 + abs(p1-q1)/2

    psubusb     mm5, mm3                ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        mm3, mm3
    pcmpeqb     mm5, mm3                ; mm5 = mask: 0xff where the edge should be filtered

    ; start work on filters
    pxor        mm2, [GLOBAL(t80)]      ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]      ; q1 offset to convert to signed values
    psubsb      mm2, mm7                ; p1 - q1

    pxor        mm6, [GLOBAL(t80)]      ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]      ; offset to convert to signed values
    movq        mm3, mm0                ; q0
    psubsb      mm0, mm6                ; q0 - p0
    paddsb      mm2, mm0                ; p1 - q1 + 1 * (q0 - p0)
    paddsb      mm2, mm0                ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm2, mm0                ; p1 - q1 + 3 * (q0 - p0)
    pand        mm5, mm2                ; mask filter values we don't care about

    ; do + 4 side
    paddsb      mm5, [GLOBAL(t4)]       ; 3* (q0 - p0) + (p1 - q1) + 4

    ; per-byte signed >>3: MMX has no byte shift, so each byte is isolated
    ; in a word, shifted with its own sign, then recombined.
    movq        mm0, mm5                ; get a copy of filters
    psllw       mm0, 8                  ; low byte of each word to the top
    psraw       mm0, 3                  ; arithmetic shift right 3
    psrlw       mm0, 8                  ; back down to the low byte
    movq        mm1, mm5                ; get a copy of filters
    psraw       mm1, 11                 ; high byte >>3, sign-extended (8 + 3)
    psllw       mm1, 8                  ; shift left 8 to put it back

    por         mm0, mm1                ; put the two together to get result

    psubsb      mm3, mm0                ; q0 -= Filter1
    pxor        mm3, [GLOBAL(t80)]      ; unoffset
    movq        [rsi], mm3              ; write back

    ; now do +3 side
    psubsb      mm5, [GLOBAL(t1s)]      ; +3 instead of +4

    movq        mm0, mm5                ; get a copy of filters
    psllw       mm0, 8                  ; low byte of each word to the top
    psraw       mm0, 3                  ; arithmetic shift right 3
    psrlw       mm0, 8                  ; back down to the low byte
    psraw       mm5, 11                 ; high byte >>3, sign-extended (8 + 3)
    psllw       mm5, 8                  ; shift left 8 to put it back
    por         mm0, mm5                ; put the two together to get result

    paddsb      mm6, mm0                ; p0 += Filter2
    pxor        mm6, [GLOBAL(t80)]      ; unoffset
    movq        [rsi+rax], mm6          ; write back

    add         rsi, 8                  ; next 8-pixel group
    neg         rax                     ; restore positive pitch for next iteration
    dec         rcx
    jnz         nexts8_h

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; restored: control must not fall through
1514 ;void vp8_loop_filter_simple_vertical_edge_mmx
1516 ; unsigned char *src_ptr,
1517 ; int src_pixel_step,
1518 ; const char *flimit,
1519 ; const char *limit,
1520 ; const char *thresh,
1521 ; int count
;-----------------------------------------------------------------------------
; void vp8_loop_filter_simple_vertical_edge_mmx(unsigned char *src_ptr,
;                                               int src_pixel_step,
;                                               const char *flimit,
;                                               const char *limit,
;                                               const char *thresh,
;                                               int count)
; VP8 "simple" loop filter across a vertical edge.  For each group of eight
; rows it loads the 4 columns straddling the edge, transposes them into
; p1/p0/q0/q1 vectors, filters p0/q0, and transposes back for the store.
; thresh (arg 4) is not read by the simple filter.  Uses 32 bytes of
; 16-aligned stack scratch (t0/t1) to hold p1/q1 across the filter step.
;-----------------------------------------------------------------------------
global sym(vp8_loop_filter_simple_vertical_edge_mmx)
sym(vp8_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                 ; reserve 32 bytes
    %define t0  [rsp + 0]               ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]              ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixel_step ; destination pitch?

    lea         rsi, [rsi + rax*4 - 2]  ; row 4, two pixels left of the edge
    movsxd      rcx, dword ptr arg(5)   ;count
nexts8_v:
    lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
    movd        mm0, [rdi + rax * 2]    ; xx xx xx xx 73 72 71 70

    movd        mm6, [rsi + rax * 2]    ; xx xx xx xx 63 62 61 60
    punpcklbw   mm6, mm0                ; 73 63 72 62 71 61 70 60

    movd        mm0, [rsi + rax]        ; xx xx xx xx 53 52 51 50
    movd        mm4, [rsi]              ; xx xx xx xx 43 42 41 40

    punpcklbw   mm4, mm0                ; 53 43 52 42 51 41 50 40
    movq        mm5, mm4                ; 53 43 52 42 51 41 50 40

    punpcklwd   mm4, mm6                ; 71 61 51 41 70 60 50 40
    punpckhwd   mm5, mm6                ; 73 63 53 43 72 62 52 42

    neg         rax                     ; negative pitch to reach the rows above

    movd        mm7, [rsi + rax]        ; xx xx xx xx 33 32 31 30
    movd        mm6, [rsi + rax * 2]    ; xx xx xx xx 23 22 21 20

    punpcklbw   mm6, mm7                ; 33 23 32 22 31 21 30 20
    movd        mm1, [rdi + rax * 4]    ; xx xx xx xx 13 12 11 10

    movd        mm0, [rsi + rax * 4]    ; xx xx xx xx 03 02 01 00
    punpcklbw   mm0, mm1                ; 13 03 12 02 11 01 10 00

    movq        mm2, mm0                ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm6                ; 31 21 11 01 30 20 10 00

    punpckhwd   mm2, mm6                ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                ; 13 03 12 02 11 01 10 00

    punpckldq   mm0, mm4                ; 70 60 50 40 30 20 10 00 = p1
    movq        mm3, mm2                ; 33 23 13 03 32 22 12 02

    punpckhdq   mm1, mm4                ; 71 61 51 41 31 21 11 01 = p0
    punpckldq   mm2, mm5                ; 72 62 52 42 32 22 12 02 = q0

    punpckhdq   mm3, mm5                ; 73 63 53 43 33 23 13 03 = q1

    ; calculate mask
    movq        mm6, mm0                ; p1
    movq        mm7, mm3                ; q1
    psubusb     mm7, mm6                ; q1-=p1
    psubusb     mm6, mm3                ; p1-=q1
    por         mm6, mm7                ; abs(p1-q1)
    pand        mm6, [GLOBAL(tfe)]      ; set lsb of each byte to zero
    psrlw       mm6, 1                  ; abs(p1-q1)/2

    movq        mm5, mm1                ; p0
    movq        mm4, mm2                ; q0

    psubusb     mm5, mm2                ; p0-=q0
    psubusb     mm4, mm1                ; q0-=p0

    por         mm5, mm4                ; abs(p0 - q0)
    paddusb     mm5, mm5                ; abs(p0-q0)*2
    paddusb     mm5, mm6                ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov         rdx, arg(2)             ;flimit         ; get flimit
    movq        mm7, [rdx]
    mov         rdx, arg(3)             ; get limit
    movq        mm6, [rdx]
    paddb       mm7, mm7                ; flimit*2 (less than 255)
    paddb       mm7, mm6                ; flimit * 2 + limit (less than 255)

    psubusb     mm5, mm7                ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        mm7, mm7
    pcmpeqb     mm5, mm7                ; mm5 = mask

    ; start work on filters
    movq        t0, mm0                 ; stash unfiltered p1 for the transpose back
    movq        t1, mm3                 ; stash unfiltered q1 for the transpose back

    pxor        mm0, [GLOBAL(t80)]      ; p1 offset to convert to signed values
    pxor        mm3, [GLOBAL(t80)]      ; q1 offset to convert to signed values

    psubsb      mm0, mm3                ; p1 - q1
    movq        mm6, mm1                ; p0

    movq        mm7, mm2                ; q0
    pxor        mm6, [GLOBAL(t80)]      ; offset to convert to signed values

    pxor        mm7, [GLOBAL(t80)]      ; offset to convert to signed values
    movq        mm3, mm7                ; offseted ; q0

    psubsb      mm7, mm6                ; q0 - p0
    paddsb      mm0, mm7                ; p1 - q1 + 1 * (q0 - p0)

    paddsb      mm0, mm7                ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm0, mm7                ; p1 - q1 + 3 * (q0 - p0)

    pand        mm5, mm0                ; mask filter values we don't care about

    paddsb      mm5, [GLOBAL(t4)]       ; 3* (q0 - p0) + (p1 - q1) + 4

    ; per-byte signed >>3: MMX has no byte shift, so each byte is isolated
    ; in a word, shifted with its own sign, then recombined.
    movq        mm0, mm5                ; get a copy of filters
    psllw       mm0, 8                  ; low byte of each word to the top
    psraw       mm0, 3                  ; arithmetic shift right 3
    psrlw       mm0, 8                  ; back down to the low byte

    movq        mm7, mm5                ; get a copy of filters
    psraw       mm7, 11                 ; high byte >>3, sign-extended (8 + 3)
    psllw       mm7, 8                  ; shift left 8 to put it back

    por         mm0, mm7                ; put the two together to get result

    psubsb      mm3, mm0                ; q0 -= Filter1
    pxor        mm3, [GLOBAL(t80)]      ; unoffset

    ; now do +3 side
    psubsb      mm5, [GLOBAL(t1s)]      ; +3 instead of +4

    movq        mm0, mm5                ; get a copy of filters
    psllw       mm0, 8                  ; low byte of each word to the top
    psraw       mm0, 3                  ; arithmetic shift right 3
    psrlw       mm0, 8                  ; back down to the low byte

    psraw       mm5, 11                 ; high byte >>3, sign-extended (8 + 3)
    psllw       mm5, 8                  ; shift left 8 to put it back
    por         mm0, mm5                ; put the two together to get result

    paddsb      mm6, mm0                ; p0 += Filter2
    pxor        mm6, [GLOBAL(t80)]      ; unoffset

    movq        mm0, t0                 ; reload unfiltered p1
    movq        mm4, t1                 ; reload unfiltered q1

    ; mm0 = 70 60 50 40 30 20 10 00
    ; mm6 = 71 61 51 41 31 21 11 01
    ; mm3 = 72 62 52 42 32 22 12 02
    ; mm4 = 73 63 53 43 33 23 13 03
    ; transpose back to write out

    movq        mm1, mm0                ;
    punpcklbw   mm0, mm6                ; 31 30 21 20 11 10 01 00

    punpckhbw   mm1, mm6                ; 71 70 61 60 51 50 41 40
    movq        mm2, mm3                ;

    punpcklbw   mm2, mm4                ; 33 32 23 22 13 12 03 02
    movq        mm5, mm1                ; 71 70 61 60 51 50 41 40

    punpckhbw   mm3, mm4                ; 73 72 63 62 53 52 43 42
    movq        mm6, mm0                ; 31 30 21 20 11 10 01 00

    punpcklwd   mm0, mm2                ; 13 12 11 10 03 02 01 00
    punpckhwd   mm6, mm2                ; 33 32 31 30 23 22 21 20

    movd        [rsi+rax*4], mm0        ; write 03 02 01 00
    punpcklwd   mm1, mm3                ; 53 52 51 50 43 42 41 40

    psrlq       mm0, 32                 ; xx xx xx xx 13 12 11 10
    punpckhwd   mm5, mm3                ; 73 72 71 70 63 62 61 60

    movd        [rdi+rax*4], mm0        ; write 13 12 11 10
    movd        [rsi+rax*2], mm6        ; write 23 22 21 20

    psrlq       mm6, 32                 ; 33 32 31 30
    movd        [rsi], mm1              ; write 43 42 41 40

    movd        [rsi + rax], mm6        ; write 33 32 31 30
    neg         rax                     ; pitch positive again for the rows below

    movd        [rsi + rax*2], mm5      ; write 63 62 61 60
    psrlq       mm1, 32                 ; 53 52 51 50

    movd        [rdi], mm1              ; write out 53 52 51 50
    psrlq       mm5, 32                 ; 73 72 71 70

    movd        [rdi + rax*2], mm5      ; write 73 72 71 70

    lea         rsi, [rsi+rax*8]        ; next 8 rows

    dec         rcx
    jnz         nexts8_v

    add         rsp, 32                 ; release scratch
    pop         rsp                     ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; restored: control must not fall through
1736 ;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
1737 ; int y_stride,
1738 ; loop_filter_info *lfi)
1742 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1743 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
1744 ; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
SECTION_RODATA
; Byte constants for the filter arithmetic.
align 16
tfe:
    times 8 db 0xfe                     ; mask clearing each byte's lsb (for /2)
align 16
t80:
    times 8 db 0x80                     ; bias: toggles unsigned <-> signed bytes
align 16
t1s:
    times 8 db 0x01                     ; per-byte 1 (turns +4 rounding into +3)
align 16
t3:                                     ; restored label: referenced as GLOBAL(t3)
    times 8 db 0x03
align 16
t4:                                     ; restored label: referenced as GLOBAL(t4)
    times 8 db 0x04
align 16
ones:
    times 4 dw 0x0001
; Word constants for the mb-filter 27/18/9 taps: value in the high byte so
; pmulhw yields (Filter2 * k) for bytes unpacked into word high halves.
align 16
s27:
    times 4 dw 0x1b00
align 16
s18:
    times 4 dw 0x1200
align 16
s9:                                     ; restored label: referenced as GLOBAL(s9)
    times 4 dw 0x0900
align 16
s63:
    times 4 dw 0x003f                   ; rounding constant 63 before >>7