2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
14 #include "vpx_scale/yv12config.h"
17 #define VP8_FILTER_WEIGHT 128
18 #define VP8_FILTER_SHIFT 7
22 /* static constants */
24 const static short Blur
[48] =
27 16, 16, 16, 16, 16, 16, 16, 16,
28 16, 16, 16, 16, 16, 16, 16, 16,
29 64, 64, 64, 64, 64, 64, 64, 64,
30 16, 16, 16, 16, 16, 16, 16, 16,
31 16, 16, 16, 16, 16, 16, 16, 16,
32 0, 0, 0, 0, 0, 0, 0, 0,
35 #define RD __declspec(align(16)) __int64 rd = 0x0040004000400040;
36 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
44 /* external references */
45 extern double vp8_gaussian(double sigma
, double mu
, double x
);
46 extern short vp8_rv
[];
47 extern int vp8_q2mbl(int x
) ;
51 void vp8_post_proc_down_and_across_mmx
53 unsigned char *src_ptr
,
54 unsigned char *dst_ptr
,
55 int src_pixels_per_line
,
56 int dst_pixels_per_line
,
78 mov ecx
, DWORD PTR rows
79 mov eax
, src_pixels_per_line
;
88 clear out edx
for use as loop counter
94 movq mm6
, [ebx
+ 32 ] ;
103 mm3
*= kernel
2 modifiers
105 movq mm6
, [ebx
+ 48] ;
107 movq mm5
, [esi
+ eax
] ;
112 mm6
*= p0
..p3
* kernel
3 modifiers
121 mm7
= r0 p0
..p3
- r1 p0
..p3
123 mm5
= r1 p0
..p3
- r0 p0
..p3
125 mm7
= abs(r0 p0
..p3
- r1 p0
..p3
)
128 movq mm6
, [ebx
+ 64 ] ;
129 mm6
= kernel
4 modifiers
130 movq mm5
, [esi
+ 2*eax
] ;
135 mm5
*= kernel
4 modifiers
144 mm6
= r0 p0
..p3
- r2 p0
..p3
146 mm5
= r2 p0
..p3
- r2 p0
..p3
148 mm6
= abs(r0 p0
..p3
- r2 p0
..p3
)
151 accumulate thresholds
157 movq mm5
, [esi
+2*eax
] ;
162 mm5
*= kernel
0 modifiers
171 mm6
= p0
..p3
- r
-2 p0
..p3
173 mm5
= r
-2 p0
..p3
- p0
..p3
175 mm6
= abs(r0 p0
..p3
- r
-2 p0
..p3
)
178 accumulate thresholds
180 movq mm6
, [ebx
+ 16] ;
182 movq mm4
, [esi
+eax
] ;
187 mm4
*= kernel
1 modifiers
.
196 mm6
= p0
..p3
- r
-2 p0
..p3
198 mm5
= r
-1 p0
..p3
- p0
..p3
200 mm6
= abs(r0 p0
..p3
- r
-1 p0
..p3
)
203 accumulate thresholds
208 psraw mm3
, VP8_FILTER_SHIFT
;
212 mm1 select vals
> thresh from source
214 mm7 select vals
< thresh from blurred result
232 // done with the all cols, start the across filtering in place
244 movq mm6
, [ebx
+ 32 ] ;
245 movq mm4
, [edi
+edx
] ;
254 mm3
*= kernel
2 modifiers
264 mm6
*= p1
..p4
* kernel
3 modifiers
273 mm7
= p0
..p3
- p1
..p4
275 mm5
= p1
..p4
- p0
..p3
277 mm7
= abs(p0
..p3
- p1
..p4
)
280 movq mm6
, [ebx
+ 64 ]
288 mm5
*= kernel
4 modifiers
297 mm6
= p0
..p3
- p1
..p4
299 mm5
= p1
..p4
- p0
..p3
301 mm6
= abs(p0
..p3
- p1
..p4
)
304 accumulate thresholds
308 movq mm4
, [edi
+edx
-2] ;
315 mm5
*= kernel
0 modifiers
324 mm6
= p0
..p3
- p1
..p4
326 mm5
= p1
..p4
- p0
..p3
328 mm6
= abs(p0
..p3
- p1
..p4
)
331 accumulate thresholds
339 mm4
*= kernel
1 modifiers
.
348 mm6
= p0
..p3
- p1
..p4
350 mm5
= p1
..p4
- p0
..p3
352 mm6
= abs(p0
..p3
- p1
..p4
)
355 accumulate thresholds
359 psraw mm3
, VP8_FILTER_SHIFT
;
363 mm1 select vals
> thresh from source
365 mm7 select vals
< thresh from blurred result
371 mov DWORD PTR
[edi
+edx
-4], eax
;
372 store previous four bytes
379 mov DWORD PTR
[edi
+edx
-4], eax
382 // done with this rwo
385 mov eax
, dst_pixels_per_line
;
389 mov eax
, src_pixels_per_line
;
403 void vp8_post_proc_down_and_across_xmm
405 unsigned char *src_ptr
,
406 unsigned char *dst_ptr
,
407 int src_pixels_per_line
,
408 int dst_pixels_per_line
,
423 punpcklqdq xmm2
, xmm2
428 mov ecx
, DWORD PTR rows
429 mov eax
, src_pixels_per_line
;
438 clear out edx
for use as loop counter
440 movq xmm3
, QWORD PTR
[esi
] ;
443 punpcklbw xmm3
, xmm0
;
449 movq xmm5
, QWORD PTR
[esi
+ eax
] ;
451 punpcklbw xmm5
, xmm0
;
461 mm7
= r0 p0
..p3
- r1 p0
..p3
463 mm5
= r1 p0
..p3
- r0 p0
..p3
465 mm7
= abs(r0 p0
..p3
- r1 p0
..p3
)
468 movq xmm5
, QWORD PTR
[esi
+ 2*eax
] ;
470 punpcklbw xmm5
, xmm0
;
480 mm6
= r0 p0
..p3
- r2 p0
..p3
482 mm5
= r2 p0
..p3
- r2 p0
..p3
484 mm6
= abs(r0 p0
..p3
- r2 p0
..p3
)
487 accumulate thresholds
491 movq xmm5
, QWORD PTR
[esi
+2*eax
] ;
493 punpcklbw xmm5
, xmm0
;
503 mm6
= p0
..p3
- r
-2 p0
..p3
505 mm5
= r
-2 p0
..p3
- p0
..p3
507 mm6
= abs(r0 p0
..p3
- r
-2 p0
..p3
)
510 accumulate thresholds
512 movq xmm4
, QWORD PTR
[esi
+eax
] ;
514 punpcklbw xmm4
, xmm0
;
524 mm6
= p0
..p3
- r
-2 p0
..p3
526 mm5
= r
-1 p0
..p3
- p0
..p3
528 mm6
= abs(r0 p0
..p3
- r
-1 p0
..p3
)
531 accumulate thresholds
540 mm1 select vals
> thresh from source
542 mm7 select vals
< thresh from blurred result
546 packuswb xmm1
, xmm0
;
548 movq QWORD PTR
[edi
], xmm1
;
560 // done with the all cols, start the across filtering in place
565 movq mm0
, QWORD PTR
[edi
-8];
568 movq xmm7
, QWORD PTR
[edi
+edx
-2]
569 movd xmm4
, DWORD PTR
[edi
+edx
+6]
576 punpcklbw xmm3
, xmm0
;
585 punpcklbw xmm5
, xmm0
;
595 mm7
= p0
..p3
- p1
..p4
597 mm5
= p1
..p4
- p0
..p3
599 mm7
= abs(p0
..p3
- p1
..p4
)
604 punpcklbw xmm5
, xmm0
;
614 mm6
= p0
..p3
- p1
..p4
616 mm5
= p1
..p4
- p0
..p3
618 mm6
= abs(p0
..p3
- p1
..p4
)
621 accumulate thresholds
626 punpcklbw xmm5
, xmm0
;
636 mm6
= p0
..p3
- p1
..p4
638 mm5
= p1
..p4
- p0
..p3
640 mm6
= abs(p0
..p3
- p1
..p4
)
643 accumulate thresholds
647 punpcklbw xmm4
, xmm0
;
657 mm6
= p0
..p3
- p1
..p4
659 mm5
= p1
..p4
- p0
..p3
661 mm6
= abs(p0
..p3
- p1
..p4
)
664 accumulate thresholds
672 mm1 select vals
> thresh from source
674 mm7 select vals
< thresh from blurred result
678 packuswb xmm1
, xmm0
;
680 movq QWORD PTR
[edi
+edx
-8], mm0
;
681 store previous four bytes
689 movq QWORD PTR
[edi
+edx
-8], mm0
691 // done with this rwo
694 mov eax
, dst_pixels_per_line
;
698 mov eax
, src_pixels_per_line
;
709 void vp8_mbpost_proc_down_mmx(unsigned char *dst
, int pitch
, int rows
, int cols
, int flimit
)
712 __declspec(align(16))
714 __declspec(align(16))
715 unsigned char d
[16][8];
717 flimit
= vp8_q2mbl(flimit
);
719 for (i
= 0; i
< 2; i
++)
724 for (c
= 0; c
< cols
; c
+= 4)
726 unsigned char *s
= &dst
[c
];
734 neg eax
// eax = -pitch
736 lea esi
, [esi
+ eax
*8]; // edi = s[-pitch*8]
749 movd mm1
, DWORD PTR
[edi
];
766 //save the var and sum
769 movd mm1
, DWORD PTR
[esi
] // [s-pitch*8]
770 movd mm2
, DWORD PTR
[edi
] // [s+pitch*7]
829 movd mm1
, DWORD PTR
[esi
+eax
*8]
838 movq mm4
, vp8_rv
[ecx
*2]
841 //paddw xmm1, eight8s
851 movd DWORD PTR d
[ecx
*4], mm1
857 movd mm1
, DWORD PTR d
[ecx
*4]
873 void vp8_mbpost_proc_down_xmm(unsigned char *dst
, int pitch
, int rows
, int cols
, int flimit
)
876 __declspec(align(16))
878 __declspec(align(16))
879 unsigned char d
[16][8];
881 flimit
= vp8_q2mbl(flimit
);
883 for (i
= 0; i
< 4; i
++)
888 for (c
= 0; c
< cols
; c
+= 8)
890 unsigned char *s
= &dst
[c
];
898 neg eax
// eax = -pitch
900 lea esi
, [esi
+ eax
*8]; // edi = s[-pitch*8]
913 movq xmm1
, QWORD PTR
[edi
];
914 punpcklbw xmm1
, xmm0
;
920 punpcklwd xmm1
, xmm0
;
922 punpckhwd xmm2
, xmm0
;
930 //save the var and sum
933 movq xmm1
, QWORD PTR
[esi
] // [s-pitch*8]
934 movq xmm2
, QWORD PTR
[edi
] // [s+pitch*7]
993 movq xmm1
, QWORD PTR
[esi
+eax
*8]
1002 movdqu xmm4
, vp8_rv
[ecx
*2]
1005 //paddw xmm1, eight8s
1015 movq QWORD PTR d
[ecx
*8], xmm1
1037 /****************************************************************************
1039 * ROUTINE : plane_add_noise_wmt
1041 * INPUTS : unsigned char *Start starting address of buffer to add gaussian
1043 * unsigned int Width width of plane
1044 * unsigned int Height height of plane
1045 * int Pitch distance between subsequent lines of frame
1046 * int q quantizer used to determine amount of noise
1053 * FUNCTION : adds gaussian noise to a plane of pixels
1055 * SPECIAL NOTES : None.
1057 ****************************************************************************/
1058 void vp8_plane_add_noise_wmt(unsigned char *Start
, unsigned int Width
, unsigned int Height
, int Pitch
, int q
, int a
)
1062 __declspec(align(16)) unsigned char blackclamp
[16];
1063 __declspec(align(16)) unsigned char whiteclamp
[16];
1064 __declspec(align(16)) unsigned char bothclamp
[16];
1065 char char_dist
[300];
1070 sigma
= a
+ .5 + .6 * (63 - q
) / 63.0;
1072 // set up a lookup table of 256 entries that matches
1073 // a gaussian distribution with sigma determined by q.
1081 for (i
= -32; i
< 32; i
++)
1083 double g
= 256 * vp8_gaussian(sigma
, 0, 1.0 * i
);
1084 int a
= (int)(g
+ .5);
1088 for (j
= 0; j
< a
; j
++)
1090 char_dist
[next
+j
] = (char) i
;
1098 for (next
= next
; next
< 256; next
++)
1099 char_dist
[next
] = 0;
1103 for (i
= 0; i
< 2048; i
++)
1105 Rand
[i
] = char_dist
[rand() & 0xff];
1108 for (i
= 0; i
< 16; i
++)
1110 blackclamp
[i
] = -char_dist
[0];
1111 whiteclamp
[i
] = -char_dist
[0];
1112 bothclamp
[i
] = -2 * char_dist
[0];
1115 for (i
= 0; i
< Height
; i
++)
1117 unsigned char *Pos
= Start
+ i
* Pitch
;
1118 char *Ref
= Rand
+ (rand() & 0xff);
1128 movdqu xmm1
, [esi
+eax
] // get the source
1130 psubusb xmm1
, blackclamp
// clamp both sides so we don't outrange adding noise
1131 paddusb xmm1
, bothclamp
1132 psubusb xmm1
, whiteclamp
1134 movdqu xmm2
, [edi
+eax
] // get the noise for this line
1135 paddb xmm1
, xmm2
// add it in
1136 movdqu
[esi
+eax
], xmm1
// store the result
1138 add eax
, 16 // move to the next line
1149 __declspec(align(16))
1150 static const int four8s
[4] = { 8, 8, 8, 8};
1151 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src
, int pitch
, int rows
, int cols
, int flimit
)
1154 __declspec(align(16))
1156 unsigned char *s
= src
;
1161 flimit
= vp8_q2mbl(flimit
);
1165 flimit4
[3] = flimit
;
1167 for (r
= 0; r
< rows
; r
++)
1174 for (i
= -8; i
<= 6; i
++)
1176 sumsq
+= s
[i
] * s
[i
];
1199 movd xmm1
, DWORD PTR
[esi
+ecx
-8] // -8 -7 -6 -5
1200 movd xmm2
, DWORD PTR
[esi
+ecx
+7] // +7 +8 +9 +10
1202 punpcklbw xmm1
, xmm0
// expanding
1203 punpcklbw xmm2
, xmm0
// expanding
1205 punpcklwd xmm1
, xmm0
// expanding to dwords
1206 punpcklwd xmm2
, xmm0
// expanding to dwords
1208 psubd xmm2
, xmm1
// 7--8 8--7 9--6 10--5
1209 paddd xmm1
, xmm1
// -8*2 -7*2 -6*2 -5*2
1211 paddd xmm1
, xmm2
// 7+-8 8+-7 9+-6 10+-5
1212 pmaddwd xmm1
, xmm2
// squared of 7+-8 8+-7 9+-6 10+-5
1217 pshufd xmm6
, xmm6
, 0 // duplicate the last ones
1218 pshufd xmm7
, xmm7
, 0 // duplicate the last ones
1220 psrldq xmm1
, 4 // 8--7 9--6 10--5 0000
1221 psrldq xmm2
, 4 // 8--7 9--6 10--5 0000
1223 pshufd xmm3
, xmm1
, 3 // 0000 8--7 8--7 8--7 squared
1224 pshufd xmm4
, xmm2
, 3 // 0000 8--7 8--7 8--7 squared
1229 pshufd xmm3
, xmm1
, 01011111b
// 0000 0000 9--6 9--6 squared
1230 pshufd xmm4
, xmm2
, 01011111b
// 0000 0000 9--6 9--6 squared
1235 pshufd xmm3
, xmm1
, 10111111b
// 0000 0000 8--7 8--7 squared
1236 pshufd xmm4
, xmm2
, 10111111b
// 0000 0000 8--7 8--7 squared
1256 movd xmm1
, DWORD PTR
[esi
+ecx
]
1259 punpcklbw xmm1
, xmm0
1260 punpcklwd xmm1
, xmm0
1274 movd
[esi
+ecx
-8], mm0
1293 /****************************************************************************
1295 * ROUTINE : plane_add_noise_mmx
1297 * INPUTS : unsigned char *Start starting address of buffer to add gaussian
1299 * unsigned int Width width of plane
1300 * unsigned int Height height of plane
1301 * int Pitch distance between subsequent lines of frame
1302 * int q quantizer used to determine amount of noise
1309 * FUNCTION : adds gaussian noise to a plane of pixels
1311 * SPECIAL NOTES : None.
1313 ****************************************************************************/
1314 void vp8_plane_add_noise_mmx(unsigned char *Start
, unsigned int Width
, unsigned int Height
, int Pitch
, int q
, int a
)
1317 int Pitch4
= Pitch
* 4;
1318 const int noise_amount
= 2;
1319 const int noise_adder
= 2 * noise_amount
+ 1;
1321 __declspec(align(16)) unsigned char blackclamp
[16];
1322 __declspec(align(16)) unsigned char whiteclamp
[16];
1323 __declspec(align(16)) unsigned char bothclamp
[16];
1325 char char_dist
[300];
1330 sigma
= a
+ .5 + .6 * (63 - q
) / 63.0;
1332 // set up a lookup table of 256 entries that matches
1333 // a gaussian distribution with sigma determined by q.
1341 for (i
= -32; i
< 32; i
++)
1343 int a
= (int)(.5 + 256 * vp8_gaussian(sigma
, 0, i
));
1347 for (j
= 0; j
< a
; j
++)
1349 char_dist
[next
+j
] = (char) i
;
1357 for (next
= next
; next
< 256; next
++)
1358 char_dist
[next
] = 0;
1362 for (i
= 0; i
< 2048; i
++)
1364 Rand
[i
] = char_dist
[rand() & 0xff];
1367 for (i
= 0; i
< 16; i
++)
1369 blackclamp
[i
] = -char_dist
[0];
1370 whiteclamp
[i
] = -char_dist
[0];
1371 bothclamp
[i
] = -2 * char_dist
[0];
1374 for (i
= 0; i
< Height
; i
++)
1376 unsigned char *Pos
= Start
+ i
* Pitch
;
1377 char *Ref
= Rand
+ (rand() & 0xff);
1387 movq mm1
, [esi
+eax
] // get the source
1389 psubusb mm1
, blackclamp
// clamp both sides so we don't outrange adding noise
1390 paddusb mm1
, bothclamp
1391 psubusb mm1
, whiteclamp
1393 movq mm2
, [edi
+eax
] // get the noise for this line
1394 paddb mm1
, mm2
// add it in
1395 movq
[esi
+eax
], mm1
// store the result
1397 add eax
, 8 // move to the next line
1408 extern char an
[8][64][3072];
1409 extern int cd
[8][64];
1411 void vp8_plane_add_noise_mmx(unsigned char *Start
, unsigned int Width
, unsigned int Height
, int Pitch
, int q
, int a
)
1414 __declspec(align(16)) unsigned char blackclamp
[16];
1415 __declspec(align(16)) unsigned char whiteclamp
[16];
1416 __declspec(align(16)) unsigned char bothclamp
[16];
1421 for (i
= 0; i
< 16; i
++)
1423 blackclamp
[i
] = -cd
[a
][q
];
1424 whiteclamp
[i
] = -cd
[a
][q
];
1425 bothclamp
[i
] = -2 * cd
[a
][q
];
1428 for (i
= 0; i
< Height
; i
++)
1430 unsigned char *Pos
= Start
+ i
* Pitch
;
1431 char *Ref
= an
[a
][q
] + (rand() & 0xff);
1441 movq mm1
, [esi
+eax
] // get the source
1443 psubusb mm1
, blackclamp
// clamp both sides so we don't outrange adding noise
1444 paddusb mm1
, bothclamp
1445 psubusb mm1
, whiteclamp
1447 movq mm2
, [edi
+eax
] // get the noise for this line
1448 paddb mm1
, mm2
// add it in
1449 movq
[esi
+eax
], mm1
// store the result
1451 add eax
, 8 // move to the next line
1460 void vp8_plane_add_noise_wmt(unsigned char *Start
, unsigned int Width
, unsigned int Height
, int Pitch
, int q
, int a
)
1464 __declspec(align(16)) unsigned char blackclamp
[16];
1465 __declspec(align(16)) unsigned char whiteclamp
[16];
1466 __declspec(align(16)) unsigned char bothclamp
[16];
1470 for (i
= 0; i
< 16; i
++)
1472 blackclamp
[i
] = -cd
[a
][q
];
1473 whiteclamp
[i
] = -cd
[a
][q
];
1474 bothclamp
[i
] = -2 * cd
[a
][q
];
1477 for (i
= 0; i
< Height
; i
++)
1479 unsigned char *Pos
= Start
+ i
* Pitch
;
1480 char *Ref
= an
[a
][q
] + (rand() & 0xff);
1490 movdqu xmm1
, [esi
+eax
] // get the source
1492 psubusb xmm1
, blackclamp
// clamp both sides so we don't outrange adding noise
1493 paddusb xmm1
, bothclamp
1494 psubusb xmm1
, whiteclamp
1496 movdqu xmm2
, [edi
+eax
] // get the noise for this line
1497 paddb xmm1
, xmm2
// add it in
1498 movdqu
[esi
+eax
], xmm1
// store the result
1500 add eax
, 16 // move to the next line