// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical decoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <string.h>  // memcpy(), memset()

#include "../dec/vp8i.h"
//------------------------------------------------------------------------------
// Run-time lookup tables (~4k). Every table is indexed with a positive bias so
// that negative arguments map to valid offsets; DspInitTables() fills them.

static uint8_t abs0[255 + 255 + 1];     // abs(i)
static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
static int8_t sclip1[1020 + 1020 + 1];  // clips [-1020, 1020] to [-128, 127]
static int8_t sclip2[112 + 112 + 1];    // clips [-112, 112] to [-16, 15]
static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]

// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe).
// NOTE(review): 'volatile' is not a synchronization primitive in ISO C; if
// concurrent first-time initialization matters, this deserves a real
// once-guard or atomic flag.
static volatile int tables_ok = 0;
30 static void DspInitTables(void) {
33 for (i
= -255; i
<= 255; ++i
) {
34 abs0
[255 + i
] = (i
< 0) ? -i
: i
;
35 abs1
[255 + i
] = abs0
[255 + i
] >> 1;
37 for (i
= -1020; i
<= 1020; ++i
) {
38 sclip1
[1020 + i
] = (i
< -128) ? -128 : (i
> 127) ? 127 : i
;
40 for (i
= -112; i
<= 112; ++i
) {
41 sclip2
[112 + i
] = (i
< -16) ? -16 : (i
> 15) ? 15 : i
;
43 for (i
= -255; i
<= 255 + 255; ++i
) {
44 clip1
[255 + i
] = (i
< 0) ? 0 : (i
> 255) ? 255 : i
;
50 static WEBP_INLINE
uint8_t clip_8b(int v
) {
51 return (!(v
& ~0xff)) ? v
: (v
< 0) ? 0 : 255;
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Adds the residual 'v' (scaled down by 8) to the pixel at column 'x', row 'y'
// of 'dst' and clips the sum to 8 bits. 'dst' must be in the caller's scope.
// The macro arguments are parenthesized so that expressions can be passed.
#define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))

// Stores the four pixels of row 'y' as DC+d, DC+c, DC-c, DC-d.
#define STORE2(y, dc, d, c) do {    \
  const int DC = (dc);              \
  STORE(0, y, DC + (d));            \
  STORE(1, y, DC + (c));            \
  STORE(2, y, DC - (c));            \
  STORE(3, y, DC - (d));            \
} while (0)

// Fixed-point multipliers with 16 fractional bits (see MUL below):
// kC1 / 65536 ~= 1.3066 and kC2 / 65536 ~= 0.5412.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
72 static void TransformOne(const int16_t* in
, uint8_t* dst
) {
76 for (i
= 0; i
< 4; ++i
) { // vertical pass
77 const int a
= in
[0] + in
[8]; // [-4096, 4094]
78 const int b
= in
[0] - in
[8]; // [-4095, 4095]
79 const int c
= MUL(in
[4], kC2
) - MUL(in
[12], kC1
); // [-3783, 3783]
80 const int d
= MUL(in
[4], kC1
) + MUL(in
[12], kC2
); // [-3785, 3781]
81 tmp
[0] = a
+ d
; // [-7881, 7875]
82 tmp
[1] = b
+ c
; // [-7878, 7878]
83 tmp
[2] = b
- c
; // [-7878, 7878]
84 tmp
[3] = a
- d
; // [-7877, 7879]
88 // Each pass is expanding the dynamic range by ~3.85 (upper bound).
89 // The exact value is (2. + (kC1 + kC2) / 65536).
90 // After the second pass, maximum interval is [-3794, 3794], assuming
91 // an input in [-2048, 2047] interval. We then need to add a dst value
92 // in the [0, 255] range.
93 // In the worst case scenario, the input to clip_8b() can be as large as
96 for (i
= 0; i
< 4; ++i
) { // horizontal pass
97 const int dc
= tmp
[0] + 4;
98 const int a
= dc
+ tmp
[8];
99 const int b
= dc
- tmp
[8];
100 const int c
= MUL(tmp
[4], kC2
) - MUL(tmp
[12], kC1
);
101 const int d
= MUL(tmp
[4], kC1
) + MUL(tmp
[12], kC2
);
111 // Simplified transform when only in[0], in[1] and in[4] are non-zero
112 static void TransformAC3(const int16_t* in
, uint8_t* dst
) {
113 const int a
= in
[0] + 4;
114 const int c4
= MUL(in
[4], kC2
);
115 const int d4
= MUL(in
[4], kC1
);
116 const int c1
= MUL(in
[1], kC2
);
117 const int d1
= MUL(in
[1], kC1
);
118 STORE2(0, a
+ d4
, d1
, c1
);
119 STORE2(1, a
+ c4
, d1
, c1
);
120 STORE2(2, a
- c4
, d1
, c1
);
121 STORE2(3, a
- d4
, d1
, c1
);
// Transforms the block at 'in' into 'dst' and, when 'do_two' is non-zero, also
// the next block (in + 16) into the 4 pixels to the right (dst + 4).
// NOTE(review): the 'if (do_two)' test was missing from the damaged listing;
// it is restored because 'do_two' is otherwise unused -- confirm upstream.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}
133 static void TransformUV(const int16_t* in
, uint8_t* dst
) {
134 VP8Transform(in
+ 0 * 16, dst
, 1);
135 VP8Transform(in
+ 2 * 16, dst
+ 4 * BPS
, 1);
// DC-only transform: adds the same (rounded) in[0] contribution to all 16
// pixels of the 4x4 block at 'dst'.
// NOTE(review): the loop body and the declarations of 'i'/'j' were missing
// from the damaged listing; STORE(i, j, DC) is restored from context --
// confirm against the upstream file.
static void TransformDC(const int16_t *in, uint8_t* dst) {
  const int DC = in[0] + 4;  // +4: rounding for the '>> 3' in STORE
  int i, j;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) {
      STORE(i, j, DC);
    }
  }
}
148 static void TransformDCUV(const int16_t* in
, uint8_t* dst
) {
149 if (in
[0 * 16]) TransformDC(in
+ 0 * 16, dst
);
150 if (in
[1 * 16]) TransformDC(in
+ 1 * 16, dst
+ 4);
151 if (in
[2 * 16]) TransformDC(in
+ 2 * 16, dst
+ 4 * BPS
);
152 if (in
[3 * 16]) TransformDC(in
+ 3 * 16, dst
+ 4 * BPS
+ 4);
//------------------------------------------------------------------------------
// Inverse Walsh-Hadamard transform

// Transforms the 16 values at 'in' and scatters the 16 results into 'out' at
// a stride of 16 elements (out[0], out[16], ..., out[240]).
// NOTE(review): 'int tmp[16]', 'int i' and the 'out += 64' step were missing
// from the damaged listing; they are restored from context (without the step
// every iteration would overwrite the same four outputs) -- confirm upstream.
static void TransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {
    const int a0 = in[0 + i] + in[12 + i];
    const int a1 = in[4 + i] + in[ 8 + i];
    const int a2 = in[4 + i] - in[ 8 + i];
    const int a3 = in[0 + i] - in[12 + i];
    tmp[0  + i] = a0 + a1;
    tmp[8  + i] = a0 - a1;
    tmp[4  + i] = a3 + a2;
    tmp[12 + i] = a3 - a2;
  }
  for (i = 0; i < 4; ++i) {
    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
    const int a0 = dc             + tmp[3 + i * 4];
    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    const int a3 = dc             - tmp[3 + i * 4];
    out[ 0] = (a0 + a1) >> 3;
    out[16] = (a3 + a2) >> 3;
    out[32] = (a0 - a1) >> 3;
    out[48] = (a3 - a2) >> 3;
    out += 64;
  }
}

// Entry point used by callers; bound to the C implementation above.
void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
//------------------------------------------------------------------------------
// Intra predictions

// Pixel at column 'x', row 'y' of the block at 'dst' (row stride BPS).
// 'dst' must be in the caller's scope.
#define DST(x, y) dst[(x) + (y) * BPS]
194 static WEBP_INLINE
void TrueMotion(uint8_t *dst
, int size
) {
195 const uint8_t* top
= dst
- BPS
;
196 const uint8_t* const clip0
= clip1
+ 255 - top
[-1];
198 for (y
= 0; y
< size
; ++y
) {
199 const uint8_t* const clip
= clip0
+ dst
[-1];
201 for (x
= 0; x
< size
; ++x
) {
202 dst
[x
] = clip
[top
[x
]];
// TrueMotion() wrappers for the 4x4, 8x8 and 16x16 block sizes.
static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
211 //------------------------------------------------------------------------------
214 static void VE16(uint8_t *dst
) { // vertical
216 for (j
= 0; j
< 16; ++j
) {
217 memcpy(dst
+ j
* BPS
, dst
- BPS
, 16);
221 static void HE16(uint8_t *dst
) { // horizontal
223 for (j
= 16; j
> 0; --j
) {
224 memset(dst
, dst
[-1], 16);
229 static WEBP_INLINE
void Put16(int v
, uint8_t* dst
) {
231 for (j
= 0; j
< 16; ++j
) {
232 memset(dst
+ j
* BPS
, v
, 16);
236 static void DC16(uint8_t *dst
) { // DC
239 for (j
= 0; j
< 16; ++j
) {
240 DC
+= dst
[-1 + j
* BPS
] + dst
[j
- BPS
];
245 static void DC16NoTop(uint8_t *dst
) { // DC with top samples not available
248 for (j
= 0; j
< 16; ++j
) {
249 DC
+= dst
[-1 + j
* BPS
];
254 static void DC16NoLeft(uint8_t *dst
) { // DC with left samples not available
257 for (i
= 0; i
< 16; ++i
) {
263 static void DC16NoTopLeft(uint8_t *dst
) { // DC with no top and left samples
//------------------------------------------------------------------------------
// 4x4 predictions

// Rounded weighted average (a + 2 * b + c) / 4.
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
// Rounded average (a + b) / 2.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
273 static void VE4(uint8_t *dst
) { // vertical
274 const uint8_t* top
= dst
- BPS
;
275 const uint8_t vals
[4] = {
276 AVG3(top
[-1], top
[0], top
[1]),
277 AVG3(top
[ 0], top
[1], top
[2]),
278 AVG3(top
[ 1], top
[2], top
[3]),
279 AVG3(top
[ 2], top
[3], top
[4])
282 for (i
= 0; i
< 4; ++i
) {
283 memcpy(dst
+ i
* BPS
, vals
, sizeof(vals
));
287 static void HE4(uint8_t *dst
) { // horizontal
288 const int A
= dst
[-1 - BPS
];
289 const int B
= dst
[-1];
290 const int C
= dst
[-1 + BPS
];
291 const int D
= dst
[-1 + 2 * BPS
];
292 const int E
= dst
[-1 + 3 * BPS
];
293 *(uint32_t*)(dst
+ 0 * BPS
) = 0x01010101U
* AVG3(A
, B
, C
);
294 *(uint32_t*)(dst
+ 1 * BPS
) = 0x01010101U
* AVG3(B
, C
, D
);
295 *(uint32_t*)(dst
+ 2 * BPS
) = 0x01010101U
* AVG3(C
, D
, E
);
296 *(uint32_t*)(dst
+ 3 * BPS
) = 0x01010101U
* AVG3(D
, E
, E
);
299 static void DC4(uint8_t *dst
) { // DC
302 for (i
= 0; i
< 4; ++i
) dc
+= dst
[i
- BPS
] + dst
[-1 + i
* BPS
];
304 for (i
= 0; i
< 4; ++i
) memset(dst
+ i
* BPS
, dc
, 4);
307 static void RD4(uint8_t *dst
) { // Down-right
308 const int I
= dst
[-1 + 0 * BPS
];
309 const int J
= dst
[-1 + 1 * BPS
];
310 const int K
= dst
[-1 + 2 * BPS
];
311 const int L
= dst
[-1 + 3 * BPS
];
312 const int X
= dst
[-1 - BPS
];
313 const int A
= dst
[0 - BPS
];
314 const int B
= dst
[1 - BPS
];
315 const int C
= dst
[2 - BPS
];
316 const int D
= dst
[3 - BPS
];
317 DST(0, 3) = AVG3(J
, K
, L
);
318 DST(0, 2) = DST(1, 3) = AVG3(I
, J
, K
);
319 DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X
, I
, J
);
320 DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A
, X
, I
);
321 DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B
, A
, X
);
322 DST(2, 0) = DST(3, 1) = AVG3(C
, B
, A
);
323 DST(3, 0) = AVG3(D
, C
, B
);
326 static void LD4(uint8_t *dst
) { // Down-Left
327 const int A
= dst
[0 - BPS
];
328 const int B
= dst
[1 - BPS
];
329 const int C
= dst
[2 - BPS
];
330 const int D
= dst
[3 - BPS
];
331 const int E
= dst
[4 - BPS
];
332 const int F
= dst
[5 - BPS
];
333 const int G
= dst
[6 - BPS
];
334 const int H
= dst
[7 - BPS
];
335 DST(0, 0) = AVG3(A
, B
, C
);
336 DST(1, 0) = DST(0, 1) = AVG3(B
, C
, D
);
337 DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C
, D
, E
);
338 DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D
, E
, F
);
339 DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E
, F
, G
);
340 DST(3, 2) = DST(2, 3) = AVG3(F
, G
, H
);
341 DST(3, 3) = AVG3(G
, H
, H
);
344 static void VR4(uint8_t *dst
) { // Vertical-Right
345 const int I
= dst
[-1 + 0 * BPS
];
346 const int J
= dst
[-1 + 1 * BPS
];
347 const int K
= dst
[-1 + 2 * BPS
];
348 const int X
= dst
[-1 - BPS
];
349 const int A
= dst
[0 - BPS
];
350 const int B
= dst
[1 - BPS
];
351 const int C
= dst
[2 - BPS
];
352 const int D
= dst
[3 - BPS
];
353 DST(0, 0) = DST(1, 2) = AVG2(X
, A
);
354 DST(1, 0) = DST(2, 2) = AVG2(A
, B
);
355 DST(2, 0) = DST(3, 2) = AVG2(B
, C
);
356 DST(3, 0) = AVG2(C
, D
);
358 DST(0, 3) = AVG3(K
, J
, I
);
359 DST(0, 2) = AVG3(J
, I
, X
);
360 DST(0, 1) = DST(1, 3) = AVG3(I
, X
, A
);
361 DST(1, 1) = DST(2, 3) = AVG3(X
, A
, B
);
362 DST(2, 1) = DST(3, 3) = AVG3(A
, B
, C
);
363 DST(3, 1) = AVG3(B
, C
, D
);
366 static void VL4(uint8_t *dst
) { // Vertical-Left
367 const int A
= dst
[0 - BPS
];
368 const int B
= dst
[1 - BPS
];
369 const int C
= dst
[2 - BPS
];
370 const int D
= dst
[3 - BPS
];
371 const int E
= dst
[4 - BPS
];
372 const int F
= dst
[5 - BPS
];
373 const int G
= dst
[6 - BPS
];
374 const int H
= dst
[7 - BPS
];
375 DST(0, 0) = AVG2(A
, B
);
376 DST(1, 0) = DST(0, 2) = AVG2(B
, C
);
377 DST(2, 0) = DST(1, 2) = AVG2(C
, D
);
378 DST(3, 0) = DST(2, 2) = AVG2(D
, E
);
380 DST(0, 1) = AVG3(A
, B
, C
);
381 DST(1, 1) = DST(0, 3) = AVG3(B
, C
, D
);
382 DST(2, 1) = DST(1, 3) = AVG3(C
, D
, E
);
383 DST(3, 1) = DST(2, 3) = AVG3(D
, E
, F
);
384 DST(3, 2) = AVG3(E
, F
, G
);
385 DST(3, 3) = AVG3(F
, G
, H
);
388 static void HU4(uint8_t *dst
) { // Horizontal-Up
389 const int I
= dst
[-1 + 0 * BPS
];
390 const int J
= dst
[-1 + 1 * BPS
];
391 const int K
= dst
[-1 + 2 * BPS
];
392 const int L
= dst
[-1 + 3 * BPS
];
393 DST(0, 0) = AVG2(I
, J
);
394 DST(2, 0) = DST(0, 1) = AVG2(J
, K
);
395 DST(2, 1) = DST(0, 2) = AVG2(K
, L
);
396 DST(1, 0) = AVG3(I
, J
, K
);
397 DST(3, 0) = DST(1, 1) = AVG3(J
, K
, L
);
398 DST(3, 1) = DST(1, 2) = AVG3(K
, L
, L
);
399 DST(3, 2) = DST(2, 2) =
400 DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L
;
403 static void HD4(uint8_t *dst
) { // Horizontal-Down
404 const int I
= dst
[-1 + 0 * BPS
];
405 const int J
= dst
[-1 + 1 * BPS
];
406 const int K
= dst
[-1 + 2 * BPS
];
407 const int L
= dst
[-1 + 3 * BPS
];
408 const int X
= dst
[-1 - BPS
];
409 const int A
= dst
[0 - BPS
];
410 const int B
= dst
[1 - BPS
];
411 const int C
= dst
[2 - BPS
];
413 DST(0, 0) = DST(2, 1) = AVG2(I
, X
);
414 DST(0, 1) = DST(2, 2) = AVG2(J
, I
);
415 DST(0, 2) = DST(2, 3) = AVG2(K
, J
);
416 DST(0, 3) = AVG2(L
, K
);
418 DST(3, 0) = AVG3(A
, B
, C
);
419 DST(2, 0) = AVG3(X
, A
, B
);
420 DST(1, 0) = DST(3, 1) = AVG3(I
, X
, A
);
421 DST(1, 1) = DST(3, 2) = AVG3(J
, I
, X
);
422 DST(1, 2) = DST(3, 3) = AVG3(K
, J
, I
);
423 DST(1, 3) = AVG3(L
, K
, J
);
430 //------------------------------------------------------------------------------
433 static void VE8uv(uint8_t *dst
) { // vertical
435 for (j
= 0; j
< 8; ++j
) {
436 memcpy(dst
+ j
* BPS
, dst
- BPS
, 8);
440 static void HE8uv(uint8_t *dst
) { // horizontal
442 for (j
= 0; j
< 8; ++j
) {
443 memset(dst
, dst
[-1], 8);
448 // helper for chroma-DC predictions
449 static WEBP_INLINE
void Put8x8uv(uint8_t value
, uint8_t* dst
) {
451 #ifndef WEBP_REFERENCE_IMPLEMENTATION
452 const uint64_t v
= (uint64_t)value
* 0x0101010101010101ULL
;
453 for (j
= 0; j
< 8; ++j
) {
454 *(uint64_t*)(dst
+ j
* BPS
) = v
;
457 for (j
= 0; j
< 8; ++j
) memset(dst
+ j
* BPS
, value
, 8);
461 static void DC8uv(uint8_t *dst
) { // DC
464 for (i
= 0; i
< 8; ++i
) {
465 dc0
+= dst
[i
- BPS
] + dst
[-1 + i
* BPS
];
467 Put8x8uv(dc0
>> 4, dst
);
470 static void DC8uvNoLeft(uint8_t *dst
) { // DC with no left samples
473 for (i
= 0; i
< 8; ++i
) {
476 Put8x8uv(dc0
>> 3, dst
);
479 static void DC8uvNoTop(uint8_t *dst
) { // DC with no top samples
482 for (i
= 0; i
< 8; ++i
) {
483 dc0
+= dst
[-1 + i
* BPS
];
485 Put8x8uv(dc0
>> 3, dst
);
488 static void DC8uvNoTopLeft(uint8_t *dst
) { // DC with nothing
492 //------------------------------------------------------------------------------
493 // default C implementations
495 const VP8PredFunc VP8PredLuma4
[NUM_BMODES
] = {
496 DC4
, TM4
, VE4
, HE4
, RD4
, VR4
, LD4
, VL4
, HD4
, HU4
499 const VP8PredFunc VP8PredLuma16
[NUM_B_DC_MODES
] = {
500 DC16
, TM16
, VE16
, HE16
,
501 DC16NoTop
, DC16NoLeft
, DC16NoTopLeft
504 const VP8PredFunc VP8PredChroma8
[NUM_B_DC_MODES
] = {
505 DC8uv
, TM8uv
, VE8uv
, HE8uv
,
506 DC8uvNoTop
, DC8uvNoLeft
, DC8uvNoTopLeft
509 //------------------------------------------------------------------------------
510 // Edge filtering functions
512 // 4 pixels in, 2 pixels out
513 static WEBP_INLINE
void do_filter2(uint8_t* p
, int step
) {
514 const int p1
= p
[-2*step
], p0
= p
[-step
], q0
= p
[0], q1
= p
[step
];
515 const int a
= 3 * (q0
- p0
) + sclip1
[1020 + p1
- q1
];
516 const int a1
= sclip2
[112 + ((a
+ 4) >> 3)];
517 const int a2
= sclip2
[112 + ((a
+ 3) >> 3)];
518 p
[-step
] = clip1
[255 + p0
+ a2
];
519 p
[ 0] = clip1
[255 + q0
- a1
];
522 // 4 pixels in, 4 pixels out
523 static WEBP_INLINE
void do_filter4(uint8_t* p
, int step
) {
524 const int p1
= p
[-2*step
], p0
= p
[-step
], q0
= p
[0], q1
= p
[step
];
525 const int a
= 3 * (q0
- p0
);
526 const int a1
= sclip2
[112 + ((a
+ 4) >> 3)];
527 const int a2
= sclip2
[112 + ((a
+ 3) >> 3)];
528 const int a3
= (a1
+ 1) >> 1;
529 p
[-2*step
] = clip1
[255 + p1
+ a3
];
530 p
[- step
] = clip1
[255 + p0
+ a2
];
531 p
[ 0] = clip1
[255 + q0
- a1
];
532 p
[ step
] = clip1
[255 + q1
- a3
];
535 // 6 pixels in, 6 pixels out
536 static WEBP_INLINE
void do_filter6(uint8_t* p
, int step
) {
537 const int p2
= p
[-3*step
], p1
= p
[-2*step
], p0
= p
[-step
];
538 const int q0
= p
[0], q1
= p
[step
], q2
= p
[2*step
];
539 const int a
= sclip1
[1020 + 3 * (q0
- p0
) + sclip1
[1020 + p1
- q1
]];
540 const int a1
= (27 * a
+ 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
541 const int a2
= (18 * a
+ 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
542 const int a3
= (9 * a
+ 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
543 p
[-3*step
] = clip1
[255 + p2
+ a3
];
544 p
[-2*step
] = clip1
[255 + p1
+ a2
];
545 p
[- step
] = clip1
[255 + p0
+ a1
];
546 p
[ 0] = clip1
[255 + q0
- a1
];
547 p
[ step
] = clip1
[255 + q1
- a2
];
548 p
[ 2*step
] = clip1
[255 + q2
- a3
];
551 static WEBP_INLINE
int hev(const uint8_t* p
, int step
, int thresh
) {
552 const int p1
= p
[-2*step
], p0
= p
[-step
], q0
= p
[0], q1
= p
[step
];
553 return (abs0
[255 + p1
- p0
] > thresh
) || (abs0
[255 + q1
- q0
] > thresh
);
556 static WEBP_INLINE
int needs_filter(const uint8_t* p
, int step
, int thresh
) {
557 const int p1
= p
[-2*step
], p0
= p
[-step
], q0
= p
[0], q1
= p
[step
];
558 return (2 * abs0
[255 + p0
- q0
] + abs1
[255 + p1
- q1
]) <= thresh
;
561 static WEBP_INLINE
int needs_filter2(const uint8_t* p
,
562 int step
, int t
, int it
) {
563 const int p3
= p
[-4*step
], p2
= p
[-3*step
], p1
= p
[-2*step
], p0
= p
[-step
];
564 const int q0
= p
[0], q1
= p
[step
], q2
= p
[2*step
], q3
= p
[3*step
];
565 if ((2 * abs0
[255 + p0
- q0
] + abs1
[255 + p1
- q1
]) > t
)
567 return abs0
[255 + p3
- p2
] <= it
&& abs0
[255 + p2
- p1
] <= it
&&
568 abs0
[255 + p1
- p0
] <= it
&& abs0
[255 + q3
- q2
] <= it
&&
569 abs0
[255 + q2
- q1
] <= it
&& abs0
[255 + q1
- q0
] <= it
;
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

// Filters the horizontal edge located just above row 'p': for each of the 16
// columns, pixels are taken 'stride' apart (vertically) across the edge.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  for (i = 0; i < 16; ++i) {
    if (needs_filter(p + i, stride, thresh)) {
      do_filter2(p + i, stride);
    }
  }
}
// Filters the vertical edge located just left of column 'p': for each of the
// 16 rows, pixels are taken 1 apart (horizontally) across the edge.
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  int i;
  for (i = 0; i < 16; ++i) {
    if (needs_filter(p + i * stride, 1, thresh)) {
      do_filter2(p + i * stride, 1);
    }
  }
}
// Simple filtering of the three inner edges of a 16x16 block.
// NOTE(review): 'int k' and the pointer advance inside each loop were missing
// from the damaged listing; a 4-pixel step per inner edge is restored (three
// edges at offsets 4, 8 and 12) -- confirm against the upstream file.

static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16(p, stride, thresh);
  }
}

static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16(p, stride, thresh);
  }
}
609 //------------------------------------------------------------------------------
610 // Complex In-loop filtering (Paragraph 15.3)
612 static WEBP_INLINE
void FilterLoop26(uint8_t* p
,
613 int hstride
, int vstride
, int size
,
614 int thresh
, int ithresh
, int hev_thresh
) {
616 if (needs_filter2(p
, hstride
, thresh
, ithresh
)) {
617 if (hev(p
, hstride
, hev_thresh
)) {
618 do_filter2(p
, hstride
);
620 do_filter6(p
, hstride
);
627 static WEBP_INLINE
void FilterLoop24(uint8_t* p
,
628 int hstride
, int vstride
, int size
,
629 int thresh
, int ithresh
, int hev_thresh
) {
631 if (needs_filter2(p
, hstride
, thresh
, ithresh
)) {
632 if (hev(p
, hstride
, hev_thresh
)) {
633 do_filter2(p
, hstride
);
635 do_filter4(p
, hstride
);
// on macroblock edges

// Filters 16 positions of a horizontal edge (pixels 'stride' apart across it).
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}

// Filters 16 positions of a vertical edge (pixels 1 apart across it).
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// on three inner edges
// NOTE(review): 'int k' and the pointer advance inside each loop were missing
// from the damaged listing; a 4-pixel step per inner edge is restored --
// confirm against the upstream file.

static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
}

static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
// 8-pixels wide variant, for chroma filtering.
// Each function applies the same filter to the 'u' and 'v' planes.

// Horizontal macroblock edge.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}

// Vertical macroblock edge.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}

// Inner horizontal edge, four rows down.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}

// Inner vertical edge, four columns to the right.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
697 //------------------------------------------------------------------------------
699 VP8DecIdct2 VP8Transform
;
700 VP8DecIdct VP8TransformAC3
;
701 VP8DecIdct VP8TransformUV
;
702 VP8DecIdct VP8TransformDC
;
703 VP8DecIdct VP8TransformDCUV
;
705 VP8LumaFilterFunc VP8VFilter16
;
706 VP8LumaFilterFunc VP8HFilter16
;
707 VP8ChromaFilterFunc VP8VFilter8
;
708 VP8ChromaFilterFunc VP8HFilter8
;
709 VP8LumaFilterFunc VP8VFilter16i
;
710 VP8LumaFilterFunc VP8HFilter16i
;
711 VP8ChromaFilterFunc VP8VFilter8i
;
712 VP8ChromaFilterFunc VP8HFilter8i
;
713 VP8SimpleFilterFunc VP8SimpleVFilter16
;
714 VP8SimpleFilterFunc VP8SimpleHFilter16
;
715 VP8SimpleFilterFunc VP8SimpleVFilter16i
;
716 VP8SimpleFilterFunc VP8SimpleHFilter16i
;
718 extern void VP8DspInitSSE2(void);
719 extern void VP8DspInitNEON(void);
721 void VP8DspInit(void) {
724 VP8Transform
= TransformTwo
;
725 VP8TransformUV
= TransformUV
;
726 VP8TransformDC
= TransformDC
;
727 VP8TransformDCUV
= TransformDCUV
;
728 VP8TransformAC3
= TransformAC3
;
730 VP8VFilter16
= VFilter16
;
731 VP8HFilter16
= HFilter16
;
732 VP8VFilter8
= VFilter8
;
733 VP8HFilter8
= HFilter8
;
734 VP8VFilter16i
= VFilter16i
;
735 VP8HFilter16i
= HFilter16i
;
736 VP8VFilter8i
= VFilter8i
;
737 VP8HFilter8i
= HFilter8i
;
738 VP8SimpleVFilter16
= SimpleVFilter16
;
739 VP8SimpleHFilter16
= SimpleHFilter16
;
740 VP8SimpleVFilter16i
= SimpleVFilter16i
;
741 VP8SimpleHFilter16i
= SimpleHFilter16i
;
743 // If defined, use CPUInfo() to overwrite some pointers with faster versions.
745 #if defined(WEBP_USE_SSE2)
746 if (VP8GetCPUInfo(kSSE2
)) {
749 #elif defined(WEBP_USE_NEON)
750 if (VP8GetCPUInfo(kNEON
)) {