1 // Copyright 2010 Google Inc. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
10 // YUV->RGB conversion function
12 // Author: Skal (pascal.massimino@gmail.com)
17 #if defined(WEBP_YUV_USE_TABLE)
// Clamps 'v' to the closed range [0, max_value]: yields 0 when v is negative,
// max_value when v is greater than max_value, and v itself otherwise.
// The result is returned as a uint8_t.
21 static WEBP_INLINE
uint8_t clip(int v
, int max_value
) {
22 return v
< 0 ? 0 : v
> max_value
? max_value
: v
;
// Conversion lookup tables; their contents are written by VP8YUVInit().
// VP8kVToR / VP8kUToB: 256 entries of 16 bits each, indexed by a byte value.
25 int16_t VP8kVToR
[256], VP8kUToB
[256];
// VP8kVToG / VP8kUToG: 256 entries of 32 bits each, indexed by a byte value.
26 int32_t VP8kVToG
[256], VP8kUToG
[256];
// Clamping tables with (YUV_RANGE_MAX - YUV_RANGE_MIN) byte entries; entry
// [i - YUV_RANGE_MIN] corresponds to the value i. VP8kClip holds results
// clamped to [0, 255].
27 uint8_t VP8kClip
[YUV_RANGE_MAX
- YUV_RANGE_MIN
];
// Same indexing as VP8kClip, but the stored results are clamped to [0, 15].
28 uint8_t VP8kClip4Bits
[YUV_RANGE_MAX
- YUV_RANGE_MIN
];
// Fills the conversion tables VP8kVToR, VP8kUToG, VP8kVToG, VP8kUToB,
// VP8kClip and VP8kClip4Bits.
// Two sets of loops with different coefficients appear below.
// NOTE(review): the construct selecting between the two sets is not visible
// here -- presumably a preprocessor conditional; confirm.
30 void VP8YUVInit(void) {
// First coefficient set. For every byte value i, each table entry is a
// fixed-point multiple of (i - 128).
36 for (i
= 0; i
< 256; ++i
) {
// V->R: rounded with YUV_HALF, then scaled down by YUV_FIX bits.
37 VP8kVToR
[i
] = (89858 * (i
- 128) + YUV_HALF
) >> YUV_FIX
;
// U->G: stored unshifted, with the YUV_HALF rounding term already added.
38 VP8kUToG
[i
] = -22014 * (i
- 128) + YUV_HALF
;
// V->G: stored unshifted, without a rounding term.
39 VP8kVToG
[i
] = -45773 * (i
- 128);
// U->B: rounded with YUV_HALF, then scaled down by YUV_FIX bits.
40 VP8kUToB
[i
] = (113618 * (i
- 128) + YUV_HALF
) >> YUV_FIX
;
// Clamping tables: for each i in [YUV_RANGE_MIN, YUV_RANGE_MAX), k is the
// scaled value ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX.
42 for (i
= YUV_RANGE_MIN
; i
< YUV_RANGE_MAX
; ++i
) {
43 const int k
= ((i
- 16) * 76283 + YUV_HALF
) >> YUV_FIX
;
// k clamped to [0, 255].
44 VP8kClip
[i
- YUV_RANGE_MIN
] = clip(k
, 255);
// k reduced to 4 bits: 8 is added before the shift by 4 so the result is
// rounded, then clamped to [0, 15].
45 VP8kClip4Bits
[i
- YUV_RANGE_MIN
] = clip((k
+ 8) >> 4, 15);
// Second coefficient set: same table layout and formulas as above, with
// different multipliers.
48 for (i
= 0; i
< 256; ++i
) {
49 VP8kVToR
[i
] = (91881 * (i
- 128) + YUV_HALF
) >> YUV_FIX
;
50 VP8kUToG
[i
] = -22554 * (i
- 128) + YUV_HALF
;
51 VP8kVToG
[i
] = -46802 * (i
- 128);
52 VP8kUToB
[i
] = (116130 * (i
- 128) + YUV_HALF
) >> YUV_FIX
;
// Clamping tables for the second set.
// NOTE(review): the definition of 'k' used below is not visible in this
// variant of the loop -- confirm how it is derived from i.
54 for (i
= YUV_RANGE_MIN
; i
< YUV_RANGE_MAX
; ++i
) {
56 VP8kClip
[i
- YUV_RANGE_MIN
] = clip(k
, 255);
57 VP8kClip4Bits
[i
- YUV_RANGE_MIN
] = clip((k
+ 8) >> 4, 15);
// No-op definition of VP8YUVInit(): the body is empty.
// NOTE(review): appears to be the alternative used when WEBP_YUV_USE_TABLE
// is not defined -- confirm against the enclosing #if / #else.
66 void VP8YUVInit(void) {}
68 #endif // WEBP_YUV_USE_TABLE
70 //-----------------------------------------------------------------------------
73 #if defined(WEBP_USE_SSE2)
75 #ifdef FANCY_UPSAMPLING
77 #include <emmintrin.h>
78 #include <string.h> // for memcpy
// Union type used for the SSE2 conversion tables. The code in this file
// accesses it through an 'i32' array member (four 32-bit lanes) and an 'm'
// member that is loaded as an __m128i.
80 typedef union { // handy struct for converting SSE2 registers
// Flag initialised to 0.
// NOTE(review): presumably records that VP8YUVInitSSE2() has already run --
// confirm where it is read and set.
86 static int done_sse2
= 0;
// Per-byte-value tables (256 entries each) holding the U, V and Y
// contributions; written by VP8YUVInitSSE2() and read by VP8GetRGBA32b().
87 static VP8kCstSSE2 VP8kUtoRGBA
[256], VP8kVtoRGBA
[256], VP8kYtoRGBA
[256];
// Fills VP8kYtoRGBA, VP8kUtoRGBA and VP8kVtoRGBA for every byte value i.
// Each entry has four 32-bit lanes, i32[0..3]. Going by the constant names
// used below (kVToR in lane 0, the *ToG constants in lane 1, kUToB in lane 2),
// the lane order is R, G, B, A.
89 void VP8YUVInitSSE2(void) {
92 for (i
= 0; i
< 256; ++i
) {
// Y contribution: lanes 0..2 all receive the same value,
// (i - 16) * kYScale + YUV_HALF2 (YUV_HALF2 is the additive rounding term).
93 VP8kYtoRGBA
[i
].i32
[0] =
94 VP8kYtoRGBA
[i
].i32
[1] =
95 VP8kYtoRGBA
[i
].i32
[2] = (i
- 16) * kYScale
+ YUV_HALF2
;
// Lane 3 is pre-shifted 0xff: the U and V tables contribute 0 to this lane,
// so after the right shift by YUV_FIX2 in VP8GetRGBA32b() it becomes 0xff.
96 VP8kYtoRGBA
[i
].i32
[3] = 0xff << YUV_FIX2
;
// U contribution: nothing for lane 0, -kUToG * (i - 128) for lane 1,
// kUToB * (i - 128) for lane 2, nothing for lane 3.
98 VP8kUtoRGBA
[i
].i32
[0] = 0;
99 VP8kUtoRGBA
[i
].i32
[1] = -kUToG
* (i
- 128);
100 VP8kUtoRGBA
[i
].i32
[2] = kUToB
* (i
- 128);
101 VP8kUtoRGBA
[i
].i32
[3] = 0;
// V contribution: kVToR * (i - 128) for lane 0, -kVToG * (i - 128) for
// lane 1, nothing for lanes 2 and 3.
103 VP8kVtoRGBA
[i
].i32
[0] = kVToR
* (i
- 128);
104 VP8kVtoRGBA
[i
].i32
[1] = -kVToG
* (i
- 128);
105 VP8kVtoRGBA
[i
].i32
[2] = 0;
106 VP8kVtoRGBA
[i
].i32
[3] = 0;
// Combines the table entries for one (y, u, v) triple into four 32-bit lanes.
// y, u and v are used directly as indices into the 256-entry tables.
// NOTE(review): the return statement is not visible here; presumably the
// shifted sum 'rgba2' is returned -- confirm.
112 static WEBP_INLINE __m128i
VP8GetRGBA32b(int y
, int u
, int v
) {
// Unaligned 128-bit loads of the three per-component contributions.
113 const __m128i u_part
= _mm_loadu_si128(&VP8kUtoRGBA
[u
].m
);
114 const __m128i v_part
= _mm_loadu_si128(&VP8kVtoRGBA
[v
].m
);
115 const __m128i y_part
= _mm_loadu_si128(&VP8kYtoRGBA
[y
].m
);
// Lane-wise 32-bit sum of the chroma parts, then of luma + chroma.
116 const __m128i uv_part
= _mm_add_epi32(u_part
, v_part
);
117 const __m128i rgba1
= _mm_add_epi32(y_part
, uv_part
);
// Arithmetic right shift of every lane by YUV_FIX2 bits (drops the
// fixed-point fraction while keeping the sign).
118 const __m128i rgba2
= _mm_srai_epi32(rgba1
, YUV_FIX2
);
// Converts one (y, u, v) sample and stores the packed bytes at 'rgb'.
// The store writes 8 bytes, so 'rgb' must have at least 8 writable bytes
// even though only the leading bytes belong to this pixel.
122 static WEBP_INLINE
void VP8YuvToRgbSSE2(uint8_t y
, uint8_t u
, uint8_t v
,
123 uint8_t* const rgb
) {
124 const __m128i tmp0
= VP8GetRGBA32b(y
, u
, v
);
// Narrow 32-bit lanes to 16-bit with signed saturation.
125 const __m128i tmp1
= _mm_packs_epi32(tmp0
, tmp0
);
// Narrow 16-bit lanes to bytes with unsigned saturation (clamps to 0..255).
126 const __m128i tmp2
= _mm_packus_epi16(tmp1
, tmp1
);
127 // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
128 _mm_storel_epi64((__m128i
*)rgb
, tmp2
);
// Converts one (y, u, v) sample and stores the packed bytes at 'bgr', with
// lanes 0 and 2 exchanged relative to VP8GetRGBA32b()'s output.
// The store writes 8 bytes, so 'bgr' must have at least 8 writable bytes.
131 static WEBP_INLINE
void VP8YuvToBgrSSE2(uint8_t y
, uint8_t u
, uint8_t v
,
132 uint8_t* const bgr
) {
133 const __m128i tmp0
= VP8GetRGBA32b(y
, u
, v
);
// _MM_SHUFFLE(3, 0, 1, 2): output lanes 0..3 take input lanes 2, 1, 0, 3,
// i.e. the first and third lanes are swapped.
134 const __m128i tmp1
= _mm_shuffle_epi32(tmp0
, _MM_SHUFFLE(3, 0, 1, 2));
// Narrow 32-bit lanes to 16-bit with signed saturation.
135 const __m128i tmp2
= _mm_packs_epi32(tmp1
, tmp1
);
// Narrow 16-bit lanes to bytes with unsigned saturation (clamps to 0..255).
136 const __m128i tmp3
= _mm_packus_epi16(tmp2
, tmp2
);
137 // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
138 _mm_storel_epi64((__m128i
*)bgr
, tmp3
);
// Converts 32 samples read from y[], u[] and v[] (indices 0..31), four per
// loop iteration, producing 4 bytes per sample.
// NOTE(review): the declaration of 'dst' and any advance of 'dst' between
// iterations are not visible here -- confirm.
141 void VP8YuvToRgba32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
144 for (n
= 0; n
< 32; n
+= 4) {
// Four 32-bit lanes for each of the samples n+0 .. n+3.
145 const __m128i tmp0_1
= VP8GetRGBA32b(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0]);
146 const __m128i tmp0_2
= VP8GetRGBA32b(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1]);
147 const __m128i tmp0_3
= VP8GetRGBA32b(y
[n
+ 2], u
[n
+ 2], v
[n
+ 2]);
148 const __m128i tmp0_4
= VP8GetRGBA32b(y
[n
+ 3], u
[n
+ 3], v
[n
+ 3]);
// Narrow to 16-bit (signed saturation): samples 0,1 and samples 2,3.
149 const __m128i tmp1_1
= _mm_packs_epi32(tmp0_1
, tmp0_2
);
150 const __m128i tmp1_2
= _mm_packs_epi32(tmp0_3
, tmp0_4
);
// Narrow to bytes (unsigned saturation): 16 bytes holding the four lanes of
// sample n+0, then n+1, n+2 and n+3.
151 const __m128i tmp2
= _mm_packus_epi16(tmp1_1
, tmp1_2
);
// Unaligned 16-byte store.
152 _mm_storeu_si128((__m128i
*)dst
, tmp2
);
// Converts 32 samples read from y[], u[] and v[] (indices 0..31), two per
// loop iteration, producing 4 bytes per sample with lanes 0 and 2 swapped
// relative to VP8GetRGBA32b()'s output.
// NOTE(review): the declaration of 'dst' and any advance of 'dst' between
// iterations are not visible here -- confirm.
157 void VP8YuvToBgra32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
160 for (n
= 0; n
< 32; n
+= 2) {
161 const __m128i tmp0_1
= VP8GetRGBA32b(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0]);
162 const __m128i tmp0_2
= VP8GetRGBA32b(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1]);
// _MM_SHUFFLE(3, 0, 1, 2): output lanes 0..3 take input lanes 2, 1, 0, 3.
163 const __m128i tmp1_1
= _mm_shuffle_epi32(tmp0_1
, _MM_SHUFFLE(3, 0, 1, 2));
164 const __m128i tmp1_2
= _mm_shuffle_epi32(tmp0_2
, _MM_SHUFFLE(3, 0, 1, 2));
// Narrow both samples to 16-bit with signed saturation.
165 const __m128i tmp2_1
= _mm_packs_epi32(tmp1_1
, tmp1_2
);
// Narrow to bytes with unsigned saturation; the low 8 bytes hold the two
// samples (the upper 8 bytes repeat them).
166 const __m128i tmp3
= _mm_packus_epi16(tmp2_1
, tmp2_1
);
// Store only the low 8 bytes.
167 _mm_storel_epi64((__m128i
*)dst
, tmp3
);
// Converts 32 samples read from y[], u[] and v[] (indices 0..31) into
// 3 bytes per sample at 'dst'. Every VP8YuvToRgbSSE2() call stores 8 bytes,
// so consecutive calls overwrite the 5 extra bytes left by the previous one.
172 void VP8YuvToRgb32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
// Scratch for the last two samples: the store at tmp + 3 touches bytes
// 3..10, i.e. 2 * 3 + 5 bytes; the extra 15 bytes leave room for alignment.
175 uint8_t tmp0
[2 * 3 + 5 + 15];
// tmp0 + 15 rounded down to a multiple of 16: a 16-byte-aligned address
// inside tmp0.
176 uint8_t* const tmp
= (uint8_t*)((uintptr_t)(tmp0
+ 15) & ~15); // align
// First 30 samples: written straight to dst at a 3-byte stride.
177 for (n
= 0; n
< 30; ++n
) { // we directly stomp the *dst memory
178 VP8YuvToRgbSSE2(y
[n
], u
[n
], v
[n
], dst
+ n
* 3);
180 // Last two pixels are special: we write in a tmp buffer before sending
// Here n == 30. Only 2 * 3 bytes are copied out of the scratch buffer.
// NOTE(review): presumably this keeps the 8-byte stores from running past
// the end of dst -- confirm dst is sized for 32 * 3 bytes.
182 VP8YuvToRgbSSE2(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0], tmp
+ 0);
183 VP8YuvToRgbSSE2(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1], tmp
+ 3);
184 memcpy(dst
+ n
* 3, tmp
, 2 * 3);
// Converts 32 samples read from y[], u[] and v[] (indices 0..31) into
// 3 bytes per sample at 'dst', using VP8YuvToBgrSSE2() for each sample.
// Every such call stores 8 bytes, so consecutive calls overwrite the 5 extra
// bytes left by the previous one.
187 void VP8YuvToBgr32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
// Scratch for the last two samples: the store at tmp + 3 touches bytes
// 3..10, i.e. 2 * 3 + 5 bytes; the extra 15 bytes leave room for alignment.
190 uint8_t tmp0
[2 * 3 + 5 + 15];
// tmp0 + 15 rounded down to a multiple of 16: a 16-byte-aligned address
// inside tmp0.
191 uint8_t* const tmp
= (uint8_t*)((uintptr_t)(tmp0
+ 15) & ~15); // align
// First 30 samples: written straight to dst at a 3-byte stride.
192 for (n
= 0; n
< 30; ++n
) {
193 VP8YuvToBgrSSE2(y
[n
], u
[n
], v
[n
], dst
+ n
* 3);
// Here n == 30: the last two samples go through the scratch buffer and only
// 2 * 3 bytes are copied to dst.
// NOTE(review): presumably this keeps the 8-byte stores from running past
// the end of dst -- confirm dst is sized for 32 * 3 bytes.
195 VP8YuvToBgrSSE2(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0], tmp
+ 0);
196 VP8YuvToBgrSSE2(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1], tmp
+ 3);
197 memcpy(dst
+ n
* 3, tmp
, 2 * 3);
// No-op definition of VP8YUVInitSSE2(): the body is empty.
// NOTE(review): appears to be the alternative used when FANCY_UPSAMPLING is
// not defined -- confirm against the enclosing #ifdef / #else.
202 void VP8YUVInitSSE2(void) {}
204 #endif // FANCY_UPSAMPLING
206 #endif // WEBP_USE_SSE2