1 // Copyright 2014 Google Inc. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
10 // YUV->RGB conversion functions
12 // Author: Skal (pascal.massimino@gmail.com)
16 #if defined(WEBP_USE_SSE2)
18 #include <emmintrin.h>
19 #include <string.h> // for memcpy
21 typedef union { // handy struct for converting SSE2 registers
27 #if defined(WEBP_YUV_USE_SSE2_TABLES)
29 #include "./yuv_tables_sse2.h"
// Initializer for the build where WEBP_YUV_USE_SSE2_TABLES is defined: the
// body is empty, so nothing is computed at run time.
// NOTE(review): the lookup tables are presumably supplied ready-made by
// "./yuv_tables_sse2.h" in this configuration -- confirm.
31 void VP8YUVInitSSE2(void) {}
// File-local state for the tables that VP8YUVInitSSE2() fills at run time:
// three arrays of 256 VP8kCstSSE2 entries (one entry per 8-bit sample value)
// for the U, V and Y contributions.
// NOTE(review): no read or write of 'done_sse2' is visible in this excerpt;
// it looks like an "already initialized" flag -- confirm.
35 static int done_sse2
= 0;
36 static VP8kCstSSE2 VP8kUtoRGBA
[256], VP8kVtoRGBA
[256], VP8kYtoRGBA
[256];
// Fills the three 256-entry lookup tables used by the SSE2 converters.
// For every i in [0, 255] the four 32-bit lanes (i32[0..3]) are set to:
//   VP8kYtoRGBA[i] = { Y, Y, Y, 0xff << YUV_FIX2 }
//                    with Y = (i - 16) * kYScale + YUV_HALF2
//   VP8kUtoRGBA[i] = { 0, -kUToG * (i - 128), kUToB * (i - 128), 0 }
//   VP8kVtoRGBA[i] = { kVToR * (i - 128), -kVToG * (i - 128), 0, 0 }
// The converters in this file add one entry from each table lane by lane and
// shift the sum right by YUV_FIX2, so lane 3 always ends up as 0xff.
// NOTE(review): the declaration of 'i', any guard around the loop and the
// closing braces are not visible in this excerpt.
38 void VP8YUVInitSSE2(void) {
41 for (i
= 0; i
< 256; ++i
) {
// Luma term: one chained assignment writes the same value to lanes 2, 1, 0.
42 VP8kYtoRGBA
[i
].i32
[0] =
43 VP8kYtoRGBA
[i
].i32
[1] =
44 VP8kYtoRGBA
[i
].i32
[2] = (i
- 16) * kYScale
+ YUV_HALF2
;
45 VP8kYtoRGBA
[i
].i32
[3] = 0xff << YUV_FIX2
;
// U term: contributes only to lanes 1 and 2.
47 VP8kUtoRGBA
[i
].i32
[0] = 0;
48 VP8kUtoRGBA
[i
].i32
[1] = -kUToG
* (i
- 128);
49 VP8kUtoRGBA
[i
].i32
[2] = kUToB
* (i
- 128);
50 VP8kUtoRGBA
[i
].i32
[3] = 0;
// V term: contributes only to lanes 0 and 1.
52 VP8kVtoRGBA
[i
].i32
[0] = kVToR
* (i
- 128);
53 VP8kVtoRGBA
[i
].i32
[1] = -kVToG
* (i
- 128);
54 VP8kVtoRGBA
[i
].i32
[2] = 0;
55 VP8kVtoRGBA
[i
].i32
[3] = 0;
// Disabled code: prints the three tables as C array initializers.
59 #if 0 // code used to generate 'yuv_tables_sse2.h'
60 printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n");
61 for (i
= 0; i
< 256; ++i
) {
62 printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n",
63 VP8kYtoRGBA
[i
].i32
[0], VP8kYtoRGBA
[i
].i32
[1],
64 VP8kYtoRGBA
[i
].i32
[2], VP8kYtoRGBA
[i
].i32
[3]);
67 printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n");
68 for (i
= 0; i
< 256; ++i
) {
69 printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n",
70 VP8kUtoRGBA
[i
].i32
[1], VP8kUtoRGBA
[i
].i32
[2]);
73 printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n");
74 for (i
= 0; i
< 256; ++i
) {
75 printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n",
76 VP8kVtoRGBA
[i
].i32
[0], VP8kVtoRGBA
[i
].i32
[1]);
83 #endif // WEBP_YUV_USE_SSE2_TABLES
85 //-----------------------------------------------------------------------------
// Loads the table entries VP8kUtoRGBA[u] and VP8kVtoRGBA[v] with unaligned
// 128-bit loads and adds them as four 32-bit lanes, giving the combined
// chroma contribution for one (u, v) pair.
// 'u' and 'v' index 256-entry tables.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably 'uv_part' is returned -- confirm.
87 static WEBP_INLINE __m128i
LoadUVPart(int u
, int v
) {
88 const __m128i u_part
= _mm_loadu_si128(&VP8kUtoRGBA
[u
].m
);
89 const __m128i v_part
= _mm_loadu_si128(&VP8kVtoRGBA
[v
].m
);
90 const __m128i uv_part
= _mm_add_epi32(u_part
, v_part
);
// Adds the luma entry VP8kYtoRGBA[y] to an already-computed chroma term
// 'uv_part' (four 32-bit lanes), then arithmetic-shifts every lane right by
// YUV_FIX2. Taking 'uv_part' as an argument lets a caller reuse one chroma
// term for several luma samples.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably 'rgba2' is returned -- confirm.
94 static WEBP_INLINE __m128i
GetRGBA32bWithUV(int y
, const __m128i uv_part
) {
95 const __m128i y_part
= _mm_loadu_si128(&VP8kYtoRGBA
[y
].m
);
96 const __m128i rgba1
= _mm_add_epi32(y_part
, uv_part
);
97 const __m128i rgba2
= _mm_srai_epi32(rgba1
, YUV_FIX2
);
// Convenience wrapper for a single sample: builds the chroma term with
// LoadUVPart(u, v) and returns GetRGBA32bWithUV(y, uv_part).
101 static WEBP_INLINE __m128i
GetRGBA32b(int y
, int u
, int v
) {
102 const __m128i uv_part
= LoadUVPart(u
, v
);
103 return GetRGBA32bWithUV(y
, uv_part
);
// Converts one (y, u, v) sample and stores the result at 'rgb'.
// The four 32-bit lanes from GetRGBA32b() are narrowed to 16 bits with signed
// saturation, then to 8 bits with unsigned saturation (which clamps each
// channel to [0, 255]); the low 64 bits of the result are then stored.
// The store is 8 bytes wide although only the first 3 are the pixel: the
// caller must ensure the 5 extra bytes are writable and may be overwritten.
106 static WEBP_INLINE
void YuvToRgbSSE2(uint8_t y
, uint8_t u
, uint8_t v
,
107 uint8_t* const rgb
) {
108 const __m128i tmp0
= GetRGBA32b(y
, u
, v
);
109 const __m128i tmp1
= _mm_packs_epi32(tmp0
, tmp0
);
110 const __m128i tmp2
= _mm_packus_epi16(tmp1
, tmp1
);
111 // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
112 _mm_storel_epi64((__m128i
*)rgb
, tmp2
);
// Same as the RGB single-pixel converter but with lanes 0 and 2 exchanged
// before packing: _MM_SHUFFLE(3, 0, 1, 2) selects source lanes {2, 1, 0, 3}
// for destination lanes {0, 1, 2, 3}.
// The result is saturated to bytes and the low 64 bits are stored at 'bgr',
// i.e. 8 bytes are written although only the first 3 are the pixel; the
// caller must ensure the 5 extra bytes are writable and may be overwritten.
115 static WEBP_INLINE
void YuvToBgrSSE2(uint8_t y
, uint8_t u
, uint8_t v
,
116 uint8_t* const bgr
) {
117 const __m128i tmp0
= GetRGBA32b(y
, u
, v
);
118 const __m128i tmp1
= _mm_shuffle_epi32(tmp0
, _MM_SHUFFLE(3, 0, 1, 2));
119 const __m128i tmp2
= _mm_packs_epi32(tmp1
, tmp1
);
120 const __m128i tmp3
= _mm_packus_epi16(tmp2
, tmp2
);
121 // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
122 _mm_storel_epi64((__m128i
*)bgr
, tmp3
);
125 //-----------------------------------------------------------------------------
126 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
128 #ifdef FANCY_UPSAMPLING
// Converts a span of 32 pixels to 4 bytes per pixel, four pixels per
// iteration. Pixel n uses its own y[n], u[n] and v[n] (one chroma sample per
// pixel). The four 4-lane results are saturated down to 16 bytes, kept in
// table lane order, and written to 'dst' with one unaligned 128-bit store.
// NOTE(review): the end of the parameter list, the declaration of 'n' and
// any advance of 'dst' between iterations are not visible in this excerpt.
130 void VP8YuvToRgba32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
133 for (n
= 0; n
< 32; n
+= 4) {
134 const __m128i tmp0_1
= GetRGBA32b(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0]);
135 const __m128i tmp0_2
= GetRGBA32b(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1]);
136 const __m128i tmp0_3
= GetRGBA32b(y
[n
+ 2], u
[n
+ 2], v
[n
+ 2]);
137 const __m128i tmp0_4
= GetRGBA32b(y
[n
+ 3], u
[n
+ 3], v
[n
+ 3]);
138 const __m128i tmp1_1
= _mm_packs_epi32(tmp0_1
, tmp0_2
);
139 const __m128i tmp1_2
= _mm_packs_epi32(tmp0_3
, tmp0_4
);
140 const __m128i tmp2
= _mm_packus_epi16(tmp1_1
, tmp1_2
);
141 _mm_storeu_si128((__m128i
*)dst
, tmp2
);
// Converts a span of 32 pixels to 4 bytes per pixel with lanes 0 and 2
// exchanged (_MM_SHUFFLE(3, 0, 1, 2)), two pixels per iteration. Pixel n uses
// its own y[n], u[n] and v[n]. The two results are saturated to bytes and
// the low 64 bits (the two pixels) are stored at 'dst'.
// NOTE(review): the end of the parameter list, the declaration of 'n' and
// any advance of 'dst' between iterations are not visible in this excerpt.
146 void VP8YuvToBgra32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
149 for (n
= 0; n
< 32; n
+= 2) {
150 const __m128i tmp0_1
= GetRGBA32b(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0]);
151 const __m128i tmp0_2
= GetRGBA32b(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1]);
152 const __m128i tmp1_1
= _mm_shuffle_epi32(tmp0_1
, _MM_SHUFFLE(3, 0, 1, 2));
153 const __m128i tmp1_2
= _mm_shuffle_epi32(tmp0_2
, _MM_SHUFFLE(3, 0, 1, 2));
154 const __m128i tmp2_1
= _mm_packs_epi32(tmp1_1
, tmp1_2
);
155 const __m128i tmp3
= _mm_packus_epi16(tmp2_1
, tmp2_1
);
156 _mm_storel_epi64((__m128i
*)dst
, tmp3
);
// Converts a span of 32 pixels to 3 bytes per pixel.
// Pixels 0..29 are written straight to dst + n * 3 with the 8-byte store of
// YuvToRgbSSE2(); the 5 surplus bytes of each store land on following pixels
// and are overwritten by later writes (the store for n = 29 covers
// dst[87..94], still inside the 96-byte output).
// The last two pixels go through 'tmp' instead, and exactly 2 * 3 bytes are
// copied to dst + 90, so nothing is written past the end of the span.
// 'tmp0' holds 2 * 3 + 5 bytes of payload (the second store spans
// tmp[3..10]) plus 15 bytes of slack for rounding 'tmp' up to a multiple of
// 16.
// NOTE(review): the end of the parameter list and the declaration of 'n'
// are not visible in this excerpt.
161 void VP8YuvToRgb32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
164 uint8_t tmp0
[2 * 3 + 5 + 15];
165 uint8_t* const tmp
= (uint8_t*)((uintptr_t)(tmp0
+ 15) & ~15); // align
166 for (n
= 0; n
< 30; ++n
) { // we directly stomp the *dst memory
167 YuvToRgbSSE2(y
[n
], u
[n
], v
[n
], dst
+ n
* 3);
169 // Last two pixels are special: we write in a tmp buffer before sending
171 YuvToRgbSSE2(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0], tmp
+ 0);
172 YuvToRgbSSE2(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1], tmp
+ 3);
173 memcpy(dst
+ n
* 3, tmp
, 2 * 3);
// Converts a span of 32 pixels to 3 bytes per pixel using YuvToBgrSSE2().
// Pixels 0..29 are written straight to dst + n * 3 with 8-byte stores; the
// surplus bytes of each store are overwritten by later writes. The last two
// pixels are converted into the aligned scratch buffer 'tmp' (at offsets 0
// and 3) and exactly 2 * 3 bytes are copied to dst + n * 3, so nothing is
// written past the end of the 96-byte output.
// NOTE(review): the end of the parameter list and the declaration of 'n'
// are not visible in this excerpt.
176 void VP8YuvToBgr32(const uint8_t* y
, const uint8_t* u
, const uint8_t* v
,
179 uint8_t tmp0
[2 * 3 + 5 + 15];
180 uint8_t* const tmp
= (uint8_t*)((uintptr_t)(tmp0
+ 15) & ~15); // align
181 for (n
= 0; n
< 30; ++n
) {
182 YuvToBgrSSE2(y
[n
], u
[n
], v
[n
], dst
+ n
* 3);
184 YuvToBgrSSE2(y
[n
+ 0], u
[n
+ 0], v
[n
+ 0], tmp
+ 0);
185 YuvToBgrSSE2(y
[n
+ 1], u
[n
+ 1], v
[n
+ 1], tmp
+ 3);
186 memcpy(dst
+ n
* 3, tmp
, 2 * 3);
189 #endif // FANCY_UPSAMPLING
191 //-----------------------------------------------------------------------------
192 // Arbitrary-length row conversion functions
// Row converter for 'len' pixels, 4 output bytes per pixel.
// The vector loop handles four pixels per iteration while n + 4 <= len.
// Chroma is shared between horizontal neighbours: y[0] and y[1] use
// (u[0], v[0]), y[2] and y[3] use (u[1], v[1]). The four results are
// saturated to 16 bytes and written with one unaligned 128-bit store.
// The scalar VP8YuvToRgba() call is used for pixels the vector loop did not
// cover.
// NOTE(review): the declaration of 'n', the advances of y/u/v/dst and the
// structure of the tail loop are not visible in this excerpt.
194 static void YuvToRgbaRowSSE2(const uint8_t* y
,
195 const uint8_t* u
, const uint8_t* v
,
196 uint8_t* dst
, int len
) {
198 for (n
= 0; n
+ 4 <= len
; n
+= 4) {
199 const __m128i uv_0
= LoadUVPart(u
[0], v
[0]);
200 const __m128i uv_1
= LoadUVPart(u
[1], v
[1]);
201 const __m128i tmp0_1
= GetRGBA32bWithUV(y
[0], uv_0
);
202 const __m128i tmp0_2
= GetRGBA32bWithUV(y
[1], uv_0
);
203 const __m128i tmp0_3
= GetRGBA32bWithUV(y
[2], uv_1
);
204 const __m128i tmp0_4
= GetRGBA32bWithUV(y
[3], uv_1
);
205 const __m128i tmp1_1
= _mm_packs_epi32(tmp0_1
, tmp0_2
);
206 const __m128i tmp1_2
= _mm_packs_epi32(tmp0_3
, tmp0_4
);
207 const __m128i tmp2
= _mm_packus_epi16(tmp1_1
, tmp1_2
);
208 _mm_storeu_si128((__m128i
*)dst
, tmp2
);
216 VP8YuvToRgba(y
[0], u
[0], v
[0], dst
);
// Row converter for 'len' pixels, 4 output bytes per pixel, with lanes 0 and
// 2 exchanged (_MM_SHUFFLE(3, 0, 1, 2)).
// The vector loop handles two pixels per iteration while n + 2 <= len; both
// pixels share the chroma pair (u[0], v[0]). The results are saturated to
// bytes and the low 64 bits (the two pixels) are stored at 'dst'.
// The scalar VP8YuvToBgra() call is used for a pixel the vector loop did not
// cover.
// NOTE(review): the declaration of 'n', the advances of y/u/v/dst and the
// condition guarding the scalar call are not visible in this excerpt.
225 static void YuvToBgraRowSSE2(const uint8_t* y
,
226 const uint8_t* u
, const uint8_t* v
,
227 uint8_t* dst
, int len
) {
229 for (n
= 0; n
+ 2 <= len
; n
+= 2) {
230 const __m128i uv_0
= LoadUVPart(u
[0], v
[0]);
231 const __m128i tmp0_1
= GetRGBA32bWithUV(y
[0], uv_0
);
232 const __m128i tmp0_2
= GetRGBA32bWithUV(y
[1], uv_0
);
233 const __m128i tmp1_1
= _mm_shuffle_epi32(tmp0_1
, _MM_SHUFFLE(3, 0, 1, 2));
234 const __m128i tmp1_2
= _mm_shuffle_epi32(tmp0_2
, _MM_SHUFFLE(3, 0, 1, 2));
235 const __m128i tmp2_1
= _mm_packs_epi32(tmp1_1
, tmp1_2
);
236 const __m128i tmp3
= _mm_packus_epi16(tmp2_1
, tmp2_1
);
237 _mm_storel_epi64((__m128i
*)dst
, tmp3
);
245 VP8YuvToBgra(y
[0], u
[0], v
[0], dst
);
// Row converter for 'len' pixels, 4 output bytes per pixel, with the lanes
// rotated so that lane 3 (the constant 0xff) comes first:
// _MM_SHUFFLE(2, 1, 0, 3) selects source lanes {3, 0, 1, 2} for destination
// lanes {0, 1, 2, 3}.
// The vector loop handles two pixels per iteration while n + 2 <= len; both
// pixels share the chroma pair (u[0], v[0]). The results are saturated to
// bytes and the low 64 bits (the two pixels) are stored at 'dst'.
// The scalar VP8YuvToArgb() call is used for a pixel the vector loop did not
// cover.
// NOTE(review): the declaration of 'n', the advances of y/u/v/dst and the
// condition guarding the scalar call are not visible in this excerpt.
249 static void YuvToArgbRowSSE2(const uint8_t* y
,
250 const uint8_t* u
, const uint8_t* v
,
251 uint8_t* dst
, int len
) {
253 for (n
= 0; n
+ 2 <= len
; n
+= 2) {
254 const __m128i uv_0
= LoadUVPart(u
[0], v
[0]);
255 const __m128i tmp0_1
= GetRGBA32bWithUV(y
[0], uv_0
);
256 const __m128i tmp0_2
= GetRGBA32bWithUV(y
[1], uv_0
);
257 const __m128i tmp1_1
= _mm_shuffle_epi32(tmp0_1
, _MM_SHUFFLE(2, 1, 0, 3));
258 const __m128i tmp1_2
= _mm_shuffle_epi32(tmp0_2
, _MM_SHUFFLE(2, 1, 0, 3));
259 const __m128i tmp2_1
= _mm_packs_epi32(tmp1_1
, tmp1_2
);
260 const __m128i tmp3
= _mm_packus_epi16(tmp2_1
, tmp2_1
);
261 _mm_storel_epi64((__m128i
*)dst
, tmp3
);
269 VP8YuvToArgb(y
[0], u
[0], v
[0], dst
);
// Row converter for 'len' pixels, 3 output bytes per pixel.
// The loop converts one pixel per iteration with YuvToRgbSSE2(), whose store
// is 8 bytes wide; it runs only while n + 2 < len, i.e. it stops before the
// last two pixels so the over-wide store has room inside the row.
// The remaining pixels are written with the scalar VP8YuvToRgb() at 'dst'
// and 'dst + 3'; the second one picks its chroma sample with (n & 1).
// NOTE(review): the declaration of 'n', the advances of y/u/v/dst inside the
// loop and any conditions guarding the two scalar calls are not visible in
// this excerpt.
273 static void YuvToRgbRowSSE2(const uint8_t* y
,
274 const uint8_t* u
, const uint8_t* v
,
275 uint8_t* dst
, int len
) {
277 for (n
= 0; n
+ 2 < len
; ++n
) { // we directly stomp the *dst memory
278 YuvToRgbSSE2(y
[0], u
[0], v
[0], dst
); // stomps 8 bytes
284 VP8YuvToRgb(y
[0], u
[0], v
[0], dst
);
286 VP8YuvToRgb(y
[1], u
[n
& 1], v
[n
& 1], dst
+ 3);
// Row converter for 'len' pixels, 3 output bytes per pixel, using the BGR
// single-pixel converter.
// The loop converts one pixel per iteration with YuvToBgrSSE2(), whose store
// is 8 bytes wide; it runs only while n + 2 < len, i.e. it stops before the
// last two pixels so the over-wide store has room inside the row.
// The remaining pixels are written with the scalar VP8YuvToBgr() at 'dst'
// and 'dst + 3'; the second one picks its chroma sample with (n & 1).
// NOTE(review): the declaration of 'n', the advances of y/u/v/dst inside the
// loop and any conditions guarding the two scalar calls are not visible in
// this excerpt.
290 static void YuvToBgrRowSSE2(const uint8_t* y
,
291 const uint8_t* u
, const uint8_t* v
,
292 uint8_t* dst
, int len
) {
294 for (n
= 0; n
+ 2 < len
; ++n
) { // we directly stomp the *dst memory
295 YuvToBgrSSE2(y
[0], u
[0], v
[0], dst
); // stomps 8 bytes
301 VP8YuvToBgr(y
[0], u
[0], v
[0], dst
+ 0);
303 VP8YuvToBgr(y
[1], u
[n
& 1], v
[n
& 1], dst
+ 3);
307 #endif // WEBP_USE_SSE2
309 //------------------------------------------------------------------------------
// Installs the SSE2 row converters from this file into the WebPSamplers
// table, one entry per output mode (MODE_RGB, MODE_RGBA, MODE_BGR, MODE_BGRA,
// MODE_ARGB). Entries for other modes are not touched here.
// When WEBP_USE_SSE2 is not defined, none of the assignments are compiled.
// The 'extern' line is the prototype for the definition that follows it.
312 extern void WebPInitSamplersSSE2(void);
314 void WebPInitSamplersSSE2(void) {
315 #if defined(WEBP_USE_SSE2)
316 WebPSamplers
[MODE_RGB
] = YuvToRgbRowSSE2
;
317 WebPSamplers
[MODE_RGBA
] = YuvToRgbaRowSSE2
;
318 WebPSamplers
[MODE_BGR
] = YuvToBgrRowSSE2
;
319 WebPSamplers
[MODE_BGRA
] = YuvToBgraRowSSE2
;
320 WebPSamplers
[MODE_ARGB
] = YuvToArgbRowSSE2
;
321 #endif // WEBP_USE_SSE2