1 // Copyright 2014 Google Inc. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
10 // NEON variant of methods for lossless decoder
12 // Author: Skal (pascal.massimino@gmail.com)
16 #if defined(WEBP_USE_NEON)
20 #include "./lossless.h"
23 //------------------------------------------------------------------------------
24 // Colorspace conversion functions
26 #if !defined(WORK_AROUND_GCC)
27 // gcc 4.6.0 (NDK-r9) had some trouble with this code, so it is only
28 // enabled for gcc 4.8.x and later.
// Converts 'num_pixels' 32-bit pixels from 'src' into bytes at 'dst', swapping
// the first and third byte of every pixel (B <-> R).
// The vector loop consumes 16 pixels per iteration ('end' is 'src' advanced by
// num_pixels rounded down to a multiple of 16); vld4q_u8 de-interleaves the 64
// loaded bytes into four 16-byte planes, so the swap is a plane exchange.
// The remaining (num_pixels & 15) pixels go to VP8LConvertBGRAToRGBA_C.
// NOTE(review): the end of the loop body (presumably the second half of the
// swap, the store to 'dst' and the 'dst' advance) is not visible in this
// listing -- confirm against the full file.
// NOTE(review): the (uint8_t*) cast drops the const qualifier of 'src'.
29 static void ConvertBGRAToRGBA(const uint32_t* src
,
30 int num_pixels
, uint8_t* dst
) {
31 const uint32_t* const end
= src
+ (num_pixels
& ~15);
32 for (; src
< end
; src
+= 16) {
33 uint8x16x4_t pixel
= vld4q_u8((uint8_t*)src
);
34 // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
35 const uint8x16_t tmp
= pixel
.val
[0];
36 pixel
.val
[0] = pixel
.val
[2];
// Tail: fewer than 16 pixels remain; 'src' already points past the vector part.
41 VP8LConvertBGRAToRGBA_C(src
, num_pixels
& 15, dst
); // left-overs
// Converts 'num_pixels' 32-bit pixels from 'src' into 3-byte pixels, keeping
// bytes 0, 1 and 2 of each pixel in their original order (the fourth byte is
// dropped).
// The vector loop handles 16 pixels per iteration: vld4q_u8 splits them into
// four byte planes and the first three planes are gathered into 'tmp'.
// The remaining (num_pixels & 15) pixels go to VP8LConvertBGRAToBGR_C.
// NOTE(review): the store of 'tmp' and the 'dst' advance are not visible in
// this listing -- confirm against the full file.
// NOTE(review): the (uint8_t*) cast drops the const qualifier of 'src'.
44 static void ConvertBGRAToBGR(const uint32_t* src
,
45 int num_pixels
, uint8_t* dst
) {
46 const uint32_t* const end
= src
+ (num_pixels
& ~15);
47 for (; src
< end
; src
+= 16) {
48 const uint8x16x4_t pixel
= vld4q_u8((uint8_t*)src
);
// Planes 0..2 in source order; plane 3 is left out.
49 const uint8x16x3_t tmp
= { { pixel
.val
[0], pixel
.val
[1], pixel
.val
[2] } };
// Tail: fewer than 16 pixels remain.
53 VP8LConvertBGRAToBGR_C(src
, num_pixels
& 15, dst
); // left-overs
// Converts 'num_pixels' 32-bit pixels from 'src' into 3-byte pixels, emitting
// bytes 2, 1, 0 of each pixel (reversed channel order, fourth byte dropped).
// The vector loop handles 16 pixels per iteration: vld4q_u8 splits them into
// four byte planes and planes 2, 1, 0 are gathered into 'tmp'.
// The remaining (num_pixels & 15) pixels go to VP8LConvertBGRAToRGB_C.
// NOTE(review): the store of 'tmp' and the 'dst' advance are not visible in
// this listing -- confirm against the full file.
// NOTE(review): the (uint8_t*) cast drops the const qualifier of 'src'.
56 static void ConvertBGRAToRGB(const uint32_t* src
,
57 int num_pixels
, uint8_t* dst
) {
58 const uint32_t* const end
= src
+ (num_pixels
& ~15);
59 for (; src
< end
; src
+= 16) {
60 const uint8x16x4_t pixel
= vld4q_u8((uint8_t*)src
);
// Planes taken in reverse order: 2, 1, 0; plane 3 is left out.
61 const uint8x16x3_t tmp
= { { pixel
.val
[2], pixel
.val
[1], pixel
.val
[0] } };
// Tail: fewer than 16 pixels remain.
65 VP8LConvertBGRAToRGB_C(src
, num_pixels
& 15, dst
); // left-overs
68 #else // WORK_AROUND_GCC
// Byte-index table used with vtbl1_u8 on two 4-byte pixels (8 bytes): within
// each pixel, bytes 0 and 2 trade places while bytes 1 and 3 stay put.
72 static const uint8_t kRGBAShuffle
[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
// WORK_AROUND_GCC variant: converts pixels two at a time.
// Each iteration loads 8 bytes (2 pixels), permutes them through kRGBAShuffle
// with vtbl1_u8 and stores the 8 resulting bytes at 'dst'.
// A possible odd last pixel (num_pixels & 1) goes to VP8LConvertBGRAToRGBA_C.
// NOTE(review): the 'dst' advance at the end of the loop body is not visible
// in this listing -- confirm against the full file.
// NOTE(review): the (uint8_t*) cast drops the const qualifier of 'src'.
74 static void ConvertBGRAToRGBA(const uint32_t* src
,
75 int num_pixels
, uint8_t* dst
) {
76 const uint32_t* const end
= src
+ (num_pixels
& ~1);
// The shuffle table is loaded once, outside the loop.
77 const uint8x8_t shuffle
= vld1_u8(kRGBAShuffle
);
78 for (; src
< end
; src
+= 2) {
79 const uint8x8_t pixels
= vld1_u8((uint8_t*)src
);
80 vst1_u8(dst
, vtbl1_u8(pixels
, shuffle
));
// Tail: at most one pixel remains.
83 VP8LConvertBGRAToRGBA_C(src
, num_pixels
& 1, dst
); // left-overs
// Three 8-entry byte-index rows used with vtbl4_u8 over a 32-byte (8 pixel)
// source. Read in sequence, the 24 indices are 0,1,2, 4,5,6, 8,9,10, ...:
// every fourth source byte (3, 7, 11, ...) is skipped, so each pixel
// contributes its bytes 0..2 in original order.
86 static const uint8_t kBGRShuffle
[3][8] = {
87 { 0, 1, 2, 4, 5, 6, 8, 9 },
88 { 10, 12, 13, 14, 16, 17, 18, 20 },
89 { 21, 22, 24, 25, 26, 28, 29, 30 }
// WORK_AROUND_GCC variant: converts 8 pixels (32 source bytes) per iteration
// into 24 output bytes.
// Four 8-byte loads (at src+0, +2, +4, +6, counted in uint32_t units) form the
// lookup source 'pixels'; three vtbl4_u8 lookups with the rows of kBGRShuffle
// produce the 8-byte chunks written at dst+0, dst+8 and dst+16.
// The remaining (num_pixels & 7) pixels go to VP8LConvertBGRAToBGR_C.
// NOTE(review): the declaration/initialisation of 'pixels' that wraps the four
// loads, and the 'dst' advance at the end of the loop body, are not visible in
// this listing -- confirm against the full file.
92 static void ConvertBGRAToBGR(const uint32_t* src
,
93 int num_pixels
, uint8_t* dst
) {
94 const uint32_t* const end
= src
+ (num_pixels
& ~7);
// One shuffle vector per 8-byte output chunk, loaded once outside the loop.
95 const uint8x8_t shuffle0
= vld1_u8(kBGRShuffle
[0]);
96 const uint8x8_t shuffle1
= vld1_u8(kBGRShuffle
[1]);
97 const uint8x8_t shuffle2
= vld1_u8(kBGRShuffle
[2]);
98 for (; src
< end
; src
+= 8) {
// Four consecutive 8-byte loads covering the 8 pixels of this iteration.
101 vld1_u8((const uint8_t*)(src
+ 0)),
102 vld1_u8((const uint8_t*)(src
+ 2)),
103 vld1_u8((const uint8_t*)(src
+ 4)),
104 vld1_u8((const uint8_t*)(src
+ 6)));
// 3 x 8 = 24 output bytes for the 8 input pixels.
105 vst1_u8(dst
+ 0, vtbl4_u8(pixels
, shuffle0
));
106 vst1_u8(dst
+ 8, vtbl4_u8(pixels
, shuffle1
));
107 vst1_u8(dst
+ 16, vtbl4_u8(pixels
, shuffle2
));
// Tail: fewer than 8 pixels remain.
110 VP8LConvertBGRAToBGR_C(src
, num_pixels
& 7, dst
); // left-overs
// Three 8-entry byte-index rows used with vtbl4_u8 over a 32-byte (8 pixel)
// source. Read in sequence, the 24 indices are 2,1,0, 6,5,4, 10,9,8, ...:
// each pixel contributes its bytes 2, 1, 0 (reversed order) and its fourth
// byte (3, 7, 11, ...) is skipped.
113 static const uint8_t kRGBShuffle
[3][8] = {
114 { 2, 1, 0, 6, 5, 4, 10, 9 },
115 { 8, 14, 13, 12, 18, 17, 16, 22 },
116 { 21, 20, 26, 25, 24, 30, 29, 28 }
// WORK_AROUND_GCC variant: converts 8 pixels (32 source bytes) per iteration
// into 24 output bytes with the channel order reversed.
// Four 8-byte loads (at src+0, +2, +4, +6, counted in uint32_t units) form the
// lookup source 'pixels'; three vtbl4_u8 lookups with the rows of kRGBShuffle
// produce the 8-byte chunks written at dst+0, dst+8 and dst+16.
// The remaining (num_pixels & 7) pixels go to VP8LConvertBGRAToRGB_C.
// NOTE(review): the declaration/initialisation of 'pixels' that wraps the four
// loads, and the 'dst' advance at the end of the loop body, are not visible in
// this listing -- confirm against the full file.
119 static void ConvertBGRAToRGB(const uint32_t* src
,
120 int num_pixels
, uint8_t* dst
) {
121 const uint32_t* const end
= src
+ (num_pixels
& ~7);
// One shuffle vector per 8-byte output chunk, loaded once outside the loop.
122 const uint8x8_t shuffle0
= vld1_u8(kRGBShuffle
[0]);
123 const uint8x8_t shuffle1
= vld1_u8(kRGBShuffle
[1]);
124 const uint8x8_t shuffle2
= vld1_u8(kRGBShuffle
[2]);
125 for (; src
< end
; src
+= 8) {
// Four consecutive 8-byte loads covering the 8 pixels of this iteration.
128 vld1_u8((const uint8_t*)(src
+ 0)),
129 vld1_u8((const uint8_t*)(src
+ 2)),
130 vld1_u8((const uint8_t*)(src
+ 4)),
131 vld1_u8((const uint8_t*)(src
+ 6)));
// 3 x 8 = 24 output bytes for the 8 input pixels.
132 vst1_u8(dst
+ 0, vtbl4_u8(pixels
, shuffle0
));
133 vst1_u8(dst
+ 8, vtbl4_u8(pixels
, shuffle1
));
134 vst1_u8(dst
+ 16, vtbl4_u8(pixels
, shuffle2
));
// Tail: fewer than 8 pixels remain.
137 VP8LConvertBGRAToRGB_C(src
, num_pixels
& 7, dst
); // left-overs
140 #endif // !WORK_AROUND_GCC
142 //------------------------------------------------------------------------------
144 #ifdef USE_INTRINSICS
146 static WEBP_INLINE
uint32_t Average2(const uint32_t* const a
,
147 const uint32_t* const b
) {
148 const uint8x8_t a0
= vreinterpret_u8_u64(vcreate_u64(*a
));
149 const uint8x8_t b0
= vreinterpret_u8_u64(vcreate_u64(*b
));
150 const uint8x8_t avg
= vhadd_u8(a0
, b0
);
151 return vget_lane_u32(vreinterpret_u32_u8(avg
), 0);
154 static WEBP_INLINE
uint32_t Average3(const uint32_t* const a
,
155 const uint32_t* const b
,
156 const uint32_t* const c
) {
157 const uint8x8_t a0
= vreinterpret_u8_u64(vcreate_u64(*a
));
158 const uint8x8_t b0
= vreinterpret_u8_u64(vcreate_u64(*b
));
159 const uint8x8_t c0
= vreinterpret_u8_u64(vcreate_u64(*c
));
160 const uint8x8_t avg1
= vhadd_u8(a0
, c0
);
161 const uint8x8_t avg2
= vhadd_u8(avg1
, b0
);
162 return vget_lane_u32(vreinterpret_u32_u8(avg2
), 0);
165 static WEBP_INLINE
uint32_t Average4(const uint32_t* const a
,
166 const uint32_t* const b
,
167 const uint32_t* const c
,
168 const uint32_t* const d
) {
169 const uint8x8_t a0
= vreinterpret_u8_u64(vcreate_u64(*a
));
170 const uint8x8_t b0
= vreinterpret_u8_u64(vcreate_u64(*b
));
171 const uint8x8_t c0
= vreinterpret_u8_u64(vcreate_u64(*c
));
172 const uint8x8_t d0
= vreinterpret_u8_u64(vcreate_u64(*d
));
173 const uint8x8_t avg1
= vhadd_u8(a0
, b0
);
174 const uint8x8_t avg2
= vhadd_u8(c0
, d0
);
175 const uint8x8_t avg3
= vhadd_u8(avg1
, avg2
);
176 return vget_lane_u32(vreinterpret_u32_u8(avg3
), 0);
// Predictor #5: Average3() of 'left', top[0] and top[1], passed in that order.
static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  return Average3(&left, &top[0], &top[1]);
}
// Predictor #6: Average2() of 'left' and top[-1].
static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  return Average2(&left, &top[-1]);
}
// Predictor #7: Average2() of 'left' and top[0].
static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  return Average2(&left, &top[0]);
}
// Predictor #8: Average2() of top[-1] and top[0].
// 'left' is not read by the visible code.
// NOTE(review): one line of the body is missing from this listing (possibly
// an unused-parameter marker for 'left') -- confirm against the full file.
191 static uint32_t Predictor8(uint32_t left
, const uint32_t* const top
) {
193 return Average2(top
- 1, top
+ 0);
// Predictor #9: Average2() of top[0] and top[1].
// 'left' is not read by the visible code.
// NOTE(review): one line of the body is missing from this listing (possibly
// an unused-parameter marker for 'left') -- confirm against the full file.
196 static uint32_t Predictor9(uint32_t left
, const uint32_t* const top
) {
198 return Average2(top
+ 0, top
+ 1);
// Predictor #10: Average4() of 'left', top[-1], top[0] and top[1], passed in
// that order (so 'left'/top[-1] and top[0]/top[1] form the two inner pairs).
static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  return Average4(&left, &top[-1], &top[0], &top[1]);
}
205 //------------------------------------------------------------------------------
207 static WEBP_INLINE
uint32_t Select(const uint32_t* const c0
,
208 const uint32_t* const c1
,
209 const uint32_t* const c2
) {
210 const uint8x8_t p0
= vreinterpret_u8_u64(vcreate_u64(*c0
));
211 const uint8x8_t p1
= vreinterpret_u8_u64(vcreate_u64(*c1
));
212 const uint8x8_t p2
= vreinterpret_u8_u64(vcreate_u64(*c2
));
213 const uint8x8_t bc
= vabd_u8(p1
, p2
); // |b-c|
214 const uint8x8_t ac
= vabd_u8(p0
, p2
); // |a-c|
215 const int16x4_t sum_bc
= vreinterpret_s16_u16(vpaddl_u8(bc
));
216 const int16x4_t sum_ac
= vreinterpret_s16_u16(vpaddl_u8(ac
));
217 const int32x2_t diff
= vpaddl_s16(vsub_s16(sum_bc
, sum_ac
));
218 const int32_t pa_minus_pb
= vget_lane_s32(diff
, 0);
219 return (pa_minus_pb
<= 0) ? *c0
: *c1
;
// Predictor #11: Select() between top[0] and 'left', using top[-1] as the
// reference pixel.
static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  return Select(&top[0], &left, &top[-1]);
}
226 static WEBP_INLINE
uint32_t ClampedAddSubtractFull(const uint32_t* const c0
,
227 const uint32_t* const c1
,
228 const uint32_t* const c2
) {
229 const uint8x8_t p0
= vreinterpret_u8_u64(vcreate_u64(*c0
));
230 const uint8x8_t p1
= vreinterpret_u8_u64(vcreate_u64(*c1
));
231 const uint8x8_t p2
= vreinterpret_u8_u64(vcreate_u64(*c2
));
232 const uint16x8_t sum0
= vaddl_u8(p0
, p1
); // add and widen
233 const uint16x8_t sum1
= vqsubq_u16(sum0
, vmovl_u8(p2
)); // widen and subtract
234 const uint8x8_t out
= vqmovn_u16(sum1
); // narrow and clamp
235 return vget_lane_u32(vreinterpret_u32_u8(out
), 0);
// Predictor #12: ClampedAddSubtractFull() of 'left' and top[0], with top[-1]
// as the value subtracted.
static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractFull(&left, &top[0], &top[-1]);
}
242 static WEBP_INLINE
uint32_t ClampedAddSubtractHalf(const uint32_t* const c0
,
243 const uint32_t* const c1
,
244 const uint32_t* const c2
) {
245 const uint8x8_t p0
= vreinterpret_u8_u64(vcreate_u64(*c0
));
246 const uint8x8_t p1
= vreinterpret_u8_u64(vcreate_u64(*c1
));
247 const uint8x8_t p2
= vreinterpret_u8_u64(vcreate_u64(*c2
));
248 const uint8x8_t avg
= vhadd_u8(p0
, p1
); // Average(c0,c1)
249 const uint8x8_t ab
= vshr_n_u8(vqsub_u8(avg
, p2
), 1); // (a-b)>>1 saturated
250 const uint8x8_t ba
= vshr_n_u8(vqsub_u8(p2
, avg
), 1); // (b-a)>>1 saturated
251 const uint8x8_t out
= vqsub_u8(vqadd_u8(avg
, ab
), ba
);
252 return vget_lane_u32(vreinterpret_u32_u8(out
), 0);
// Predictor #13: ClampedAddSubtractHalf() of 'left' and top[0], with top[-1]
// as the reference value.
static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractHalf(&left, &top[0], &top[-1]);
}
259 //------------------------------------------------------------------------------
260 // Subtract-Green Transform
262 // The vtbl? intrinsics are unavailable in iOS/arm64 builds.
263 #if !defined(__aarch64__)
// Byte-index table used with vtbl1_u8 on two 4-byte pixels (8 bytes): byte 1
// of each pixel is replicated to positions 0 and 2 of that pixel, while
// positions 1 and 3 receive zero (index 255 is out of range for the lookup).
265 // 255 = byte will be zero'd
266 static const uint8_t kGreenShuffle
[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
268 static void SubtractGreenFromBlueAndRed(uint32_t* argb_data
, int num_pixels
) {
269 const uint32_t* const end
= argb_data
+ (num_pixels
& ~3);
270 const uint8x8_t shuffle
= vld1_u8(kGreenShuffle
);
271 for (; argb_data
< end
; argb_data
+= 4) {
272 const uint8x16_t argb
= vld1q_u8((uint8_t*)argb_data
);
273 const uint8x16_t greens
=
274 vcombine_u8(vtbl1_u8(vget_low_u8(argb
), shuffle
),
275 vtbl1_u8(vget_high_u8(argb
), shuffle
));
276 vst1q_u8((uint8_t*)argb_data
, vsubq_u8(argb
, greens
));
278 // fallthrough and finish off with plain-C
279 VP8LSubtractGreenFromBlueAndRed_C(argb_data
, num_pixels
& 3);
282 static void AddGreenToBlueAndRed(uint32_t* argb_data
, int num_pixels
) {
283 const uint32_t* const end
= argb_data
+ (num_pixels
& ~3);
284 const uint8x8_t shuffle
= vld1_u8(kGreenShuffle
);
285 for (; argb_data
< end
; argb_data
+= 4) {
286 const uint8x16_t argb
= vld1q_u8((uint8_t*)argb_data
);
287 const uint8x16_t greens
=
288 vcombine_u8(vtbl1_u8(vget_low_u8(argb
), shuffle
),
289 vtbl1_u8(vget_high_u8(argb
), shuffle
));
290 vst1q_u8((uint8_t*)argb_data
, vaddq_u8(argb
, greens
));
292 // fallthrough and finish off with plain-C
293 VP8LAddGreenToBlueAndRed_C(argb_data
, num_pixels
& 3);
296 #endif // !__aarch64__
298 #endif // USE_INTRINSICS
300 #endif // WEBP_USE_NEON
302 //------------------------------------------------------------------------------
// Installs the NEON implementations from this file into the VP8L function
// pointers. When WEBP_USE_NEON is not defined, no assignment is compiled in.
// NOTE(review): the #endif lines closing the USE_INTRINSICS and __aarch64__
// conditionals, and the closing brace, are not visible in this listing.
304 extern void VP8LDspInitNEON(void);
306 void VP8LDspInitNEON(void) {
307 #if defined(WEBP_USE_NEON)
// Colorspace converters: set whenever NEON is enabled.
308 VP8LConvertBGRAToRGBA
= ConvertBGRAToRGBA
;
309 VP8LConvertBGRAToBGR
= ConvertBGRAToBGR
;
310 VP8LConvertBGRAToRGB
= ConvertBGRAToRGB
;
312 #ifdef USE_INTRINSICS
// Only predictor slots 5..13 are overridden here.
313 VP8LPredictors
[5] = Predictor5
;
314 VP8LPredictors
[6] = Predictor6
;
315 VP8LPredictors
[7] = Predictor7
;
316 VP8LPredictors
[8] = Predictor8
;
317 VP8LPredictors
[9] = Predictor9
;
318 VP8LPredictors
[10] = Predictor10
;
319 VP8LPredictors
[11] = Predictor11
;
320 VP8LPredictors
[12] = Predictor12
;
321 VP8LPredictors
[13] = Predictor13
;
323 #if !defined(__aarch64__)
// The green transforms use vtbl1_u8 and are therefore excluded on aarch64.
324 VP8LSubtractGreenFromBlueAndRed
= SubtractGreenFromBlueAndRed
;
325 VP8LAddGreenToBlueAndRed
= AddGreenToBlueAndRed
;
329 #endif // WEBP_USE_NEON
332 //------------------------------------------------------------------------------