1 // Copyright 2014 Google Inc. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
12 #ifndef WEBP_DSP_NEON_H_
13 #define WEBP_DSP_NEON_H_
// Right now, some intrinsic functions seem slower, so we disable them
// everywhere except aarch64, where the inline assembly is incompatible.
21 #if defined(__aarch64__)
22 #define USE_INTRINSICS // use intrinsics when possible
25 #define INIT_VECTOR2(v, a, b) do { \
30 #define INIT_VECTOR3(v, a, b, c) do { \
36 #define INIT_VECTOR4(v, a, b, c, d) do { \
43 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
44 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
45 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
46 #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
47 #define WORK_AROUND_GCC
50 static WEBP_INLINE int32x4x4_t
Transpose4x4(const int32x4x4_t rows
) {
51 uint64x2x2_t row01
, row23
;
53 row01
.val
[0] = vreinterpretq_u64_s32(rows
.val
[0]);
54 row01
.val
[1] = vreinterpretq_u64_s32(rows
.val
[1]);
55 row23
.val
[0] = vreinterpretq_u64_s32(rows
.val
[2]);
56 row23
.val
[1] = vreinterpretq_u64_s32(rows
.val
[3]);
57 // Transpose 64-bit values (there's no vswp equivalent)
59 const uint64x1_t row0h
= vget_high_u64(row01
.val
[0]);
60 const uint64x1_t row2l
= vget_low_u64(row23
.val
[0]);
61 const uint64x1_t row1h
= vget_high_u64(row01
.val
[1]);
62 const uint64x1_t row3l
= vget_low_u64(row23
.val
[1]);
63 row01
.val
[0] = vcombine_u64(vget_low_u64(row01
.val
[0]), row2l
);
64 row23
.val
[0] = vcombine_u64(row0h
, vget_high_u64(row23
.val
[0]));
65 row01
.val
[1] = vcombine_u64(vget_low_u64(row01
.val
[1]), row3l
);
66 row23
.val
[1] = vcombine_u64(row1h
, vget_high_u64(row23
.val
[1]));
69 const int32x4x2_t out01
= vtrnq_s32(vreinterpretq_s32_u64(row01
.val
[0]),
70 vreinterpretq_s32_u64(row01
.val
[1]));
71 const int32x4x2_t out23
= vtrnq_s32(vreinterpretq_s32_u64(row23
.val
[0]),
72 vreinterpretq_s32_u64(row23
.val
[1]));
74 out
.val
[0] = out01
.val
[0];
75 out
.val
[1] = out01
.val
[1];
76 out
.val
[2] = out23
.val
[0];
77 out
.val
[3] = out23
.val
[1];
82 #endif // WEBP_DSP_NEON_H_