2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
; Publish the routine's symbol so that other object files can link against it.
    EXPORT  |vp8_short_idct4x4llm_neon|

; Everything that follows is assembled into the read-only code area ||.text||.
; ALIGN=2 requests 2^2 = 4-byte alignment for the area.
    AREA    ||.text||, CODE, READONLY, ALIGN=2
19 ;*************************************************************
20 ;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
24 ;*************************************************************
25 ;static const int cospi8sqrt2minus1=20091;
26 ;static const int sinpi8sqrt2 =35468;
27 ;static const int rounding = 0;
;Optimization note: The data resulting from dequantization are signed 13-bit values
;in the range [-4096, 4095]. This allows the NEON "vqdmulh" instruction to be used, since
;the product cannot go out of range (13+16+1=30 bits < 32 bits). This instruction returns
;the high half of the multiplication result, which is what the IDCT needs.
|vp8_short_idct4x4llm_neon| PROC
    ; Applies the same multiply / add / subtract sequence twice to sixteen
    ; 16-bit values loaded from [r0].
    ;
    ;   r0      address the 16 input halfwords are read from
    ;   d0      multiplier lanes; d0[0] and d0[2] are used as scalars below
    ;   q1-q6   working registers; d2-d5 hold the result of each pass
    ;
    ; NOTE(review): in the lines visible here d0 is read but never written,
    ; nothing is stored back to memory, and there is no return instruction or
    ; ENDP. The source line numbering has gaps, so those statements are
    ; presumably missing from this view rather than from the routine --
    ; confirm against the complete file before assembling.
    ; NOTE(review): q4-q6 alias d8-d13, which the AAPCS lists as callee-saved;
    ; no save/restore is visible here -- confirm how callers cope with that.

    ;------------------------------------------------------------------
    ; Load the input: d2 = halfwords 0-3, d3 = 4-7, d4 = 8-11, d5 = 12-15
    ;------------------------------------------------------------------
    vld1.16         {q1, q2}, [r0]

    ; Exchange the two middle groups: q1 = {0-3, 8-11}, q2 = {4-7, 12-15}
    vswp            d3, d4                  ;q2(vp[4] vp[12])

    ;------------------------------------------------------------------
    ; First pass
    ;------------------------------------------------------------------
    ; High half of the doubled products of q2 with the two scalars in d0
    vqdmulh.s16     q3, q2, d0[2]           ;q3 (d6,d7) = q2 * d0[2]
    vqdmulh.s16     q4, q2, d0[0]           ;q4 (d8,d9) = q2 * d0[0]

    vqadd.s16       d12, d2, d3             ;a1 = d2 + d3
    vqsub.s16       d13, d2, d3             ;b1 = d2 - d3

    ; Add the multiplicand back onto the d0[2] product:
    ; modify since sinpi8sqrt2 > 65536/2 (negative number as signed 16-bit)
    vqadd.s16       q3, q3, q2

    vqsub.s16       d10, d6, d9             ;c1 = d6 - d9
    vqadd.s16       d11, d7, d8             ;d1 = d7 + d8

    vqadd.s16       d2, d12, d11            ;d2 = a1 + d1
    vqadd.s16       d3, d13, d10            ;d3 = b1 + c1
    vqsub.s16       d4, d13, d10            ;d4 = b1 - c1
    vqsub.s16       d5, d12, d11            ;d5 = a1 - d1

    ;------------------------------------------------------------------
    ; Second pass: identical sequence, now fed with the d2-d5 values
    ; produced above (q2 = d4,d5)
    ;------------------------------------------------------------------
    vqdmulh.s16     q3, q2, d0[2]           ;q3 (d6,d7) = q2 * d0[2]
    vqdmulh.s16     q4, q2, d0[0]           ;q4 (d8,d9) = q2 * d0[0]

    vqadd.s16       d12, d2, d3             ;a1 = d2 + d3
    vqsub.s16       d13, d2, d3             ;b1 = d2 - d3

    ; modify since sinpi8sqrt2 > 65536/2 (negative number as signed 16-bit)
    vqadd.s16       q3, q3, q2

    vqsub.s16       d10, d6, d9             ;c1 = d6 - d9
    vqadd.s16       d11, d7, d8             ;d1 = d7 + d8

    vqadd.s16       d2, d12, d11            ;d2 = a1 + d1
    vqadd.s16       d3, d13, d10            ;d3 = b1 + c1
    vqsub.s16       d4, d13, d10            ;d4 = b1 - c1
    vqsub.s16       d5, d12, d11            ;d5 = a1 - d1
; Data area idct4x4_dat holding the multiplier constants.
; Two 32-bit words are emitted; each packs the same 16-bit value twice, so the
; area contains the four halfwords 20091, 20091, 35468, 35468
; (cospi8sqrt2minus1 = 20091 = 0x4e7b, sinpi8sqrt2 = 35468 = 0x8a8c).
; NOTE(review): no label for this table is visible in this view; code that
; loads these constants needs one -- confirm against the complete file.
    AREA    idct4x4_dat, DATA, READWRITE    ;read/write by default

    DCD     0x4e7b4e7b                      ;20091, 20091
    DCD     0x8a8c8a8c                      ;35468, 35468