;# vp8/encoder/ppc/fdct_altivec.asm
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
12 .globl vp8_short_fdct4x4_ppc
13 .globl vp8_short_fdct8x4_ppc
15 .macro load_c V, LABEL, OFF, R0, R1
16 lis \R0, \LABEL@ha
17 la \R1, \LABEL@l(\R0)
18 lvx \V, \OFF, \R1
19 .endm
;# Forward and inverse DCTs are nearly identical; only differences are
;# in normalization (fwd is twice unitary, inv is half unitary)
;# and that they are of course transposes of each other.

;# The following three accomplish most of implementation and
;# are used only by ppc_idct.c and ppc_fdct.c.

;# Shared function entry: saves VRSAVE and marks v0..v13 in use,
;# opens a 32-byte stack scratch area (used by two_rows_h to gather
;# rows into a vector), and loads the constant tables:
;#   v0..v3 = dct_tab coefficient rows   v4/v5 = ppc_dctperm_tab permutes
;#   v6     = Hround (round_tab[0]); r9 keeps the round_tab base so the
;#            callers can later reload v6 with Vround (round_tab[1]).
;#   r6     = 16 (vector offset constant, live for the whole function)
.macro prologue
    mfspr   r11, 256                    ;# get old VRSAVE
    oris    r12, r11, 0xfffc
    mtspr   256, r12                    ;# set VRSAVE

    stwu    r1, -32(r1)                 ;# create space on the stack

    li      r6, 16

    load_c v0, dct_tab, 0, r9, r10
    lvx     v1, r6, r10
    addi    r10, r10, 32
    lvx     v2, 0, r10
    lvx     v3, r6, r10

    load_c v4, ppc_dctperm_tab, 0, r9, r10
    load_c v5, ppc_dctperm_tab, r6, r9, r10

    load_c v6, round_tab, 0, r10, r9
.endm
;# Shared function exit: undoes prologue (pops the 32-byte scratch
;# area and restores the caller's VRSAVE from r11).
.macro epilogue
    addi    r1, r1, 32                  ;# recover stack
    mtspr   256, r11                    ;# reset old VRSAVE
.endm
;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
;# a/A are the even rows 0,2 and b/B are the odd rows 1,3.
;# For fwd transform, indices are horizontal positions, then frequencies.
;# For inverse transform, frequencies then positions.
;# The two resulting A0..A3 B0..B3 are later combined
;# and vertically transformed.
;#
;# Uses v0..v6 (coefficients, permutes, round) from prologue, shift
;# count in v7; clobbers v9..v11.
.macro two_rows_horiz Dst
    vperm   v9, v8, v8, v4              ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    vmsumshm v10, v0, v8, v6
    vmsumshm v10, v1, v9, v10
    vsraw   v10, v10, v7                ;# v10 = A0 A1  B0 B1

    vmsumshm v11, v2, v8, v6
    vmsumshm v11, v3, v9, v11
    vsraw   v11, v11, v7                ;# v11 = A2 A3  B2 B3

    vpkuwum v10, v10, v11               ;# v10 = A0 A1  B0 B1  A2 A3  B2 B3
    vperm   \Dst, v10, v10, v5          ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
.endm
;# Vertical xf on two rows. DCT values in comments are for inverse transform;
;# forward transform uses transpose.
;#
;# Reads the horizontally-transformed rows from v12/v13 (splatting the
;# column selected by Ceven/Codd), rounds with v6, shifts by v7, and
;# leaves the two packed output rows in v8. Clobbers v9, v10.
.macro two_rows_vert Ceven, Codd
    vspltw  v8, \Ceven, 0               ;# v8 = c00 c10  or  c02 c12 four times
    vspltw  v9, \Codd, 0                ;# v9 = c20 c30  or  c22 c32 ""
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v10, v8, v7

    vspltw  v8, \Codd, 1                ;# v8 = c01 c11  or  c03 c13
    vspltw  v9, \Ceven, 1               ;# v9 = c21 c31  or  c23 c33
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7

    vpkuwum v8, v10, v8                 ;# v8 = rows 0,1  or  2,3
.endm
;# Gather two pitch-separated 8-byte input rows into one vector and run
;# the horizontal transform on them, leaving the result in Dest.
;# Expects: r0 = first 4 bytes of the current row (preloaded by caller),
;#          r3 = current input row pointer, r5 = pitch (bytes),
;#          r8 = 16-byte-aligned stack scratch buffer.
;# Copies the two rows through the scratch buffer with scalar loads and
;# stores (inputs are only word-aligned), then lvx-loads them as v8.
;# Advances r3 to the second row via lwzux.
.macro two_rows_h Dest
    stw     r0, 0(r8)
    lwz     r0, 4(r3)
    stw     r0, 4(r8)
    lwzux   r0, r3, r5
    stw     r0, 8(r8)
    lwz     r0, 4(r3)
    stw     r0, 12(r8)
    lvx     v8, 0, r8
    two_rows_horiz \Dest
.endm
107 .align 2
108 ;# r3 short *input
109 ;# r4 short *output
110 ;# r5 int pitch
111 vp8_short_fdct4x4_ppc:
113 prologue
115 vspltisw v7, 14 ;# == 14, fits in 5 signed bits
116 addi r8, r1, 0
119 lwz r0, 0(r3)
120 two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
122 lwzux r0, r3, r5
123 two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
125 lvx v6, r6, r9 ;# v6 = Vround
126 vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
128 two_rows_vert v0, v1
129 stvx v8, 0, r4
130 two_rows_vert v2, v3
131 stvx v8, r6, r4
133 epilogue
137 .align 2
138 ;# r3 short *input
139 ;# r4 short *output
140 ;# r5 int pitch
141 vp8_short_fdct8x4_ppc:
142 prologue
144 vspltisw v7, 14 ;# == 14, fits in 5 signed bits
145 addi r8, r1, 0
146 addi r10, r3, 0
148 lwz r0, 0(r3)
149 two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
151 lwzux r0, r3, r5
152 two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
154 lvx v6, r6, r9 ;# v6 = Vround
155 vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
157 two_rows_vert v0, v1
158 stvx v8, 0, r4
159 two_rows_vert v2, v3
160 stvx v8, r6, r4
162 ;# Next block
163 addi r3, r10, 8
164 addi r4, r4, 32
165 lvx v6, 0, r9 ;# v6 = Hround
167 vspltisw v7, 14 ;# == 14, fits in 5 signed bits
168 addi r8, r1, 0
170 lwz r0, 0(r3)
171 two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
173 lwzux r0, r3, r5
174 two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
176 lvx v6, r6, r9 ;# v6 = Vround
177 vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
179 two_rows_vert v0, v1
180 stvx v8, 0, r4
181 two_rows_vert v2, v3
182 stvx v8, r6, r4
184 epilogue
188 .data
189 .align 4
190 ppc_dctperm_tab:
191 .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
192 .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
194 .align 4
195 dct_tab:
196 .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
197 .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
199 .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
200 .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
202 .align 4
203 round_tab:
204 .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
205 .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))