Define RDCOST only once
[libvpx.git] / vp8 / encoder / ppc / sad_altivec.asm
blob e5f26380f96be5c0dc615db4113a8dff88d0d119
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
;# Symbols exported from this file: one SAD routine per block size.
    .globl vp8_sad4x4_ppc
    .globl vp8_sad8x8_ppc
    .globl vp8_sad8x16_ppc
    .globl vp8_sad16x8_ppc
    .globl vp8_sad16x16_ppc
;# load_aligned_16 V R O
;#   Gathers the 16 bytes that start at the (possibly unaligned) address
;#   held in \R into vector register \V.
;#     \V  destination vector register
;#     \R  GPR holding the byte address to read from (not modified)
;#     \O  GPR holding the offset of the second quadword; the callers in
;#         this file pass r10, which the prologue macro loads with 16
;#   Clobbers: v1, v2, v3
.macro load_aligned_16 V R O
    lvx     v1,  0, \R          ;# aligned quadword covering \R
    lvx     v2, \O, \R          ;# aligned quadword covering \R + \O
    lvsl    v3,  0, \R          ;# permute control built from the low bits of \R

    vperm   \V, v1, v2, v3      ;# select the 16 bytes beginning at \R
.endm
;# prologue
;#   Shared entry sequence for every SAD routine in this file.
;#   On exit:
;#     r11 = VRSAVE value found on entry (consumed by the epilogue macro)
;#     r12 = scratch (new VRSAVE value)
;#     r10 = 16, used as the second-quadword offset by the vector loads
;#     v8  = 0, the running SAD accumulator
;#     r1  = lowered by 32 bytes; stwu leaves the old r1 at 0(r1)
.macro prologue
    mfspr   r11, 256            ;# fetch the current VRSAVE
    oris    r12, r11, 0xffc0    ;# additionally flag v0-v9 as in use
    mtspr   256, r12            ;# install the updated VRSAVE

    vspltisw v8, 0              ;# clear the accumulator
    li      r10, 16             ;# offset of the second quadword

    stwu    r1, -32(r1)         ;# open a 32-byte scratch frame
.endm
;# epilogue
;#   Undoes the prologue macro: restores the VRSAVE value kept in r11 and
;#   releases the 32-byte scratch frame. Does not return to the caller.
.macro epilogue
    mtspr   256, r11            ;# put back the VRSAVE saved on entry
    addi    r1, r1, 32          ;# drop the scratch frame
.endm
;# SAD_16
;#   Adds the absolute byte differences of v4 and v5 to the four word
;#   partial sums held in v8.
;#   In:       v4, v5 = 16 unsigned bytes each; v8 = running partial sums
;#   Out:      v8 updated
;#   Clobbers: v6, v7
.macro SAD_16
    ;# |a - b| for unsigned bytes: one of the two saturating differences is
    ;# zero, so OR-ing them yields the absolute value.
    vsububs v7, v5, v4          ;# max(v5 - v4, 0)
    vsububs v6, v4, v5          ;# max(v4 - v5, 0)
    vor     v6, v6, v7          ;# v6 = abs(v4 - v5)

    ;# fold each group of four bytes into the matching word of v8
    vsum4ubs v8, v6, v8
.endm
;# sad_16_loop loop_label
;#   Sum of absolute differences over a 16-byte-wide block, processing two
;#   rows per loop iteration (software pipelined: the next row is loaded
;#   while the current one is being summed).
;#   In:   r3  = src_ptr, read with plain lvx (i.e. treated as 16-byte aligned)
;#         r4  = src_stride
;#         r5  = ref_ptr, any alignment (realigned with lvsl/vperm)
;#         r6  = ref_stride
;#         r10 = 16 and v8 = 0, as left by the prologue macro
;#         CTR = number of row pairs
;#         r1  = 32-byte scratch frame opened by the prologue macro
;#   Out:  r3  = SAD
;#   Clobbers: r5, v1-v9, CTR
;#   NOTE: the permute control is computed once, so this assumes every ref
;#   row has the same alignment as the first (presumably ref_stride is a
;#   multiple of 16 - confirm against callers).
;#   NOTE: because of the preload, one row past the last processed row is
;#   loaded from both src and ref (loaded only, never summed).
.macro sad_16_loop loop_label
    lvsl    v3,  0, r5          ;# only needs to be done once per block

    ;# preload a line of data before getting into the loop
    lvx     v4,  0, r3
    lvx     v1,  0, r5
    lvx     v2, r10, r5

    add     r5, r5, r6
    add     r3, r3, r4

    vperm   v5, v1, v2, v3

    .align 4
\loop_label:
    ;# compute difference on first row
    vsububs v6, v4, v5
    vsububs v7, v5, v4

    ;# load up next set of data
    lvx     v9,  0, r3
    lvx     v1,  0, r5
    lvx     v2, r10, r5

    ;# perform abs() of difference
    vor     v6, v6, v7
    add     r3, r3, r4

    ;# add to the running tally
    vsum4ubs v8, v6, v8

    ;# now onto the next line
    vperm   v5, v1, v2, v3
    add     r5, r5, r6
    lvx     v4,  0, r3

    ;# compute difference on second row
    vsububs v6, v9, v5
    lvx     v1,  0, r5
    vsububs v7, v5, v9
    lvx     v2, r10, r5
    vor     v6, v6, v7
    add     r3, r3, r4
    vsum4ubs v8, v6, v8
    vperm   v5, v1, v2, v3
    add     r5, r5, r6

    bdnz    \loop_label

    ;# reduce the four word partial sums into the last word of v8
    vspltisw v7, 0
    vsumsws v8, v8, v7

    ;# Spill into the upper half of the scratch frame (16(r1)..31(r1)) so
    ;# the back-chain word that stwu stored at 0(r1) stays intact.
    stvx    v8, r10, r1
    lwz     r3, 28(r1)          ;# last word of the spilled vector = SAD
.endm
;# sad_8_loop loop_label
;#   Sum of absolute differences over an 8-byte-wide block. Each iteration
;#   loads two rows from src and ref, interleaves the first 8 bytes of the
;#   two rows into a single 16-byte vector per input, and hands the pair to
;#   SAD_16.
;#   In:   r3  = src_ptr, r4 = src_stride
;#         r5  = ref_ptr, r6 = ref_stride
;#         r10 = 16 and v8 = 0, as left by the prologue macro
;#         CTR = number of row pairs
;#         r1  = 32-byte scratch frame opened by the prologue macro
;#   Out:  r3  = SAD
;#   Clobbers: r5, v1-v8, CTR
.macro sad_8_loop loop_label
    .align 4
\loop_label:
    ;# first row of the pair
    ;# (original note: only one of the inputs should need to be aligned)
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# second row of the pair
    load_aligned_16 v6, r3, r10
    load_aligned_16 v7, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# Interleave the first 8 bytes of both rows. src and ref are merged
    ;# identically, so byte pairs still line up for the difference.
    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v7

    SAD_16

    bdnz    \loop_label

    ;# reduce the four word partial sums into the last word of v8
    vspltisw v7, 0
    vsumsws v8, v8, v7

    ;# Spill into the upper half of the scratch frame (16(r1)..31(r1)) so
    ;# the back-chain word that stwu stored at 0(r1) stays intact.
    stvx    v8, r10, r1
    lwz     r3, 28(r1)          ;# last word of the spilled vector = SAD
.endm
;# vp8_sad16x16_ppc
;#   SAD of a 16-byte-wide block; runs 8 iterations of sad_16_loop, which
;#   handles two rows per iteration.
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
    .align 2
vp8_sad16x16_ppc:

    prologue

    li      r9, 8               ;# loop count for CTR
    mtctr   r9

    sad_16_loop sad16x16_loop

    epilogue

    blr                         ;# return; otherwise control falls into the next routine
;# vp8_sad16x8_ppc
;#   SAD of a 16-byte-wide block; runs 4 iterations of sad_16_loop, which
;#   handles two rows per iteration.
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
    .align 2
vp8_sad16x8_ppc:

    prologue

    li      r9, 4               ;# loop count for CTR
    mtctr   r9

    sad_16_loop sad16x8_loop

    epilogue

    blr                         ;# return; otherwise control falls into the next routine
;# vp8_sad8x16_ppc
;#   SAD of an 8-byte-wide block; runs 8 iterations of sad_8_loop, which
;#   handles two rows per iteration.
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
    .align 2
vp8_sad8x16_ppc:

    prologue

    li      r9, 8               ;# loop count for CTR
    mtctr   r9

    sad_8_loop sad8x16_loop

    epilogue

    blr                         ;# return; otherwise control falls into the next routine
;# vp8_sad8x8_ppc
;#   SAD of an 8-byte-wide block; runs 4 iterations of sad_8_loop, which
;#   handles two rows per iteration.
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
    .align 2
vp8_sad8x8_ppc:

    prologue

    li      r9, 4               ;# loop count for CTR
    mtctr   r9

    sad_8_loop sad8x8_loop

    epilogue

    blr                         ;# return; otherwise control falls into the next routine
;# transfer_4x4 I P
;#   Copies four 4-byte rows, starting at \I and spaced \P bytes apart, into
;#   the upper half of the scratch frame (16(r1)..31(r1)) so they can be
;#   picked up with a single vector load. The lower half is left alone
;#   because 0(r1) holds the back-chain word written by stwu.
;#     \I  GPR holding the address of the first row (advanced by 3 * \P)
;#     \P  GPR holding the row pitch in bytes
;#   Clobbers: r0, r7, r8, r9
.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r7, 0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0, 16(r1)
    stw     r7, 20(r1)
    stw     r8, 24(r1)
    stw     r9, 28(r1)
.endm

;# vp8_sad4x4_ppc
;#   SAD of a 4x4 block: both blocks are packed into one vector each via the
;#   scratch frame, then differenced and summed in a single pass.
;# r3 unsigned char *src_ptr
;# r4 int src_stride
;# r5 unsigned char *ref_ptr
;# r6 int ref_stride
;#
;# r3 return value
    .align 2
vp8_sad4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, r10, r1         ;# r10 = 16 (set by prologue): the 16 src bytes

    transfer_4x4 r5, r6
    lvx     v5, r10, r1         ;# the 16 ref bytes

    ;# v6 = abs (v4 - v5)
    vsububs v6, v4, v5
    vsububs v7, v5, v4
    vor     v6, v6, v7

    ;# v7 = total of all 16 byte differences; v8 is still zero from the
    ;# prologue and serves as the zero addend for both sums
    vsum4ubs v7, v6, v8
    vsumsws v7, v7, v8

    stvx    v7, r10, r1         ;# spill to 16(r1)..31(r1)
    lwz     r3, 28(r1)          ;# last word of the spilled vector = SAD

    epilogue

    blr                         ;# return to the caller