;# libvpx: vp8/encoder/ppc/variance_altivec.asm
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
    .globl vp8_mse16x16_ppc
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc

.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute value for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm
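
;# Reference note (not part of the build): load_aligned_16 emulates an
;# unaligned 16-byte load with lvsl/lvx/vperm, since AltiVec lvx ignores the
;# low four address bits. A rough C equivalent, using a hypothetical helper
;# name, would be:
;#
;#     #include <string.h>
;#     #include <stdint.h>
;#
;#     /* copy 16 bytes from an arbitrarily aligned address */
;#     static void load_unaligned_16(uint8_t dst[16], const uint8_t *src)
;#     {
;#         memcpy(dst, src, 16);   /* lvsl/lvx/lvx/vperm in the asm */
;#     }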

.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v7, 0              ;# zero for merging
    vspltisw v8, 0              ;# zero out total to start
    vspltisw v9, 0              ;# zero out total for dif^2
.endm

.macro epilogue
    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
.endm

.macro compute_sum_sse
    ;# Compute sum first. Unpack to 16 bits so the signed subtract can be
    ;# used; only a halfword signed subtract is available, so do the high
    ;# half, then the low half.
    vmrghb  v2, v7, v4
    vmrghb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    vmrglb  v2, v7, v4
    vmrglb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9
.endm
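
;# Reference note (not part of the build): compute_sum_sse accumulates, for
;# one 16-byte pair in v4/v5, the signed sum of differences into v8 and the
;# sum of squared differences into v9. A scalar C sketch of the same per-row
;# update (function and parameter names are illustrative only):
;#
;#     static void sum_sse_row(const unsigned char *src,
;#                             const unsigned char *ref,
;#                             int *sum, unsigned int *sse)
;#     {
;#         int i;
;#         for (i = 0; i < 16; i++) {
;#             int diff = src[i] - ref[i];
;#             *sum += diff;                         /* vsubshs + vsum4shs on v8   */
;#             *sse += (unsigned int)(diff * diff);  /* vsububs/vor + vmsumubm, v9 */
;#         }
;#     }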

.macro variance_16 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
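
;# Reference note (not part of the build): after the loop, variance_16 reduces
;# v8/v9 to scalar sum and sse and returns sse - ((sum * sum) >> DS), where DS
;# is log2 of the pixel count (8 for 16x16, 7 for 16x8). A C sketch of that
;# final step (illustrative name only):
;#
;#     static unsigned int variance_from_sums(unsigned int sse, int sum, int ds)
;#     {
;#         return sse - (unsigned int)((sum * sum) >> ds);
;#     }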

.macro variance_8 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v0, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v0

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
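
;# Reference note (not part of the build): variance_8 handles 8-pixel-wide
;# blocks by reading two rows per loop iteration and combining them with
;# vmrghb, so compute_sum_sse still works on a full 16-byte vector. The asm
;# interleaves the two rows byte-by-byte, but because src and ref are merged
;# identically the pixel pairing (and hence sum/sse) is unchanged, so a plain
;# concatenation is shown here (illustrative helper name):
;#
;#     /* pack rows r and r+1 of an 8-wide block into one 16-byte chunk */
;#     static void merge_two_rows(unsigned char dst[16],
;#                                const unsigned char *row, int stride)
;#     {
;#         memcpy(dst,     row,          8);
;#         memcpy(dst + 8, row + stride, 8);
;#     }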

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get8x8var_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, get8x8var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get16x16var_ppc:

    prologue

    mtctr   r10

    variance_16 8, get16x16var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_mse16x16_ppc:
    prologue

    mtctr   r10

mse16x16_loop:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9

    bdnz    mse16x16_loop

    vsumsws v9, v9, v7

    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stw     r3, 0(r7)           ;# sse

    epilogue

    blr
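
;# Reference note (not part of the build): unlike the variance routines,
;# vp8_mse16x16_ppc only accumulates squared differences; no sum is kept and
;# nothing is subtracted at the end. A scalar C sketch (illustrative name):
;#
;#     static unsigned int mse16x16_scalar(const unsigned char *src, int src_stride,
;#                                         const unsigned char *ref, int ref_stride,
;#                                         unsigned int *sse)
;#     {
;#         unsigned int total = 0;
;#         int r, c;
;#         for (r = 0; r < 16; r++) {
;#             for (c = 0; c < 16; c++) {
;#                 int diff = src[c] - ref[c];
;#                 total += (unsigned int)(diff * diff);
;#             }
;#             src += src_stride;
;#             ref += ref_stride;
;#         }
;#         *sse = total;
;#         return total;
;#     }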

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance16x16_ppc:

    prologue

    mtctr   r10

    variance_16 8, variance16x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance16x8_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_16 7, variance16x8_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x16_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_8 7, variance8x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x8_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, variance8x8_loop, 0

    epilogue

    blr

.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r10, 0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0, 0(r1)
    stw     r10, 4(r1)
    stw     r8, 8(r1)
    stw     r9, 12(r1)
.endm
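
;# Reference note (not part of the build): transfer_4x4 gathers four 4-byte
;# rows into one contiguous 16-byte block on the stack so that a single lvx
;# can load the whole 4x4 patch. In C terms (illustrative helper name):
;#
;#     /* pack a 4x4 block, rows `stride` bytes apart, into 16 contiguous bytes */
;#     static void pack_4x4(unsigned char dst[16],
;#                          const unsigned char *src, int stride)
;#     {
;#         int r;
;#         for (r = 0; r < 4; r++)
;#             memcpy(dst + 4 * r, src + r * stride, 4);
;#     }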

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    compute_sum_sse

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)

    epilogue

    blr