;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
    .globl vp8_mse16x16_ppc
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc
.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute vector for alignment
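;# AltiVec lvx ignores the low four address bits, so an unaligned
;# 16-byte load is normally done by loading the two aligned quadwords
;# that straddle the address and selecting bytes with vperm using the
;# lvsl permute control in v3.  The rest of the macro body is not shown
;# in this excerpt, but is assumed to follow that pattern, leaving the
;# 16 bytes starting at \R in \V, with \O holding the offset (16) for
;# the second load.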
    mfspr   r11, 256            ;# get old VRSAVE
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v7, 0              ;# zero for merging
    vspltisw v8, 0              ;# zero out total to start
    vspltisw v9, 0              ;# zero out total for dif^2
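;# Working registers set up here: r10 = 16 doubles as the offset for
;# the second aligned load and as the loop-count value, v7 is the zero
;# vector used when unpacking bytes, v8 accumulates the signed sum of
;# differences, and v9 accumulates the sum of squared differences.
;# r12, the new VRSAVE value, is assumed to be derived from r11 by
;# code not shown in this excerpt.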
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
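;# VRSAVE is SPR 256, hence the mfspr/mtspr pair: the prologue saves
;# the caller's mask and enables the vector registers used here, and
;# the epilogue restores it.  The 32 bytes reserved on the stack
;# presumably provide an aligned scratch slot for moving vector
;# results to GPRs (the stvx/lwz sequence is not shown in this
;# excerpt).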
.macro compute_sum_sse
    ;# Compute the sum first.  Unpack so a signed subtract
    ;#  can be used.  Only a halfword signed subtract is
    ;#  available, so do the high half, then the low half.

    vmsumubm v9, v2, v2, v9
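;# vmsumubm multiplies corresponding unsigned bytes of its first two
;# operands, adds each group of four products into a 32-bit word, and
;# accumulates into the destination.  With v2 assumed to hold the
;# absolute byte differences |src - ref| (the unpack/subtract sequence
;# between the comment above and this instruction is not shown in this
;# excerpt), this accumulates the sum of squared errors into v9.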
.macro variance_16 DS loop_label store_sum
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
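;# The three instructions above form the common tail:
;#
;#     variance = sse - (sum*sum) / N
;#
;# where N is the number of pixels in the block.  N is a power of two,
;# so the division is an arithmetic shift by \DS = log2(N); a 16x16
;# block, for example, has 256 pixels and uses DS = 8.  By this point
;# r3 is assumed to hold the accumulated sum and r4 the accumulated
;# sse (the per-row accumulation and the vector-to-scalar reduction
;# are not shown in this excerpt); the variance is returned in r3.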
.macro variance_8 DS loop_label store_sum
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v0, r5, r10

    ;# move onto the next line

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
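;# An 8-pixel-wide block fills only half a vector register, so each
;# iteration loads two consecutive rows from each buffer (v4/v6 from
;# src, v5/v0 from ref) and, in the full macro, presumably merges each
;# pair into one 16-byte vector before the shared sum/sse accumulation
;# (the merge and accumulation are not shown here).  \DS is again
;# log2(pixel count): 6 for 8x8 and 7 for 8x16.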
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *SSE

    variance_8 6, get8x8var_loop, 1
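;# Body of vp8_get8x8var_ppc (label elided in this excerpt, per the
;# get8x8var_loop label).  An 8x8 block has 64 pixels, so DS = 6.
;# store_sum = 1: the sum is written out as well as the SSE, which
;# suggests an additional "int *Sum" pointer argument (presumably r8,
;# not listed above); the variance itself is returned in r3.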
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *SSE

    variance_16 8, get16x16var_loop, 1
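;# Body of vp8_get16x16var_ppc: a 16x16 block has 256 pixels, so the
;# sum*sum correction is shifted by DS = 8; store_sum = 1 as above.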
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line

    vmsumubm v9, v2, v2, v9
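;# This loop belongs to vp8_mse16x16_ppc (label elided).  MSE needs
;# only the sum of squared differences, so just the v9 accumulator is
;# updated; there is no sum accumulation and no (sum*sum) >> DS
;# correction before the result is stored through r7.  As in
;# compute_sum_sse, v2 is assumed to hold the absolute byte
;# differences computed by instructions not shown here.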
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse

vp8_variance16x16_ppc:

    variance_16 8, variance16x16_loop, 0
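;# store_sum = 0: unlike the get*var entry points, the plain variance
;# functions use the sum only for the correction term; the variance is
;# returned in r3 and the SSE stored through r7.  256 pixels, DS = 8.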
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse

vp8_variance16x8_ppc:

    variance_16 7, variance16x8_loop, 0
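;# 16x8 = 128 pixels, so the sum*sum correction uses DS = 7.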
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse

vp8_variance8x16_ppc:

    variance_8 7, variance8x16_loop, 0
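;# 8x16 also covers 128 pixels (DS = 7), but uses variance_8 because
;# each row is only 8 bytes wide and two rows are packed per vector.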
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse

    variance_8 6, variance8x8_loop, 0
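;# Body of vp8_variance8x8_ppc (label elided): 8x8 = 64 pixels, DS = 6.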
.macro transfer_4x4 I P

;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
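;# Tail of vp8_variance4x4_ppc (label elided): a 4x4 block has 16
;# pixels, hence the fixed shift of 4.  The transfer_4x4 macro above
;# presumably gathers the 4-byte rows into a vector register before
;# the shared sum/sse accumulation; its body is not shown in this
;# excerpt.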