;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
    .globl vp8_mse16x16_ppc
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc
.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute value for alignment
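    ;# Note on the macro (inferred from usage, not stated in the source):
    ;#  \V appears to be the destination vector, \R the source address and
    ;#  \O an offset register (16 in the callers below).  lvsl builds the
    ;#  permute control in v3; the remainder of the macro is assumed to
    ;#  combine two aligned 16-byte loads with vperm so that \V receives
    ;#  the possibly unaligned data.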
    mfspr   r11, 256            ;# get old VRSAVE
    mtspr   256, r12            ;# set VRSAVE
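    ;# VRSAVE (SPR 256) records which vector registers must be preserved
    ;#  across context switches; the previous value kept in r11 is
    ;#  restored in the epilogue below.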
    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v7, 0              ;# zero for merging
    vspltisw v8, 0              ;# zero out total to start
    vspltisw v9, 0              ;# zero out total for dif^2
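    ;# Per the comments above: v7 stays zero and is used when unpacking
    ;#  bytes to half words, v8 accumulates the signed sum of (src - ref),
    ;#  and v9 accumulates the sum of squared differences (the SSE).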
    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
.macro compute_sum_sse
    ;# Compute the sum first.  Unpack bytes to half words so a signed
    ;#  subtract can be used (only a half-word signed subtract is
    ;#  available).  Do the high half, then the low half.
    vmsumubm v9, v2, v2, v9
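    ;# vmsumubm multiplies corresponding unsigned bytes, sums each group
    ;#  of four products and adds the result to the words of v9.  Here v2
    ;#  is assumed to hold the absolute difference |src - ref|, so v9
    ;#  carries the running sum of squared differences.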
.macro variance_16 DS loop_label store_sum
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
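    ;# The return value follows the usual shortcut
    ;#     variance = SSE - (sum * sum) / N
    ;#  where N is the number of pixels in the block and \DS = log2(N):
    ;#  DS = 8 for 16x16 (256 pixels), 7 for 16x8 and 8x16 (128 pixels),
    ;#  6 for 8x8 (64 pixels) and 4 for 4x4 (16 pixels), matching the
    ;#  callers below.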
.macro variance_8 DS loop_label store_sum
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v0, r5, r10

    ;# move onto the next line
    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
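    ;# variance_8 differs from variance_16 only in that each iteration
    ;#  consumes two 8-pixel rows: v4/v5 and then v6/v0 are loaded from
    ;#  consecutive lines and are presumably merged into a single pair of
    ;#  16-byte vectors before compute_sum_sse runs.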
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
    variance_8 6, get8x8var_loop, 1
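;# A plausible C view of this entry point (a sketch, not taken from the
;#  source; the Sum argument is inferred from store_sum being 1 here):
;#
;#   unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride,
;#                                  unsigned char *ref_ptr, int recon_stride,
;#                                  unsigned int *SSE, int *Sum);
;#
;#  r3..r7 carry the first five arguments per the PowerPC ABI, and the
;#  variance value computed above is returned in r3.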
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
    variance_16 8, get16x16var_loop, 1
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    vmsumubm v9, v2, v2, v9
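    ;# This block (presumably vp8_mse16x16_ppc, given its position in the
    ;#  .globl list) appears to skip the sum computation entirely: only
    ;#  the squared differences are accumulated in v9, so the value
    ;#  stored through r7 and returned is the raw SSE with no
    ;#  (sum*sum) >> DS correction.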
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
vp8_variance16x16_ppc:
    variance_16 8, variance16x16_loop, 0
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
vp8_variance16x8_ppc:
    variance_16 7, variance16x8_loop, 0
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
vp8_variance8x16_ppc:
    variance_8 7, variance8x16_loop, 0
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
    variance_8 6, variance8x8_loop, 0
.macro transfer_4x4 I P
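    ;# \I is taken to be the row pointer and \P the stride (an inference
    ;#  from the parameter names, not stated in the source); the macro
    ;#  appears to gather the four 4-byte rows of a 4x4 block so the
    ;#  whole block can be handled in a single vector.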
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse
    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
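    ;# For the 4x4 case the block holds 16 pixels, so the shift count is
    ;#  a literal 4 (16 = 2^4) rather than a macro parameter.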