Initial WebM release
[libvpx.git] / vp8 / encoder / ppc / variance_altivec.asm
blob 952bf7286488a4b8e61e9955e315c9bd266c7170
; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.

    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
    .globl vp8_mse16x16_ppc
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc
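
;# Reference sketch (C-like pseudocode kept in comments only, not part of
;# the assembly; the names and flat indexing are illustrative, and stride
;# handling is omitted).  For a WxH block the routines below compute:
;#
;#     int sum = 0;  unsigned int sse = 0;
;#     for (i = 0; i < W * H; i++) {
;#         int d = src[i] - ref[i];
;#         sum += d;
;#         sse += d * d;
;#     }
;#     return sse - ((sum * sum) >> LOG2(W * H));   /* in r3 */
;#
;# Every entry point stores the sse through r7; the vp8_get*var entry
;# points additionally store the sum through r8; vp8_mse16x16_ppc computes
;# and returns only the sse.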

.macro load_aligned_16 V R O
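    ;# Standard AltiVec unaligned load: lvsl builds a permute control from
    ;# the low bits of the address in \R, the two lvx loads cover both
    ;# quadwords the data may straddle, and vperm extracts the wanted
    ;# 16 bytes into \V.  \O is expected to hold 16 (set up in prologue).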
    lvsl    v3,  0, \R          ;# permute control for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm

.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v7, 0              ;# zero for merging
    vspltisw v8, 0              ;# zero out total to start
    vspltisw v9, 0              ;# zero out total for dif^2
.endm

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm

.macro compute_sum_sse
    ;# Compute the sum first.  Unpack to halfwords so that a signed
    ;# subtract can be used; only a halfword signed subtract is
    ;# available.  Do high, then low.
    vmrghb  v2, v7, v4
    vmrghb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    vmrglb  v2, v7, v4
    vmrglb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9
.endm
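
;# On exit from compute_sum_sse, v8 holds partial word sums of (src - ref)
;# and v9 holds partial word sums of (src - ref)^2; the absolute difference
;# is formed by OR'ing the two saturating unsigned subtracts before it is
;# squared and accumulated with vmsumubm.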

.macro variance_16 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

    .if \store_sum
    stw     r3, 0(r8)           ;# sum
    .endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
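
;# variance_16 consumes one 16-pixel row per iteration: the caller sets CTR
;# to the number of rows and passes DS = log2(width*height).  On exit r3
;# holds sse - ((sum*sum) >> DS); sse is stored through r7, and sum through
;# r8 when store_sum is set.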

.macro variance_8 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v0, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v0

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

    .if \store_sum
    stw     r3, 0(r8)           ;# sum
    .endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
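
;# variance_8 consumes two 8-pixel rows per iteration, packing the leading
;# 8 bytes of both rows into one vector with vmrghb (the same interleave is
;# applied to source and reference, so the differences still line up), so
;# the 16-wide compute_sum_sse handles both rows at once; the caller
;# therefore sets CTR to rows/2.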

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum

;# r3 return value
vp8_get8x8var_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, get8x8var_loop, 1
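
    ;# 4 iterations x 2 rows each = 8 rows; DS = 6 = log2(8*8).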

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum

;# r3 return value
vp8_get16x16var_ppc:

    prologue

    mtctr   r10

    variance_16 8, get16x16var_loop, 1
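
    ;# CTR was loaded from r10 (still 16 from the prologue), so 16 rows;
    ;# DS = 8 = log2(16*16).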

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse

;# r3 return value
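;#
;# Only the sse is needed here, so the loop below inlines just the sse half
;# of compute_sum_sse; no sum or variance is formed.  The sse is stored
;# through r7 and also returned in r3.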
vp8_mse16x16_ppc:
    prologue

    mtctr   r10

mse16x16_loop:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9

    bdnz    mse16x16_loop

    vsumsws v9, v9, v7

    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stw     r3, 0(r7)           ;# sse

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse

;# r3 return value
vp8_variance16x16_ppc:

    prologue

    mtctr   r10

    variance_16 8, variance16x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse

;# r3 return value
vp8_variance16x8_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_16 7, variance16x8_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse

;# r3 return value
vp8_variance8x16_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_8 7, variance8x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse

;# r3 return value
vp8_variance8x8_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, variance8x8_loop, 0

    epilogue

    blr

.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r10, 0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0,  0(r1)
    stw     r10, 4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm
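
;# transfer_4x4 gathers four 4-byte rows (stride \P apart) into the 16-byte
;# stack buffer at r1 so the whole 4x4 block can then be picked up with a
;# single lvx.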

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int source_stride
;# r5 unsigned char *ref_ptr
;# r6 int recon_stride
;# r7 unsigned int *sse

;# r3 return value
vp8_variance4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1
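
    ;# v4 now holds the packed 4x4 source block and v5 the packed 4x4
    ;# reference block, so one compute_sum_sse pass covers the whole block.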

    compute_sum_sse

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)

    epilogue

    blr