;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    .globl vp8_sub_pixel_variance4x4_ppc
    .globl vp8_sub_pixel_variance8x8_ppc
    .globl vp8_sub_pixel_variance8x16_ppc
    .globl vp8_sub_pixel_variance16x8_ppc
    .globl vp8_sub_pixel_variance16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

.macro load_vfilter V0, V1
    load_c \V0, vfilter_b, r6, r12, r10

    addi    r6,  r6, 16
    lvx     \V1, r6, r10
.endm

.macro HProlog jump_label
    ;# load up horizontal filter
    slwi.   r5, r5, 4           ;# index into horizontal filter array

    ;# index to the next set of vectors in the row.
    li      r10, 16

    ;# downshift by 7 (divide by 128) at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq     \jump_label

    load_c v20, hfilter_b, r5, r12, r0

    ;# setup constants
    ;#  v28 permutation value for reordering the hfilter output
    load_c v28, b_hperm_b, 0, r12, r0

    ;# index to the next set of vectors in the row.
    li      r12, 32

    ;# rounding added in on the multiply
    vspltisw v21, 8
    vspltisw v18, 3
    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040

    slwi.   r6, r6, 5           ;# index into vertical filter array
.endm
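
;# (Both slwi. instructions above use the record form: the first sets CR0
;#  from xoffset so the beq can skip the horizontal pass entirely, and the
;#  trailing one sets CR0 from yoffset so a beq after each hfilter block
;#  can skip the vertical pass.)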

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output

.macro hfilter_8 V, hp, lp, increment_counter
    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 9 bytes wide, output is 8 bytes.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17

    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A

    vmsummbm v24, v20, v24, v18
    vmsummbm v25, v20, v25, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)

    vsrh    v24, v24, v19       ;# divide by 128

    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
.endm
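
;# For reference, a rough C sketch of what hfilter_8 computes (names are
;#  illustrative, not from this source): a 2-tap bilinear filter whose
;#  taps sum to 128, with the +64 rounding bias folded in via v18.
;#
;#   for (i = 0; i < 8; i++)
;#       out[i] = (src[i] * f0 + src[i + 1] * f1 + 64) >> 7;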

.macro vfilter_16 P0 P1
    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
    vadduhm v22, v18, v22
    vmuloub v23, \P0, v20
    vadduhm v23, v18, v23

    vmuleub v24, \P1, v21
    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
    vmuloub v25, \P1, v21
    vadduhm v23, v23, v25       ;# Ro = odds

    vsrh    v22, v22, v19       ;# divide by 128
    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
    vmrghh  \P0, v22, v23       ;# interleave back to 16-bit results in order
    vmrglh  v23, v22, v23
    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
.endm
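
;# Rough C sketch of this vertical pass (names illustrative): each byte
;#  lane of two adjacent rows is blended with taps that sum to 128.
;#
;#   for (i = 0; i < 16; i++)
;#       out[i] = (row0[i] * g0 + row1[i] * g1 + 64) >> 7;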

.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
    ;# Compute the sum first. Unpack so that the signed subtract
    ;#  can be used; only a halfword signed subtract is available.
    ;#  Do high, then low.
    vmrghb  \t1, \z0, \src
    vmrghb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    vmrglb  \t1, \z0, \src
    vmrglb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    ;# Now compute sse.
    vsububs \t1, \src, \ref
    vsububs \t2, \ref, \src
    vor     \t1, \t1, \t2

    vmsumubm \sse, \t1, \t1, \sse
.endm
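
;# Rough C sketch of one compute_sum_sse invocation over 16 pixels
;#  (names illustrative; the asm takes |diff| before squaring, which
;#  gives the same product):
;#
;#   for (i = 0; i < 16; i++) {
;#       int d = src[i] - ref[i];
;#       sum += d;
;#       sse += d * d;
;#   }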

.macro variance_final sum, sse, z0, DS
    vsumsws \sum, \sum, \z0
    vsumsws \sse, \sse, \z0

    stvx    \sum, 0, r1
    lwz     r3, 12(r1)

    stvx    \sse, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r9)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
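
;# Rough C sketch (DS is log2 of the block's pixel count: 4, 6, 7, or 8
;#  here), i.e. the standard variance = SSE minus the squared-mean term:
;#
;#   *sse = sse;
;#   return sse - ((sum * sum) >> DS);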

.macro compute_sum_sse_16 V, increment_counter
    load_and_align_16 v16, r7, r8, \increment_counter
    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
.endm

.macro load_and_align_16 V, R, P, increment_counter
    lvsl    v17,  0, \R         ;# permute value for alignment

    ;# output is 16 bytes; the unaligned input can span two vectors.
    lvx     v21,   0, \R
    lvx     v22, r10, \R

.if \increment_counter
    add     \R, \R, \P
.endif

    vperm   \V, v21, v22, v17
.endm
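
;# (Standard AltiVec unaligned-load idiom: lvsl builds a permute vector
;#  from the low address bits, the two lvx fetch the aligned quadwords
;#  on either side of the address, and vperm shifts the wanted 16 bytes
;#  into place.)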

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
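;#
;# In C terms, all five entry points below share roughly this signature
;#  (parameter names taken from the register comments above):
;#
;#   unsigned int
;#   vp8_sub_pixel_variance4x4_ppc(unsigned char *src_ptr,
;#                                 int src_pixels_per_line,
;#                                 int xoffset, int yoffset,
;#                                 unsigned char *dst_ptr,
;#                                 int dst_pixels_per_line,
;#                                 unsigned int *sse);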
vp8_sub_pixel_variance4x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_4x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_4x4_b

    hfilter_8 v4, v10, v11, 0

    b   second_pass_4x4_b

second_pass_4x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0, r3, r4, 1
    load_and_align_16  v1, r3, r4, 1
    load_and_align_16  v2, r3, r4, 1
    load_and_align_16  v3, r3, r4, 1
    load_and_align_16  v4, r3, r4, 0

second_pass_4x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4

compute_sum_sse_4x4_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    load_and_align_16 v4, r7, r8, 1
    load_and_align_16 v5, r7, r8, 1
    load_and_align_16 v6, r7, r8, 1
    load_and_align_16 v7, r7, r8, 1

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3

    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_c v10, b_hilo_b, 0, r12, r0

    vperm   v0, v0, v1, v10
    vperm   v1, v2, v3, v10

    compute_sum_sse v0, v1, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 4

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff0
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_8x8_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1
    hfilter_8 v4, v10, v11, 1
    hfilter_8 v5, v10, v11, 1
    hfilter_8 v6, v10, v11, 1
    hfilter_8 v7, v10, v11, 1

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_8x8_b

    hfilter_8 v8, v10, v11, 0

    b   second_pass_8x8_b

second_pass_8x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0, r3, r4, 1
    load_and_align_16  v1, r3, r4, 1
    load_and_align_16  v2, r3, r4, 1
    load_and_align_16  v3, r3, r4, 1
    load_and_align_16  v4, r3, r4, 1
    load_and_align_16  v5, r3, r4, 1
    load_and_align_16  v6, r3, r4, 1
    load_and_align_16  v7, r3, r4, 1
    load_and_align_16  v8, r3, r4, 0

    beq     compute_sum_sse_8x8_b

second_pass_8x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4
    vfilter_16 v4, v5
    vfilter_16 v5, v6
    vfilter_16 v6, v7
    vfilter_16 v7, v8

compute_sum_sse_8x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3
    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_and_align_16  v4, r7, r8, 1
    load_and_align_16  v5, r7, r8, 1
    load_and_align_16  v6, r7, r8, 1
    load_and_align_16  v7, r7, r8, 1
    load_and_align_16  v8, r7, r8, 1
    load_and_align_16  v9, r7, r8, 1
    load_and_align_16  v10, r7, r8, 1
    load_and_align_16  v11, r7, r8, 0

    vmrghb  v4, v4, v5
    vmrghb  v5, v6, v7
    vmrghb  v6, v8, v9
    vmrghb  v7, v10, v11

    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
    compute_sum_sse v3, v7, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 6

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfffc
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_8x16_pre_copy_b

    ;# Load up permutation constants
    load_c v29, b_0123_b, 0, r12, r0
    load_c v30, b_4567_b, 0, r12, r0

    hfilter_8 v0,  v29, v30, 1
    hfilter_8 v1,  v29, v30, 1
    hfilter_8 v2,  v29, v30, 1
    hfilter_8 v3,  v29, v30, 1
    hfilter_8 v4,  v29, v30, 1
    hfilter_8 v5,  v29, v30, 1
    hfilter_8 v6,  v29, v30, 1
    hfilter_8 v7,  v29, v30, 1
    hfilter_8 v8,  v29, v30, 1
    hfilter_8 v9,  v29, v30, 1
    hfilter_8 v10, v29, v30, 1
    hfilter_8 v11, v29, v30, 1
    hfilter_8 v12, v29, v30, 1
    hfilter_8 v13, v29, v30, 1
    hfilter_8 v14, v29, v30, 1
    hfilter_8 v15, v29, v30, 1

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_8x16_b

    hfilter_8 v16, v29, v30, 0

    b   second_pass_8x16_b

second_pass_8x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  r3, r4, 1
    load_and_align_16  v1,  r3, r4, 1
    load_and_align_16  v2,  r3, r4, 1
    load_and_align_16  v3,  r3, r4, 1
    load_and_align_16  v4,  r3, r4, 1
    load_and_align_16  v5,  r3, r4, 1
    load_and_align_16  v6,  r3, r4, 1
    load_and_align_16  v7,  r3, r4, 1
    load_and_align_16  v8,  r3, r4, 1
    load_and_align_16  v9,  r3, r4, 1
    load_and_align_16  v10, r3, r4, 1
    load_and_align_16  v11, r3, r4, 1
    load_and_align_16  v12, r3, r4, 1
    load_and_align_16  v13, r3, r4, 1
    load_and_align_16  v14, r3, r4, 1
    load_and_align_16  v15, r3, r4, 1
    load_and_align_16  v16, r3, r4, 0

    beq     compute_sum_sse_8x16_b

second_pass_8x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_8x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3
    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7
    vmrghb  v4, v8, v9
    vmrghb  v5, v10, v11
    vmrghb  v6, v12, v13
    vmrghb  v7, v14, v15

    load_and_align_16  v8,  r7, r8, 1
    load_and_align_16  v9,  r7, r8, 1
    load_and_align_16  v10, r7, r8, 1
    load_and_align_16  v11, r7, r8, 1
    load_and_align_16  v12, r7, r8, 1
    load_and_align_16  v13, r7, r8, 1
    load_and_align_16  v14, r7, r8, 1
    load_and_align_16  v15, r7, r8, 1

    vmrghb  v8,  v8, v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
    compute_sum_sse v3, v11, v18, v19, v20, v21, v23

    load_and_align_16  v8,  r7, r8, 1
    load_and_align_16  v9,  r7, r8, 1
    load_and_align_16  v10, r7, r8, 1
    load_and_align_16  v11, r7, r8, 1
    load_and_align_16  v12, r7, r8, 1
    load_and_align_16  v13, r7, r8, 1
    load_and_align_16  v14, r7, r8, 1
    load_and_align_16  v15, r7, r8, 0

    vmrghb  v8,  v8, v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
    compute_sum_sse v7, v11, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output

.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
    vsrh    v25, v25, v19

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm
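
;# (The four vsldoi shifts feed the same per-word taps with inputs at
;#  byte offsets 0..3, covering all 16 output phases; per pixel the math
;#  matches the hfilter_8 sketch above, just over 16 outputs instead
;#  of 8.)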

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x8_pre_copy_b

    hfilter_16 v0, 1
    hfilter_16 v1, 1
    hfilter_16 v2, 1
    hfilter_16 v3, 1
    hfilter_16 v4, 1
    hfilter_16 v5, 1
    hfilter_16 v6, 1
    hfilter_16 v7, 1

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_16x8_b

    hfilter_16 v8, 0

    b   second_pass_16x8_b

second_pass_16x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0, r3, r4, 1
    load_and_align_16  v1, r3, r4, 1
    load_and_align_16  v2, r3, r4, 1
    load_and_align_16  v3, r3, r4, 1
    load_and_align_16  v4, r3, r4, 1
    load_and_align_16  v5, r3, r4, 1
    load_and_align_16  v6, r3, r4, 1
    load_and_align_16  v7, r3, r4, 1
    load_and_align_16  v8, r3, r4, 1

    beq     compute_sum_sse_16x8_b

second_pass_16x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4
    vfilter_16 v4, v5
    vfilter_16 v5, v6
    vfilter_16 v6, v7
    vfilter_16 v7, v8

compute_sum_sse_16x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0, 1
    compute_sum_sse_16 v1, 1
    compute_sum_sse_16 v2, 1
    compute_sum_sse_16 v3, 1
    compute_sum_sse_16 v4, 1
    compute_sum_sse_16 v5, 1
    compute_sum_sse_16 v6, 1
    compute_sum_sse_16 v7, 0

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x16_pre_copy_b

    hfilter_16 v0,  1
    hfilter_16 v1,  1
    hfilter_16 v2,  1
    hfilter_16 v3,  1
    hfilter_16 v4,  1
    hfilter_16 v5,  1
    hfilter_16 v6,  1
    hfilter_16 v7,  1
    hfilter_16 v8,  1
    hfilter_16 v9,  1
    hfilter_16 v10, 1
    hfilter_16 v11, 1
    hfilter_16 v12, 1
    hfilter_16 v13, 1
    hfilter_16 v14, 1
    hfilter_16 v15, 1

    ;# Finished filtering main horizontal block. If there is no
    ;#  vertical filtering, jump to storing the data. Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_16x16_b

    hfilter_16 v16, 0

    b   second_pass_16x16_b

second_pass_16x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  r3, r4, 1
    load_and_align_16  v1,  r3, r4, 1
    load_and_align_16  v2,  r3, r4, 1
    load_and_align_16  v3,  r3, r4, 1
    load_and_align_16  v4,  r3, r4, 1
    load_and_align_16  v5,  r3, r4, 1
    load_and_align_16  v6,  r3, r4, 1
    load_and_align_16  v7,  r3, r4, 1
    load_and_align_16  v8,  r3, r4, 1
    load_and_align_16  v9,  r3, r4, 1
    load_and_align_16  v10, r3, r4, 1
    load_and_align_16  v11, r3, r4, 1
    load_and_align_16  v12, r3, r4, 1
    load_and_align_16  v13, r3, r4, 1
    load_and_align_16  v14, r3, r4, 1
    load_and_align_16  v15, r3, r4, 1
    load_and_align_16  v16, r3, r4, 0

    beq     compute_sum_sse_16x16_b

second_pass_16x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_16x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0,  1
    compute_sum_sse_16 v1,  1
    compute_sum_sse_16 v2,  1
    compute_sum_sse_16 v3,  1
    compute_sum_sse_16 v4,  1
    compute_sum_sse_16 v5,  1
    compute_sum_sse_16 v6,  1
    compute_sum_sse_16 v7,  1
    compute_sum_sse_16 v8,  1
    compute_sum_sse_16 v9,  1
    compute_sum_sse_16 v10, 1
    compute_sum_sse_16 v11, 1
    compute_sum_sse_16 v12, 1
    compute_sum_sse_16 v13, 1
    compute_sum_sse_16 v14, 1
    compute_sum_sse_16 v15, 0

    variance_final v18, v19, v23, 8

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

    .align 4
hfilter_b:
    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
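
;# Each 16-byte row holds the two bilinear taps {f0, f1, 0, 0} replicated
;#  per word, with f0 + f1 = 128; HProlog selects the row as xoffset * 16.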

    .align 4
vfilter_b:
    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
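
;# Rows come in pairs: one splats the first tap, the next splats the
;#  second, with the pair summing to 128; load_vfilter selects the pair
;#  as yoffset * 32.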

    .align 4
b_hperm_b:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
b_0123_b:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
b_4567_b:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

b_hilo_b:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23