From cb14764fab88b5b28ba09fa9490bd72c017cb7c2 Mon Sep 17 00:00:00 2001 From: Tero Rintaluoma Date: Wed, 9 Feb 2011 09:34:56 -0500 Subject: [PATCH] Adds armv6 optimized variance calculation Adds vp8_sub_pixel_variance16x16_armv6 function to encoder. Integrates ARMv6 optimized bilinear interpolations from vp8/common/arm/armv6 and adds new assembly file for variance16x16 calculation. - vp8_filter_block2d_bil_first_pass_armv6 (integrated) - vp8_filter_block2d_bil_second_pass_armv6 (integrated) - vp8_variance16x16_armv6 (new) - bilinearfilter_arm.h (new) Change-Id: I18a8331ce7d031ceedd6cd415ecacb0c8f3392db --- vp8/common/arm/bilinearfilter_arm.c | 21 +--- vp8/common/arm/bilinearfilter_arm.h | 35 ++++++ vp8/encoder/arm/arm_csystemdependent.c | 8 +- vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm | 147 ++++++++++++++++++++++ vp8/encoder/arm/variance_arm.c | 34 +++++ vp8/encoder/arm/variance_arm.h | 17 +++ vp8/vp8_common.mk | 1 + vp8/vp8cx_arm.mk | 4 +- 8 files changed, 242 insertions(+), 25 deletions(-) create mode 100644 vp8/common/arm/bilinearfilter_arm.h create mode 100644 vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c index 961d142c..6a46ef68 100644 --- a/vp8/common/arm/bilinearfilter_arm.c +++ b/vp8/common/arm/bilinearfilter_arm.c @@ -12,26 +12,7 @@ #include #include "filter.h" #include "subpixel.h" - -extern void vp8_filter_block2d_bil_first_pass_armv6 -( - unsigned char *src_ptr, - unsigned short *dst_ptr, - unsigned int src_pitch, - unsigned int height, - unsigned int width, - const short *vp8_filter -); - -extern void vp8_filter_block2d_bil_second_pass_armv6 -( - unsigned short *src_ptr, - unsigned char *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const short *vp8_filter -); +#include "arm/bilinearfilter_arm.h" void vp8_filter_block2d_bil_armv6 ( diff --git a/vp8/common/arm/bilinearfilter_arm.h b/vp8/common/arm/bilinearfilter_arm.h new file mode 100644 index 00000000..b7155d3f --- /dev/null +++ b/vp8/common/arm/bilinearfilter_arm.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef BILINEARFILTER_ARM_H +#define BILINEARFILTER_ARM_H + +extern void vp8_filter_block2d_bil_first_pass_armv6 +( + const unsigned char *src_ptr, + unsigned short *dst_ptr, + unsigned int src_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +); + +extern void vp8_filter_block2d_bil_second_pass_armv6 +( + const unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp8_filter +); + +#endif /* BILINEARFILTER_ARM_H */ diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index a1f11026..6c17a798 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -38,14 +38,14 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi) /*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c; cpi->rtcd.variance.var8x8 = vp8_variance8x8_c; cpi->rtcd.variance.var8x16 = vp8_variance8x16_c; - cpi->rtcd.variance.var16x8 = vp8_variance16x8_c; - cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;*/ + cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;*/ + cpi->rtcd.variance.var16x16 = vp8_variance16x16_armv6; /*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c; cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c; cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c; - cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c; - cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;*/ + cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;*/ + cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_armv6; /*cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c; cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/ diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm new file mode 100644 index 00000000..8d7258af --- /dev/null +++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm @@ -0,0 +1,147 @@ +; +; Copyright (c) 2011 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_variance16x16_armv6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|vp8_variance16x16_armv6| PROC + + stmfd sp!, {r4-r12, lr} + mov r12, #16 ; set loop counter to 16 (=block height) + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + +loop + ; 1st 4 pixels + ldr r4, [r0, #0x0] ; load 4 src pixels + ldr r5, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #0x4] ; load 4 src pixels + ldr r5, [r2, #0x4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #0x8] ; load 4 src pixels + ldr r5, [r2, #0x8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #0xc] ; load 4 src pixels + ldr r5, [r2, #0xc] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; substract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #0x28] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + + END diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c index b40c0482..9737bef4 100644 --- a/vp8/encoder/arm/variance_arm.c +++ b/vp8/encoder/arm/variance_arm.c @@ -10,6 +10,40 @@ #include "vpx_config.h" #include "variance.h" +#include "filter.h" +#include "arm/bilinearfilter_arm.h" + +#if HAVE_ARMV6 + +unsigned int vp8_sub_pixel_variance16x16_armv6 +( + const unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const unsigned char *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse +) +{ + unsigned short first_pass[36*16]; + unsigned char second_pass[20*16]; + const short *HFilter, *VFilter; + + HFilter = vp8_bilinear_filters[xoffset]; + VFilter = vp8_bilinear_filters[yoffset]; + + vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, + src_pixels_per_line, + 17, 16, HFilter); + vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, + 16, 16, 16, VFilter); + + return vp8_variance16x16_armv6(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); +} + +#endif #if HAVE_ARMV7 diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h index 3cbacfac..06d72873 100644 --- a/vp8/encoder/arm/variance_arm.h +++ b/vp8/encoder/arm/variance_arm.h @@ -12,6 +12,23 @@ #ifndef VARIANCE_ARM_H #define VARIANCE_ARM_H +#if HAVE_ARMV6 + +extern prototype_variance(vp8_variance16x16_armv6); +extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_variance_subpixvar16x16 +#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6 + +#undef vp8_variance_var16x16 +#define vp8_variance_var16x16 vp8_variance16x16_armv6 + +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_ARMV6 */ + #if HAVE_ARMV7 extern prototype_sad(vp8_sad4x4_neon); extern prototype_sad(vp8_sad8x8_neon); diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 25909456..af07618a 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -116,6 +116,7 @@ VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c # common (c) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.c +VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.h VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/filter_arm.c VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/loopfilter_arm.c VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/reconintra_arm.c diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index b23ac96c..abc5dc80 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -17,9 +17,10 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c -VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/variance_arm.c VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.h VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/boolhuff.c @@ -33,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar #File list for armv6 # encoder +VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM) VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM) #File list for neon -- 2.11.4.GIT