arm: remove duplicate functions
[libvpx.git] / vp8 / common / arm / armv6 / sixtappredict8x4_v6.asm
blob8b99394849081241d2d07781bd7fea97958cabad
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 EXPORT |vp8_sixtap_predict8x4_armv6| ; make the routine's symbol visible outside this file
14 AREA |.text|, CODE, READONLY ; name this block of code
15 ;-------------------------------------
16 ; r0 unsigned char *src_ptr,
17 ; r1 int src_pixels_per_line,
18 ; r2 int xoffset,
19 ; r3 int yoffset,
20 ; stack unsigned char *dst_ptr,
21 ; stack int dst_pitch
22 ;-------------------------------------
23 ;Note: the first pass stores its result transposed (8 lines x 9 columns of 16-bit values) on the stack. Temporary stack size is 184 bytes.
24 ;Each line is 20 bytes wide: 9 halfwords (18 bytes) plus 2 bytes of padding to keep lines 4-byte aligned. The second pass loads the data from the stack,
25 ;and stores its result transposed back into the destination.
; Two-pass 6-tap filter producing an 8-wide x 4-high block of bytes at dst_ptr.
;   Pass 1 filters 9 source rows (starting 2 rows above src_ptr) horizontally and writes
;   the 8 results of each row as halfwords, transposed, into a temporary stack buffer.
;   Pass 2 filters each 9-entry buffer line vertically and writes 4 bytes down one
;   destination column. A pass whose offset is 0 is replaced by a plain copy.
;   Stack after the prologue: [sp] = yoffset, [sp+4 .. sp+183] = temporary buffer,
;   then 9 saved registers (36 bytes), then the stack arguments dst_ptr and dst_pitch.
;   NOTE: sp itself is used as the moving read pointer in the second pass; every
;   "add sp" below is part of releasing the 184-byte reservation.
26 |vp8_sixtap_predict8x4_armv6| PROC
27 stmdb sp!, {r4 - r11, lr} ; save r4-r11 and lr (9 words = 36 bytes)
28 str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
30 cmp r2, #0 ;skip first_pass filter if xoffset=0
31 add lr, sp, #4 ;point to temporary buffer
32 beq skip_firstpass_filter
34 ;first-pass filter
35 ldr r12, _filter8_coeff_ ; r12 = address of filter8_coeff (read from the pointer word _filter8_coeff_)
36 sub r0, r0, r1, lsl #1 ; start 2 rows above src_ptr
38 add r2, r12, r2, lsl #4 ;calculate filter location (16 bytes per table entry)
39 add r0, r0, #3 ;adjust src only for loading convenience
41 ldr r3, [r2] ; load up packed filter coefficients
42 ldr r4, [r2, #4]
43 ldr r5, [r2, #8]
45 mov r2, #0x90000 ; height=9 is top part of counter
47 sub r1, r1, #8 ; r1 = pitch minus the 8 bytes r0 advances per row
; ---- per-row loop: r0 = row start + 3, lr = next free column slot in the buffer ----
49 |first_pass_hloop_v6|
50 ldrb r6, [r0, #-5] ; load source data
51 ldrb r7, [r0, #-4]
52 ldrb r8, [r0, #-3]
53 ldrb r9, [r0, #-2]
54 ldrb r10, [r0, #-1]
56 orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
; pack neighbouring pixels into halfword pairs so smuad/smlad can apply two taps at once
58 pkhbt r6, r6, r7, lsl #16 ; r7 | r6
59 pkhbt r7, r7, r8, lsl #16 ; r8 | r7
61 pkhbt r8, r8, r9, lsl #16 ; r9 | r8
62 pkhbt r9, r9, r10, lsl #16 ; r10 | r9
; ---- inner loop: each iteration produces two adjacent output pixels (r11, r12) ----
64 |first_pass_wloop_v6|
65 smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
66 smuad r12, r7, r3
68 ldrb r6, [r0], #1 ; fetch the next source byte, advance r0
70 smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
71 ldrb r7, [r0], #1 ; fetch one more source byte, advance r0
72 smlad r12, r9, r4, r12
74 pkhbt r10, r10, r6, lsl #16 ; r6 | r10
75 pkhbt r6, r6, r7, lsl #16 ; r7 | r6
76 smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
77 smlad r12, r6, r5, r12
79 sub r2, r2, #1 ; one fewer pixel pair left in this row (low byte of r2)
81 add r11, r11, #0x40 ; round_shift_and_clamp
82 tst r2, #0xff ; test loop counter
83 usat r11, #8, r11, asr #7 ; (sum + 64) >> 7, saturated to 0..255
84 add r12, r12, #0x40
85 strh r11, [lr], #20 ; result is transposed and stored: step 20 bytes to the next buffer line
86 usat r12, #8, r12, asr #7
88 strh r12, [lr], #20
; if more pixels remain in this row, slide the window two pixels to the right
; (r11/r12 are used as temporaries for the shuffle)
90 movne r11, r6
91 movne r12, r7
93 movne r6, r8
94 movne r7, r9
95 movne r8, r10
96 movne r9, r11
97 movne r10, r12
99 bne first_pass_wloop_v6
101 ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
102 ;;IF ARCHITECTURE=6
103 ;pld [src, ppl]
104 ;;pld [src, r9]
105 ;;ENDIF
107 subs r2, r2, #0x10000 ; decrement the row count held in the top half of r2; sets flags for the bne below
109 sub lr, lr, #158 ; 8 stores x 20 = 160 forward, minus 158 = net +2: next column of the buffer
111 add r0, r0, r1 ; move to next input line
113 bne first_pass_hloop_v6
115 ;second pass filter
116 secondpass_filter
117 ldr r3, [sp], #4 ; load back yoffset
118 ldr r0, [sp, #216] ; load dst address from stack 180+36
119 ldr r1, [sp, #220] ; load dst stride from stack 180+40
121 cmp r3, #0 ; yoffset=0: no vertical filtering needed
122 beq skip_secondpass_filter
124 ldr r12, _filter8_coeff_ ; r12 = address of filter8_coeff
125 add lr, r12, r3, lsl #4 ;calculate filter location
127 mov r2, #0x00080000 ; 8 buffer lines to process, counted in the top half of r2
129 ldr r3, [lr] ; load up packed filter coefficients
130 ldr r4, [lr, #4]
131 ldr r5, [lr, #8]
; r12/r11 hold the taps shifted by one position, used for the odd output of each pair
133 pkhbt r12, r4, r3 ; pack the filter differently
134 pkhbt r11, r5, r4
; ---- per-line loop: one buffer line (9 halfwords) yields 4 bytes of one dst column ----
136 second_pass_hloop_v6
137 ldr r6, [sp] ; load the data
138 ldr r7, [sp, #4]
140 orr r2, r2, #2 ; loop counter
; ---- inner loop: each iteration produces two vertically adjacent outputs (lr, r10) ----
142 second_pass_wloop_v6
143 smuad lr, r3, r6 ; apply filter
144 smulbt r10, r3, r6
146 ldr r8, [sp, #8]
148 smlad lr, r4, r7, lr
149 smladx r10, r12, r7, r10
151 ldrh r9, [sp, #12]
153 smlad lr, r5, r8, lr
154 smladx r10, r11, r8, r10
156 add sp, sp, #4 ; advance the read pointer by two halfwords
157 smlatb r10, r5, r9, r10
159 sub r2, r2, #1
161 add lr, lr, #0x40 ; round_shift_and_clamp
162 tst r2, #0xff
163 usat lr, #8, lr, asr #7
164 add r10, r10, #0x40
165 strb lr, [r0], r1 ; the result is transposed back and stored
166 usat r10, #8, r10, asr #7
168 strb r10, [r0],r1
; if another pair remains, reuse the data already loaded, shifted down by two entries
170 movne r6, r7
171 movne r7, r8
173 bne second_pass_wloop_v6
175 subs r2, r2, #0x10000 ; decrement the line count; sets flags for the bne below
176 add sp, sp, #12 ; update src for next loop (20-8)
177 sub r0, r0, r1, lsl #2 ; back up the 4 rows just written
178 add r0, r0, #1 ; and move to the next dst column
180 bne second_pass_hloop_v6
182 add sp, sp, #20 ; release the rest of the reservation: 184 - (4 + 8*20)
183 ldmia sp!, {r4 - r11, pc} ; restore registers and return
185 ;--------------------
; xoffset=0: copy 9 rows of 8 source bytes, unfiltered, into the transposed buffer
186 skip_firstpass_filter
187 sub r0, r0, r1, lsl #1 ; start 2 rows above src_ptr
188 sub r1, r1, #8 ; r1 = pitch minus the 8 bytes read per row
189 mov r2, #9 ; 9 rows
191 skip_firstpass_hloop
192 ldrb r4, [r0], #1 ; load data
193 subs r2, r2, #1 ; flags set here are consumed by the bne at the end of the loop
194 ldrb r5, [r0], #1
195 strh r4, [lr], #20 ; store it to immediate buffer
196 ldrb r6, [r0], #1 ; load data
197 strh r5, [lr], #20
198 ldrb r7, [r0], #1
199 strh r6, [lr], #20
200 ldrb r8, [r0], #1
201 strh r7, [lr], #20
202 ldrb r9, [r0], #1
203 strh r8, [lr], #20
204 ldrb r10, [r0], #1
205 strh r9, [lr], #20
206 ldrb r11, [r0], #1
207 strh r10, [lr], #20
208 add r0, r0, r1 ; move to next input line
209 strh r11, [lr], #20
211 sub lr, lr, #158 ; move over to next column
212 bne skip_firstpass_hloop
214 b secondpass_filter
216 ;--------------------
; yoffset=0: copy entries 2..5 of each buffer line straight to one dst column
217 skip_secondpass_filter
218 mov r2, #8 ; 8 buffer lines = 8 dst columns
219 add sp, sp, #4 ;start from src[0] instead of src[-2]
221 skip_secondpass_hloop
222 ldr r6, [sp], #4 ; two halfword entries per word
223 subs r2, r2, #1 ; flags set here are consumed by the bne at the end of the loop
224 ldr r8, [sp], #4
226 mov r7, r6, lsr #16 ; unpack
227 strb r6, [r0], r1
228 mov r9, r8, lsr #16
229 strb r7, [r0], r1
230 add sp, sp, #12 ; 20-8
231 strb r8, [r0], r1
232 strb r9, [r0], r1
234 sub r0, r0, r1, lsl #2 ; back up the 4 rows just written
235 add r0, r0, #1 ; and move to the next dst column
237 bne skip_secondpass_hloop
239 add sp, sp, #16 ; 180 - (160 +4)
241 ldmia sp!, {r4 - r11, pc} ; restore registers and return
243 ENDP
245 ;-----------------
; NOTE(review): the code visible in this file only reads this table; READWRITE may be
; unnecessary, but changing the attribute affects section placement - confirm before changing.
246 AREA subpelfilters8_dat, DATA, READWRITE ;read/write by default
247 ;Data section named subpelfilters8_dat. The DCD directives below define 1 pointer word plus 32 coefficient words.
248 ;Each DCD value is one 32-bit word. Label filter8_coeff can be used to access the data.
249 ;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
; _filter8_coeff_ holds the address of the table; the routine loads it to locate filter8_coeff.
250 _filter8_coeff_
251 DCD filter8_coeff
; One 16-byte entry per offset value 0..7 (the routine indexes with offset << 4).
; Each word packs two signed 16-bit taps: low halfword = even tap, high halfword = odd tap.
; The fourth word of every entry is zero padding.
252 filter8_coeff
253 DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
254 DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
255 DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
256 DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
257 DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
258 DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
259 DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
260 DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
; The same taps in decimal, six per entry, for reference (not assembled):
262 ;DCD 0, 0, 128, 0, 0, 0
263 ;DCD 0, -6, 123, 12, -1, 0
264 ;DCD 2, -11, 108, 36, -8, 1
265 ;DCD 0, -9, 93, 50, -6, 0
266 ;DCD 3, -16, 77, 77, -16, 3
267 ;DCD 0, -6, 50, 93, -9, 0
268 ;DCD 1, -8, 36, 108, -11, 2
269 ;DCD 0, -1, 12, 123, -6, 0