2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 ; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
; Section selection first, then the four entry points published to the linker.
; The code below is assembled into a read-only code area named |.text|.
    AREA    |.text|, CODE, READONLY

    EXPORT  |vp8_short_idct4x4llm_1_v6|
    EXPORT  |vp8_short_idct4x4llm_v6|
    EXPORT  |vp8_short_idct4x4llm_v6_scott|
    EXPORT  |vp8_short_idct4x4llm_v6_dual|
;********************************************************************************
;*  void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
;*
;*  DC-only 4x4 inverse transform: all 16 outputs are set to
;*  (input[0] + 4) >> 3.
;*
;*      r0  INT16 * input    only input[0] is read
;*      r1  INT16 * output   four rows of four INT16, one STRD per row
;*      r2  INT32 pitch      byte distance between output rows (used as the
;*                           STRD post-index increment)
;*
;*  Clobbers r0, r1.  r4/r5 are saved and restored on the stack.
;*  NOTE(review): STRD may require each output row to be doubleword (or at
;*  least word) aligned on the target core - confirm against callers.
;********************************************************************************

|vp8_short_idct4x4llm_1_v6| PROC
    ldrsh   r0, [r0]                    ; r0 = input[0], sign-extended
    stmdb   sp!, {r4, r5, lr}           ; free r4:r5, the even/odd pair used by STRD

    add     r0, r0, #4                  ; rounding bias applied before the shift
    mov     r0, r0, asr #3              ; r0 = (input[0] + 4) >> 3
    pkhbt   r4, r0, r0, lsl #16         ; r4 = dc | dc  (two INT16 lanes)
    mov     r5, r4                      ; r5 = dc | dc, so r4:r5 is one full row

    strd    r4, [r1], r2                ; row 0, then output += pitch
    strd    r4, [r1], r2                ; row 1, then output += pitch
    strd    r4, [r1], r2                ; row 2, then output += pitch
    strd    r4, [r1]                    ; row 3

    ldmia   sp!, {r4, r5, pc}           ; restore r4, r5 and return
    ENDP        ; |vp8_short_idct4x4llm_1_v6|
44 ;********************************************************************************
45 ;********************************************************************************
46 ;********************************************************************************
;********************************************************************************
;*  void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
;*
;*  4x4 inverse transform, one INT16 at a time, in two passes of four
;*  iterations each:
;*    pass 1 (loop1)  reads one input column per iteration (rows are 8 bytes
;*                    apart in the input) and writes the unscaled butterfly
;*                    results into the matching output column;
;*    pass 2 (loop2)  re-reads each output row in place, applies the same
;*                    butterfly and stores (x + 4) >> 3.
;*
;*      r0  INT16 * input    16 coefficients, row stride fixed at 8 bytes
;*      r1  INT16 * output   4 rows of 4 INT16
;*      r2  INT32 pitch      byte distance between output rows
;*
;*  Register roles:
;*      r4  = 0x4E7B  cospi8sqrt2minus1
;*      r5  = 0x8A8C  sinpi8sqrt2
;*      r6  = loop counter (4 per pass)
;*      r0, r3, r7-r12 = scratch
;*  r4-r11 are saved and restored on the stack.
;********************************************************************************

|vp8_short_idct4x4llm_v6| PROC
    stmdb   sp!, {r4-r11, lr}           ; save callee-saved registers + return address

    mov     r4, #0x00004E00
    orr     r4, r4, #0x0000007B         ; r4 = cospi8sqrt2minus1
    mov     r5, #0x00008A00
    orr     r5, r5, #0x0000008C         ; r5 = sinpi8sqrt2

    ; ---- pass 1: one column per iteration -------------------------------------
    mov     r6, #4                      ; i = 4 columns
loop1
    ldrsh   r12, [r0, #8]               ; r12 = input[4]
    ldrsh   r3, [r0, #24]               ; r3  = input[12]
    ldrsh   r8, [r0, #16]               ; r8  = input[8]
    ldrsh   r7, [r0], #0x2              ; r7  = input[0]; input++ (next column)
    smulwb  r10, r5, r12                ; temp1 = ([4] * sinpi8sqrt2) >> 16
    smulwb  r11, r4, r3                 ; ([12] * cospi8sqrt2minus1) >> 16
    add     r9, r7, r8                  ; a1 = [0] + [8]
    sub     r7, r7, r8                  ; b1 = [0] - [8]
    add     r11, r3, r11                ; temp2 = [12] + ([12] * cospi8sqrt2minus1 >> 16)
    rsb     r11, r11, r10               ; c1 = temp1 - temp2
    smulwb  r3, r5, r3                  ; temp2 = ([12] * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r12                ; ([4] * cospi8sqrt2minus1) >> 16
    add     r8, r7, r11                 ; b1 + c1
    strh    r8, [r1, r2]                ; out[pitch] = b1 + c1
    sub     r7, r7, r11                 ; b1 - c1
    add     r10, r12, r10               ; temp1 = [4] + ([4] * cospi8sqrt2minus1 >> 16)
    add     r3, r10, r3                 ; d1 = temp1 + temp2
    add     r10, r9, r3                 ; a1 + d1
    sub     r3, r9, r3                  ; a1 - d1
    add     r8, r2, r2                  ; r8 = pitch * 2
    strh    r7, [r1, r8]                ; out[pitch*2] = b1 - c1
    add     r7, r2, r2, lsl #1          ; r7 = pitch * 3
    strh    r3, [r1, r7]                ; out[pitch*3] = a1 - d1
    subs    r6, r6, #1                  ; i-- (flags consumed by the bne below;
                                        ;      the strh in between leaves them alone)
    strh    r10, [r1], #0x2             ; out[0] = a1 + d1; output++ (next column)
    bne     loop1                       ; more columns to do

    sub     r1, r1, #8                  ; undo the four 2-byte advances: back to row 0

    ; ---- pass 2: one row per iteration, input = pass-1 output -----------------
    mov     r6, #4                      ; i = 4 rows
loop2
    ldrsh   r11, [r1, #2]               ; r11 = row[1]
    ldrsh   r8, [r1, #6]                ; r8  = row[3]
    ldrsh   r3, [r1, #4]                ; r3  = row[2]
    ldrsh   r0, [r1]                    ; r0  = row[0]
    smulwb  r9, r5, r11                 ; temp1 = ([1] * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r8                 ; ([3] * cospi8sqrt2minus1) >> 16
    add     r7, r0, r3                  ; a1 = [0] + [2]
    sub     r0, r0, r3                  ; b1 = [0] - [2]
    add     r10, r8, r10                ; temp2 = [3] + ([3] * cospi8sqrt2minus1 >> 16)
    rsb     r9, r10, r9                 ; c1 = temp1 - temp2
    smulwb  r8, r5, r8                  ; temp2 = ([3] * sinpi8sqrt2) >> 16
    smulwb  r10, r4, r11                ; ([1] * cospi8sqrt2minus1) >> 16
    add     r3, r0, r9                  ; b1 + c1
    add     r3, r3, #4                  ; + rounding bias
    add     r10, r11, r10               ; temp1 = [1] + ([1] * cospi8sqrt2minus1 >> 16)
    mov     r3, r3, asr #3              ; (b1 + c1 + 4) >> 3
    strh    r3, [r1, #2]                ; row[1]
    add     r10, r10, r8                ; d1 = temp1 + temp2
    add     r3, r7, r10                 ; a1 + d1
    add     r3, r3, #4                  ; + rounding bias
    sub     r7, r7, r10                 ; a1 - d1
    add     r7, r7, #4                  ; + rounding bias
    mov     r3, r3, asr #3              ; (a1 + d1 + 4) >> 3
    mov     r7, r7, asr #3              ; (a1 - d1 + 4) >> 3
    strh    r7, [r1, #6]                ; row[3]
    sub     r0, r0, r9                  ; b1 - c1
    add     r0, r0, #4                  ; + rounding bias
    subs    r6, r6, #1                  ; i-- (flags consumed by the bne below)
    mov     r0, r0, asr #3              ; (b1 - c1 + 4) >> 3
    strh    r0, [r1, #4]                ; row[2]
    strh    r3, [r1], r2                ; row[0]; output += pitch (next row)
    bne     loop2                       ; more rows to do

    ldmia   sp!, {r4 - r11, pc}         ; restore registers and return
    ENDP        ; |vp8_short_idct4x4llm_v6|
136 ;********************************************************************************
137 ;********************************************************************************
138 ;********************************************************************************
;********************************************************************************
;*  void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
;*
;*      r0  INT16 * input    read as packed INT16 pairs with LDR
;*      r1  INT16 * output   written as packed INT16 pairs with STR
;*      r2  INT32 pitch      byte offset used for the second store
;*
;*  Register roles:
;*      r3  = 0x4E7B  cospi8sqrt2minus1
;*      r4  = 0x8A8C  sinpi8sqrt2
;*      r5  = loop counter
;*      r6-r12, r14 = scratch
;*  r4-r11 are saved and restored on the stack.
;*
;*  Loop 1 handles two columns per iteration, so two iterations cover the four
;*  columns; a third iteration would read past the 32-byte input block.
;*
;*  NOTE(review): this routine appears to be unfinished.  In loop 1 the packed
;*  "b" (r9) and "c" (r6) values are computed but never stored, only two rows
;*  are written ([r1] and [r1, pitch]), and the lane arithmetic does not match
;*  its own comments (see the notes inline).  Loop 2 has no body.  Do not rely
;*  on its output until that is resolved.
;********************************************************************************

|vp8_short_idct4x4llm_v6_scott| PROC
    stmdb   sp!, {r4 - r11, lr}         ; save callee-saved registers + return address

    mov     r3, #0x00004E00
    orr     r3, r3, #0x0000007B         ; r3 = cospi8sqrt2minus1
    mov     r4, #0x00008A00
    orr     r4, r4, #0x0000008C         ; r4 = sinpi8sqrt2

    mov     r5, #0x2                    ; i = 2: two columns per iteration

short_idct4x4llm_v6_scott_loop1
    ldr     r10, [r0, #(4*2)]           ; r10 = i5  | i4
    ldr     r11, [r0, #(12*2)]          ; r11 = i13 | i12

    smulwb  r6, r4, r10                 ; (ip[4]  * sinpi8sqrt2) >> 16
    smulwb  r7, r3, r11                 ; (ip[12] * cospi8sqrt2minus1) >> 16

    smulwb  r12, r3, r10                ; (ip[4]  * cospi8sqrt2minus1) >> 16
    smulwb  r14, r4, r11                ; (ip[12] * sinpi8sqrt2) >> 16

    add     r6, r6, r7                  ; low-lane partial "c"
                                        ; NOTE(review): this adds the two products;
                                        ; the old comment described a subtraction
    add     r12, r12, r14               ; low-lane partial "d"

    smulwt  r14, r4, r10                ; (ip[5]  * sinpi8sqrt2) >> 16
    smulwt  r7, r3, r11                 ; (ip[13] * cospi8sqrt2minus1) >> 16

    smulwt  r8, r3, r10                 ; (ip[5]  * cospi8sqrt2minus1) >> 16
    smulwt  r9, r4, r11                 ; (ip[13] * sinpi8sqrt2) >> 16

    add     r7, r14, r7                 ; high-lane partial "c"
    sub     r8, r8, r9                  ; high-lane partial "d"
                                        ; NOTE(review): subtracts here, adds in the
                                        ; low lane above - confirm which is intended

    pkhbt   r6, r6, r7, lsl #16         ; r6  = c(high) | c(low) partials
    pkhbt   r12, r12, r8, lsl #16       ; r12 = d(high) | d(low) partials

    usub16  r6, r6, r10                 ; r6  -= i5  | i4   (result is never used)
    uadd16  r12, r12, r11               ; r12 += i13 | i12  -> "d" pair

    ldr     r10, [r0, #0]               ; r10 = i1 | i0
    ldr     r11, [r0, #(8*2)]           ; r11 = i9 | i8

    uadd16  r8, r10, r11                ; a pair = i1 + i9 | i0 + i8
    usub16  r9, r10, r11                ; b pair = i1 - i9 | i0 - i8 (never used)

    uadd16  r7, r8, r12                 ; a + d pair
    usub16  r14, r8, r12                ; a - d pair

    str     r7, [r1]                    ; output row at offset 0     = a + d
    str     r14, [r1, r2]               ; output row at offset pitch = a - d

    add     r0, r0, #0x4                ; input  += 2 coefficients (next column pair)
    add     r1, r1, #0x4                ; output += 2 coefficients (next column pair)

    subs    r5, r5, #0x1                ; i--
    bne     short_idct4x4llm_v6_scott_loop1

    sub     r1, r1, #8                  ; undo the two 4-byte advances: back to row 0
    mov     r0, r1                      ; input = output for the (empty) second pass

short_idct4x4llm_v6_scott_loop2
    ; No body.  Z is still set from the loop-1 exit (SUB/MOV above do not
    ; update the flags), so this branch is never taken.
    bne     short_idct4x4llm_v6_scott_loop2

    ldmia   sp!, {r4 - r11, pc}         ; restore registers and return
    ENDP        ; |vp8_short_idct4x4llm_v6_scott|
221 ;********************************************************************************
222 ;********************************************************************************
223 ;********************************************************************************
;********************************************************************************
;*  void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
;*
;*  4x4 inverse transform working on two INT16 lanes per register:
;*    pass 1 (loop1_dual)  two columns per iteration, two iterations; results
;*                         are stored unscaled into the output block;
;*    pass 2 (loop2_dual)  two rows per iteration (high lane = first row,
;*                         low lane = second row), two iterations; stores
;*                         (x + 4) >> 3.
;*
;*      r0  INT16 * input    16 coefficients, row stride fixed at 8 bytes
;*      r1  INT16 * output   4 rows of 4 INT16
;*      r2  INT32 pitch      byte distance between output rows
;*
;*  Register roles:
;*      r3  = 0x4E7B  cospi8sqrt2minus1
;*      r4  = 0x8A8C  sinpi8sqrt2
;*      r5  = loop counter (2 per pass)
;*      r0 (pass 2 pointer), r1 (pass 2 scratch), r6-r12, r14 = scratch
;*  r4-r11 are saved and restored on the stack.
;*
;*  NOTE(review): input and output are accessed with word LDR/STR, so they
;*  presumably need to be word aligned - confirm against callers.
;********************************************************************************

|vp8_short_idct4x4llm_v6_dual| PROC
    stmdb   sp!, {r4-r11, lr}           ; save callee-saved registers + return address
    mov     r3, #0x00004E00
    orr     r3, r3, #0x0000007B         ; r3 = cospi8sqrt2minus1
    mov     r4, #0x00008A00
    orr     r4, r4, #0x0000008C         ; r4 = sinpi8sqrt2

    ; ---- pass 1: two columns per iteration ------------------------------------
    mov     r5, #0x2                    ; i = 2 column pairs
loop1_dual
    ldr     r6, [r0, #(4*2)]            ; r6  = i5  | i4
    ldr     r12, [r0, #(12*2)]          ; r12 = i13 | i12
    ldr     r14, [r0, #(8*2)]           ; r14 = i9  | i8

    smulwt  r9, r3, r6                  ; 5c  = (ip[5]  * cospi8sqrt2minus1) >> 16
    smulwb  r7, r3, r6                  ; 4c  = (ip[4]  * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6                 ; 5s  = (ip[5]  * sinpi8sqrt2) >> 16
    smulwb  r8, r4, r6                  ; 4s  = (ip[4]  * sinpi8sqrt2) >> 16
    pkhbt   r7, r7, r9, lsl #16         ; r7  = 5c | 4c
    smulwt  r11, r3, r12                ; 13c = (ip[13] * cospi8sqrt2minus1) >> 16
    pkhbt   r8, r8, r10, lsl #16        ; r8  = 5s | 4s
    uadd16  r6, r6, r7                  ; r6  = 5c+5 | 4c+4
    smulwt  r7, r4, r12                 ; 13s = (ip[13] * sinpi8sqrt2) >> 16
    smulwb  r9, r3, r12                 ; 12c = (ip[12] * cospi8sqrt2minus1) >> 16
    smulwb  r10, r4, r12                ; 12s = (ip[12] * sinpi8sqrt2) >> 16
    subs    r5, r5, #0x1                ; i-- (nothing below touches N/Z before the bne)
    pkhbt   r9, r9, r11, lsl #16        ; r9  = 13c | 12c
    ldr     r11, [r0], #0x4             ; r11 = i1 | i0; input += 2 coefficients
    pkhbt   r10, r10, r7, lsl #16       ; r10 = 13s | 12s
    uadd16  r7, r12, r9                 ; r7  = 13c+13 | 12c+12
    usub16  r7, r8, r7                  ; c
    uadd16  r6, r6, r10                 ; d
    uadd16  r10, r11, r14               ; a
    usub16  r8, r11, r14                ; b
    uadd16  r9, r10, r6                 ; a + d
    usub16  r10, r10, r6                ; a - d
    uadd16  r6, r8, r7                  ; b + c
    usub16  r7, r8, r7                  ; b - c
    str     r6, [r1, r2]                ; row 1: o5 | o4
    add     r6, r2, r2                  ; r6 = pitch * 2
    str     r7, [r1, r6]                ; row 2: o9 | o8
    add     r6, r6, r2                  ; r6 = pitch * 3
    str     r10, [r1, r6]               ; row 3: o13 | o12
    str     r9, [r1], #0x4              ; row 0: o1 | o0; output += 2 coefficients
    bne     loop1_dual                  ; second column pair

    ; ---- pass 2: two rows per iteration, input = pass-1 output ----------------
    mov     r5, #0x2                    ; i = 2 row pairs
    sub     r0, r1, #8                  ; undo the two 4-byte advances: r0 = row 0
loop2_dual
    ldr     r6, [r0, r2]                ; r6  = i5 | i4  (second row, cols 1|0)
    ldr     r1, [r0]                    ; r1  = i1 | i0  (first row,  cols 1|0)
    ldr     r12, [r0, #0x4]             ; r12 = i3 | i2  (first row,  cols 3|2)
    add     r14, r2, #0x4               ; byte offset of the second row, col 2
    ldr     r14, [r0, r14]              ; r14 = i7 | i6  (second row, cols 3|2)
    smulwt  r9, r3, r6                  ; 5c = (ip[5] * cospi8sqrt2minus1) >> 16
    smulwt  r7, r3, r1                  ; 1c = (ip[1] * cospi8sqrt2minus1) >> 16
    smulwt  r10, r4, r6                 ; 5s = (ip[5] * sinpi8sqrt2) >> 16
    smulwt  r8, r4, r1                  ; 1s = (ip[1] * sinpi8sqrt2) >> 16
    pkhbt   r11, r6, r1, lsl #16        ; r11 = i0 | i4
    pkhbt   r7, r9, r7, lsl #16         ; r7  = 1c | 5c
    pkhbt   r8, r10, r8, lsl #16        ; r8  = 1s | 5s  (temp1 of c)
    pkhtb   r1, r1, r6, asr #16         ; r1  = i1 | i5
    uadd16  r1, r7, r1                  ; r1  = 1c+1 | 5c+5  (temp1 of d)
    pkhbt   r9, r14, r12, lsl #16       ; r9  = i2 | i6
    uadd16  r10, r11, r9                ; a
    usub16  r9, r11, r9                 ; b
    pkhtb   r6, r12, r14, asr #16       ; r6  = i3 | i7
    subs    r5, r5, #0x1                ; i-- (nothing below touches N/Z before the bne)
    smulwt  r7, r3, r6                  ; 3c = (ip[3] * cospi8sqrt2minus1) >> 16
    smulwt  r11, r4, r6                 ; 3s = (ip[3] * sinpi8sqrt2) >> 16
    smulwb  r12, r3, r6                 ; 7c = (ip[7] * cospi8sqrt2minus1) >> 16
    smulwb  r14, r4, r6                 ; 7s = (ip[7] * sinpi8sqrt2) >> 16

    pkhbt   r7, r12, r7, lsl #16        ; r7  = 3c | 7c
    pkhbt   r11, r14, r11, lsl #16      ; r11 = 3s | 7s  (temp2 of d)
    uadd16  r6, r7, r6                  ; r6  = 3c+3 | 7c+7  (temp2 of c)
    usub16  r12, r8, r6                 ; c
    uadd16  r6, r11, r1                 ; d
    uadd16  r7, r10, r6                 ; a + d
    mov     r8, #0x4
    orr     r8, r8, #0x40000            ; r8 = 4 | 4, rounding bias for both lanes
    usub16  r6, r10, r6                 ; a - d
    uadd16  r6, r6, r8                  ; a - d + 4   -> column 3
    uadd16  r7, r7, r8                  ; a + d + 4   -> column 0
    uadd16  r10, r9, r12                ; b + c
    usub16  r1, r9, r12                 ; b - c
    uadd16  r10, r10, r8                ; b + c + 4   -> column 1
    uadd16  r1, r1, r8                  ; b - c + 4   -> column 2

    ; First row of the pair lives in the high lanes: asr #19 = (lane >> 16) >> 3.
    mov     r8, r10, asr #19
    strh    r8, [r0, #2]                ; o1
    mov     r8, r1, asr #19
    strh    r8, [r0, #4]                ; o2
    mov     r8, r6, asr #19
    strh    r8, [r0, #6]                ; o3
    mov     r8, r7, asr #19
    strh    r8, [r0], r2                ; o0; advance to the second row of the pair

    ; Second row lives in the low lanes.  Sign-extend the lane first: shifting
    ; the whole packed word would pull the high lane's low bits into the
    ; stored halfword.
    sxth    r8, r10
    mov     r8, r8, asr #3
    strh    r8, [r0, #2]                ; o5

    sxth    r8, r1
    mov     r8, r8, asr #3
    strh    r8, [r0, #4]                ; o6

    sxth    r8, r6
    mov     r8, r8, asr #3
    strh    r8, [r0, #6]                ; o7

    sxth    r8, r7
    mov     r8, r8, asr #3
    strh    r8, [r0], r2                ; o4; advance to the next row pair
    bne     loop2_dual                  ; second row pair

    ldmia   sp!, {r4 - r11, pc}         ; restore registers and return
    ENDP        ; |vp8_short_idct4x4llm_v6_dual|