Define RDCOST only once
[libvpx.git] / vp8 / encoder / arm / neon / subtract_neon.asm
blob68c2950621ed4b3a1a361bc251a7899b7df026aa
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
; Entry points defined in this file and made visible to the linker.
11 EXPORT |vp8_subtract_b_neon|
12 EXPORT |vp8_subtract_mby_neon|
13 EXPORT |vp8_subtract_mbuv_neon|
; The vp8_block_* / vp8_blockd_* field offsets used by vp8_subtract_b_neon
; are not defined in this file; presumably they come from this include.
15 INCLUDE asm_enc_offsets.asm
; Assemble as ARM (A32) instructions.
17 ARM
; REQUIRE8: this code requires an 8-byte aligned stack from its callers.
; PRESERVE8: this code keeps the stack 8-byte aligned.
18 REQUIRE8
19 PRESERVE8
; Read-only code section, aligned to 2^2 = 4 bytes.
21 AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
;
; Writes the 4x4 residual (source minus predictor) of one block as
; 16-bit values.
;   In:  r0 = be     (base_src, src, src_stride and src_diff fields are read)
;        r1 = bd     (predictor field is read)
;        r2 = pitch  (predictor row step in bytes; diff row step in
;                     16-bit elements)
;   Scratch: r3 = source row, r4 = source offset, r5 = diff row,
;            r6 = source stride, r7 = predictor row,
;            d0-d7, q10-q13 (all caller-saved NEON registers).
; The vp8_block_* / vp8_blockd_* offsets are not defined in this file;
; presumably they come from asm_enc_offsets.asm.
|vp8_subtract_b_neon| PROC

    push            {r4-r7}                 ; r4-r7 are callee-saved

    ldr             r7, [r1, #vp8_blockd_predictor]
    ldr             r3, [r0, #vp8_block_base_src]
    ldr             r4, [r0, #vp8_block_src]
    ldr             r6, [r0, #vp8_block_src_stride]
    ldr             r5, [r0, #vp8_block_src_diff]
    ldr             r3, [r3]                ; base_src holds the address of the base pointer
    add             r3, r3, r4              ; src = *base_src + src

    ; Four source rows. Each load fetches 8 bytes, but only the first
    ; four differences of every row are stored below.
    vld1.8          {d0}, [r3], r6
    vld1.8          {d2}, [r3], r6
    vld1.8          {d4}, [r3], r6
    vld1.8          {d6}, [r3], r6

    ; Four predictor rows, 'pitch' bytes apart.
    vld1.8          {d1}, [r7], r2
    vld1.8          {d3}, [r7], r2
    vld1.8          {d5}, [r7], r2
    vld1.8          {d7}, [r7], r2

    add             r2, r2, r2              ; diff row step in bytes = 2 * pitch

    ; Widening subtract: u8 - u8 -> 16-bit lanes.
    vsubl.u8        q10, d0, d1
    vsubl.u8        q11, d2, d3
    vsubl.u8        q12, d4, d5
    vsubl.u8        q13, d6, d7

    ; Store the low four 16-bit lanes of each row.
    vst1.16         {d20}, [r5], r2
    vst1.16         {d22}, [r5], r2
    vst1.16         {d24}, [r5], r2
    vst1.16         {d26}, [r5], r2

    pop             {r4-r7}
    bx              lr

    ENDP
63 ;==========================================
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
;
; Computes the 16x16 luma residual: diff[r][c] = src[r][c] - pred[r][c].
;   In:  r0 = diff    (written contiguously, 16 shorts per row)
;        r1 = src     (rows 'stride' bytes apart)
;        r2 = pred    (read contiguously, 16 bytes per row)
;        r3 = stride
;   Clobbers: r0-r2, r12, flags, q0-q3, q8-q15.
;
; Only q0-q3 and q8-q15 are used. d8-d15 (q4-q7) are callee-saved under
; the AAPCS and must not be modified without being saved, so each group
; of four rows is handled as two halves that reuse q0-q3 as inputs.
|vp8_subtract_mby_neon| PROC
    mov             r12, #4                 ; 4 iterations x 4 rows = 16 rows

subtract_mby_loop
    ; rows 0-1 of this group
    vld1.8          {q0}, [r1], r3          ; load src
    vld1.8          {q1}, [r2]!             ; load pred
    vld1.8          {q2}, [r1], r3
    vld1.8          {q3}, [r2]!

    vsubl.u8        q8, d0, d2              ; row 0, pixels 0-7
    vsubl.u8        q9, d1, d3              ; row 0, pixels 8-15
    vsubl.u8        q10, d4, d6             ; row 1, pixels 0-7
    vsubl.u8        q11, d5, d7             ; row 1, pixels 8-15

    ; rows 2-3 of this group (q0-q3 are free again)
    vld1.8          {q0}, [r1], r3
    vld1.8          {q1}, [r2]!
    vld1.8          {q2}, [r1], r3
    vld1.8          {q3}, [r2]!

    vsubl.u8        q12, d0, d2             ; row 2, pixels 0-7
    vsubl.u8        q13, d1, d3             ; row 2, pixels 8-15
    vsubl.u8        q14, d4, d6             ; row 3, pixels 0-7
    vsubl.u8        q15, d5, d7             ; row 3, pixels 8-15

    ; All loads of the group are done before any store, as before.
    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

    subs            r12, r12, #1
    bne             subtract_mby_loop

    bx              lr
    ENDP
102 ;=================================
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
;
; Computes the two 8x8 chroma residuals (U, then V) into the tail of the
; macroblock diff buffer.
;   In:  r0   = diff   (U diff is written at diff + 256 shorts, V diff
;                       follows immediately)
;        r1   = usrc   (rows 'stride' bytes apart)
;        r2   = vsrc   (rows 'stride' bytes apart)
;        r3   = pred   (U pred is read from pred + 256, 8 bytes per row,
;                       V pred follows immediately)
;        [sp] = stride (fifth argument, passed on the stack)
;   Clobbers: r0-r3, r12, d0-d7, q8-q15.
;
; Only d0-d7 and q8-q15 are used. d8-d15 are callee-saved under the AAPCS
; and must not be modified without being saved, so every plane is loaded
; in two 4-row halves that reuse d0-d7. Nothing is pushed, so the stride
; is still at [sp].
|vp8_subtract_mbuv_neon| PROC
    ldr             r12, [sp]               ; stride

    add             r0, r0, #512            ; short *udiff = diff + 256;
    add             r3, r3, #256            ; unsigned char *upred = pred + 256;

;u
    ; rows 0-3
    vld1.8          {d0}, [r1], r12         ; load src
    vld1.8          {d1}, [r3]!             ; load pred
    vld1.8          {d2}, [r1], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r1], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r1], r12
    vld1.8          {d7}, [r3]!

    vsubl.u8        q8, d0, d1
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7

    ; rows 4-7 (d0-d7 are free again)
    vld1.8          {d0}, [r1], r12
    vld1.8          {d1}, [r3]!
    vld1.8          {d2}, [r1], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r1], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r1], r12
    vld1.8          {d7}, [r3]!

    vsubl.u8        q12, d0, d1
    vsubl.u8        q13, d2, d3
    vsubl.u8        q14, d4, d5
    vsubl.u8        q15, d6, d7

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

;v
    ; rows 0-3 (r0 and r3 simply continue past the U data)
    vld1.8          {d0}, [r2], r12         ; load src
    vld1.8          {d1}, [r3]!             ; load pred
    vld1.8          {d2}, [r2], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r2], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r2], r12
    vld1.8          {d7}, [r3]!

    vsubl.u8        q8, d0, d1
    vsubl.u8        q9, d2, d3
    vsubl.u8        q10, d4, d5
    vsubl.u8        q11, d6, d7

    ; rows 4-7
    vld1.8          {d0}, [r2], r12
    vld1.8          {d1}, [r3]!
    vld1.8          {d2}, [r2], r12
    vld1.8          {d3}, [r3]!
    vld1.8          {d4}, [r2], r12
    vld1.8          {d5}, [r3]!
    vld1.8          {d6}, [r2], r12
    vld1.8          {d7}, [r3]!

    vsubl.u8        q12, d0, d1
    vsubl.u8        q13, d2, d3
    vsubl.u8        q14, d4, d5
    vsubl.u8        q15, d6, d7

    vst1.16         {q8}, [r0]!             ; store diff
    vst1.16         {q9}, [r0]!
    vst1.16         {q10}, [r0]!
    vst1.16         {q11}, [r0]!
    vst1.16         {q12}, [r0]!
    vst1.16         {q13}, [r0]!
    vst1.16         {q14}, [r0]!
    vst1.16         {q15}, [r0]!

    bx              lr
    ENDP