1 void DEF(put
, pixels8
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
3 int stride
= line_size
;
5 "and r12, %[pixels], #7 \n\t"
6 "bic %[pixels], %[pixels], #7 \n\t"
8 "add r4, %[pixels], %[line_size] \n\t"
9 "add r5, %[block], %[line_size] \n\t"
10 "mov %[line_size], %[line_size], lsl #1 \n\t"
12 "wldrd wr0, [%[pixels]] \n\t"
13 "subs %[h], %[h], #2 \n\t"
14 "wldrd wr1, [%[pixels], #8] \n\t"
15 "add %[pixels], %[pixels], %[line_size] \n\t"
16 "wldrd wr3, [r4] \n\t"
17 "pld [%[pixels]] \n\t"
18 "pld [%[pixels], #32] \n\t"
19 "wldrd wr4, [r4, #8] \n\t"
20 "add r4, r4, %[line_size] \n\t"
21 "walignr1 wr8, wr0, wr1 \n\t"
24 "walignr1 wr10, wr3, wr4 \n\t"
25 "wstrd wr8, [%[block]] \n\t"
26 "add %[block], %[block], %[line_size] \n\t"
27 "wstrd wr10, [r5] \n\t"
28 "add r5, r5, %[line_size] \n\t"
30 : [block
]"+r"(block
), [pixels
]"+r"(pixels
), [line_size
]"+r"(stride
), [h
]"+r"(h
)
32 : "memory", "r4", "r5", "r12");
35 void DEF(avg
, pixels8
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
37 int stride
= line_size
;
38 __asm__
__volatile__ (
39 "and r12, %[pixels], #7 \n\t"
40 "bic %[pixels], %[pixels], #7 \n\t"
41 "tmcr wcgr1, r12 \n\t"
42 "add r4, %[pixels], %[line_size] \n\t"
43 "add r5, %[block], %[line_size] \n\t"
44 "mov %[line_size], %[line_size], lsl #1 \n\t"
46 "wldrd wr0, [%[pixels]] \n\t"
47 "subs %[h], %[h], #2 \n\t"
48 "wldrd wr1, [%[pixels], #8] \n\t"
49 "add %[pixels], %[pixels], %[line_size] \n\t"
50 "wldrd wr3, [r4] \n\t"
51 "pld [%[pixels]] \n\t"
52 "pld [%[pixels], #32] \n\t"
53 "wldrd wr4, [r4, #8] \n\t"
54 "add r4, r4, %[line_size] \n\t"
55 "walignr1 wr8, wr0, wr1 \n\t"
56 "wldrd wr0, [%[block]] \n\t"
57 "wldrd wr2, [r5] \n\t"
60 "walignr1 wr10, wr3, wr4 \n\t"
61 WAVG2B
" wr8, wr8, wr0 \n\t"
62 WAVG2B
" wr10, wr10, wr2 \n\t"
63 "wstrd wr8, [%[block]] \n\t"
64 "add %[block], %[block], %[line_size] \n\t"
65 "wstrd wr10, [r5] \n\t"
67 "pld [%[block], #32] \n\t"
68 "add r5, r5, %[line_size] \n\t"
72 : [block
]"+r"(block
), [pixels
]"+r"(pixels
), [line_size
]"+r"(stride
), [h
]"+r"(h
)
74 : "memory", "r4", "r5", "r12");
77 void DEF(put
, pixels16
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
79 int stride
= line_size
;
80 __asm__
__volatile__ (
81 "and r12, %[pixels], #7 \n\t"
82 "bic %[pixels], %[pixels], #7 \n\t"
83 "tmcr wcgr1, r12 \n\t"
84 "add r4, %[pixels], %[line_size] \n\t"
85 "add r5, %[block], %[line_size] \n\t"
86 "mov %[line_size], %[line_size], lsl #1 \n\t"
88 "wldrd wr0, [%[pixels]] \n\t"
89 "wldrd wr1, [%[pixels], #8] \n\t"
90 "subs %[h], %[h], #2 \n\t"
91 "wldrd wr2, [%[pixels], #16] \n\t"
92 "add %[pixels], %[pixels], %[line_size] \n\t"
93 "wldrd wr3, [r4] \n\t"
94 "pld [%[pixels]] \n\t"
95 "pld [%[pixels], #32] \n\t"
96 "walignr1 wr8, wr0, wr1 \n\t"
97 "wldrd wr4, [r4, #8] \n\t"
98 "walignr1 wr9, wr1, wr2 \n\t"
99 "wldrd wr5, [r4, #16] \n\t"
100 "add r4, r4, %[line_size] \n\t"
103 "walignr1 wr10, wr3, wr4 \n\t"
104 "wstrd wr8, [%[block]] \n\t"
105 "walignr1 wr11, wr4, wr5 \n\t"
106 "wstrd wr9, [%[block], #8] \n\t"
107 "add %[block], %[block], %[line_size] \n\t"
108 "wstrd wr10, [r5] \n\t"
109 "wstrd wr11, [r5, #8] \n\t"
110 "add r5, r5, %[line_size] \n\t"
112 : [block
]"+r"(block
), [pixels
]"+r"(pixels
), [line_size
]"+r"(stride
), [h
]"+r"(h
)
114 : "memory", "r4", "r5", "r12");
117 void DEF(avg
, pixels16
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
119 int stride
= line_size
;
120 __asm__
__volatile__ (
121 "pld [%[pixels]] \n\t"
122 "pld [%[pixels], #32] \n\t"
123 "pld [%[block]] \n\t"
124 "pld [%[block], #32] \n\t"
125 "and r12, %[pixels], #7 \n\t"
126 "bic %[pixels], %[pixels], #7 \n\t"
127 "tmcr wcgr1, r12 \n\t"
128 "add r4, %[pixels], %[line_size]\n\t"
129 "add r5, %[block], %[line_size] \n\t"
130 "mov %[line_size], %[line_size], lsl #1 \n\t"
132 "wldrd wr0, [%[pixels]] \n\t"
133 "wldrd wr1, [%[pixels], #8] \n\t"
134 "subs %[h], %[h], #2 \n\t"
135 "wldrd wr2, [%[pixels], #16] \n\t"
136 "add %[pixels], %[pixels], %[line_size] \n\t"
137 "wldrd wr3, [r4] \n\t"
138 "pld [%[pixels]] \n\t"
139 "pld [%[pixels], #32] \n\t"
140 "walignr1 wr8, wr0, wr1 \n\t"
141 "wldrd wr4, [r4, #8] \n\t"
142 "walignr1 wr9, wr1, wr2 \n\t"
143 "wldrd wr5, [r4, #16] \n\t"
144 "add r4, r4, %[line_size] \n\t"
145 "wldrd wr0, [%[block]] \n\t"
147 "wldrd wr1, [%[block], #8] \n\t"
149 "wldrd wr2, [r5] \n\t"
150 "walignr1 wr10, wr3, wr4 \n\t"
151 "wldrd wr3, [r5, #8] \n\t"
152 WAVG2B
" wr8, wr8, wr0 \n\t"
153 WAVG2B
" wr9, wr9, wr1 \n\t"
154 WAVG2B
" wr10, wr10, wr2 \n\t"
155 "wstrd wr8, [%[block]] \n\t"
156 "walignr1 wr11, wr4, wr5 \n\t"
157 WAVG2B
" wr11, wr11, wr3 \n\t"
158 "wstrd wr9, [%[block], #8] \n\t"
159 "add %[block], %[block], %[line_size] \n\t"
160 "wstrd wr10, [r5] \n\t"
161 "pld [%[block]] \n\t"
162 "pld [%[block], #32] \n\t"
163 "wstrd wr11, [r5, #8] \n\t"
164 "add r5, r5, %[line_size] \n\t"
168 : [block
]"+r"(block
), [pixels
]"+r"(pixels
), [line_size
]"+r"(stride
), [h
]"+r"(h
)
170 : "memory", "r4", "r5", "r12");
173 void DEF(put
, pixels8_x2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
175 int stride
= line_size
;
176 // [wr0 wr1 wr2 wr3] for previous line
177 // [wr4 wr5 wr6 wr7] for current line
178 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
179 __asm__
__volatile__(
180 "pld [%[pixels]] \n\t"
181 "pld [%[pixels], #32] \n\t"
182 "and r12, %[pixels], #7 \n\t"
183 "bic %[pixels], %[pixels], #7 \n\t"
184 "tmcr wcgr1, r12 \n\t"
185 "add r12, r12, #1 \n\t"
186 "add r4, %[pixels], %[line_size]\n\t"
187 "tmcr wcgr2, r12 \n\t"
188 "add r5, %[block], %[line_size] \n\t"
189 "mov %[line_size], %[line_size], lsl #1 \n\t"
192 "wldrd wr10, [%[pixels]] \n\t"
194 "wldrd wr11, [%[pixels], #8] \n\t"
195 "add %[pixels], %[pixels], %[line_size] \n\t"
196 "wldrd wr13, [r4] \n\t"
197 "pld [%[pixels]] \n\t"
198 "wldrd wr14, [r4, #8] \n\t"
199 "pld [%[pixels], #32] \n\t"
200 "add r4, r4, %[line_size] \n\t"
201 "walignr1 wr0, wr10, wr11 \n\t"
204 "walignr1 wr2, wr13, wr14 \n\t"
205 "wmoveq wr4, wr11 \n\t"
206 "wmoveq wr6, wr14 \n\t"
207 "walignr2ne wr4, wr10, wr11 \n\t"
208 "walignr2ne wr6, wr13, wr14 \n\t"
209 WAVG2B
" wr0, wr0, wr4 \n\t"
210 WAVG2B
" wr2, wr2, wr6 \n\t"
211 "wstrd wr0, [%[block]] \n\t"
212 "subs %[h], %[h], #2 \n\t"
213 "wstrd wr2, [r5] \n\t"
214 "add %[block], %[block], %[line_size] \n\t"
215 "add r5, r5, %[line_size] \n\t"
217 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
), [line_size
]"+r"(stride
)
219 : "r4", "r5", "r12", "memory");
222 void DEF(put
, pixels16_x2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
224 int stride
= line_size
;
225 // [wr0 wr1 wr2 wr3] for previous line
226 // [wr4 wr5 wr6 wr7] for current line
227 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
228 __asm__
__volatile__(
229 "pld [%[pixels]] \n\t"
230 "pld [%[pixels], #32] \n\t"
231 "and r12, %[pixels], #7 \n\t"
232 "bic %[pixels], %[pixels], #7 \n\t"
233 "tmcr wcgr1, r12 \n\t"
234 "add r12, r12, #1 \n\t"
235 "add r4, %[pixels], %[line_size]\n\t"
236 "tmcr wcgr2, r12 \n\t"
237 "add r5, %[block], %[line_size] \n\t"
238 "mov %[line_size], %[line_size], lsl #1 \n\t"
241 "wldrd wr10, [%[pixels]] \n\t"
243 "wldrd wr11, [%[pixels], #8] \n\t"
244 "wldrd wr12, [%[pixels], #16] \n\t"
245 "add %[pixels], %[pixels], %[line_size] \n\t"
246 "wldrd wr13, [r4] \n\t"
247 "pld [%[pixels]] \n\t"
248 "wldrd wr14, [r4, #8] \n\t"
249 "pld [%[pixels], #32] \n\t"
250 "wldrd wr15, [r4, #16] \n\t"
251 "add r4, r4, %[line_size] \n\t"
252 "walignr1 wr0, wr10, wr11 \n\t"
255 "walignr1 wr1, wr11, wr12 \n\t"
256 "walignr1 wr2, wr13, wr14 \n\t"
257 "walignr1 wr3, wr14, wr15 \n\t"
258 "wmoveq wr4, wr11 \n\t"
259 "wmoveq wr5, wr12 \n\t"
260 "wmoveq wr6, wr14 \n\t"
261 "wmoveq wr7, wr15 \n\t"
262 "walignr2ne wr4, wr10, wr11 \n\t"
263 "walignr2ne wr5, wr11, wr12 \n\t"
264 "walignr2ne wr6, wr13, wr14 \n\t"
265 "walignr2ne wr7, wr14, wr15 \n\t"
266 WAVG2B
" wr0, wr0, wr4 \n\t"
267 WAVG2B
" wr1, wr1, wr5 \n\t"
268 "wstrd wr0, [%[block]] \n\t"
269 WAVG2B
" wr2, wr2, wr6 \n\t"
270 "wstrd wr1, [%[block], #8] \n\t"
271 WAVG2B
" wr3, wr3, wr7 \n\t"
272 "add %[block], %[block], %[line_size] \n\t"
273 "wstrd wr2, [r5] \n\t"
274 "subs %[h], %[h], #2 \n\t"
275 "wstrd wr3, [r5, #8] \n\t"
276 "add r5, r5, %[line_size] \n\t"
278 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
), [line_size
]"+r"(stride
)
280 : "r4", "r5", "r12", "memory");
283 void DEF(avg
, pixels8_x2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
285 int stride
= line_size
;
286 // [wr0 wr1 wr2 wr3] for previous line
287 // [wr4 wr5 wr6 wr7] for current line
288 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
289 __asm__
__volatile__(
290 "pld [%[pixels]] \n\t"
291 "pld [%[pixels], #32] \n\t"
292 "pld [%[block]] \n\t"
293 "pld [%[block], #32] \n\t"
294 "and r12, %[pixels], #7 \n\t"
295 "bic %[pixels], %[pixels], #7 \n\t"
296 "tmcr wcgr1, r12 \n\t"
297 "add r12, r12, #1 \n\t"
298 "add r4, %[pixels], %[line_size]\n\t"
299 "tmcr wcgr2, r12 \n\t"
300 "add r5, %[block], %[line_size] \n\t"
301 "mov %[line_size], %[line_size], lsl #1 \n\t"
306 "wldrd wr10, [%[pixels]] \n\t"
308 "wldrd wr11, [%[pixels], #8] \n\t"
309 "add %[pixels], %[pixels], %[line_size] \n\t"
310 "wldrd wr13, [r4] \n\t"
311 "pld [%[pixels]] \n\t"
312 "wldrd wr14, [r4, #8] \n\t"
313 "pld [%[pixels], #32] \n\t"
314 "add r4, r4, %[line_size] \n\t"
315 "walignr1 wr0, wr10, wr11 \n\t"
318 "walignr1 wr2, wr13, wr14 \n\t"
319 "wmoveq wr4, wr11 \n\t"
320 "wmoveq wr6, wr14 \n\t"
321 "walignr2ne wr4, wr10, wr11 \n\t"
322 "wldrd wr10, [%[block]] \n\t"
323 "walignr2ne wr6, wr13, wr14 \n\t"
324 "wldrd wr12, [r5] \n\t"
325 WAVG2B
" wr0, wr0, wr4 \n\t"
326 WAVG2B
" wr2, wr2, wr6 \n\t"
327 WAVG2B
" wr0, wr0, wr10 \n\t"
328 WAVG2B
" wr2, wr2, wr12 \n\t"
329 "wstrd wr0, [%[block]] \n\t"
330 "subs %[h], %[h], #2 \n\t"
331 "wstrd wr2, [r5] \n\t"
332 "add %[block], %[block], %[line_size] \n\t"
333 "add r5, r5, %[line_size] \n\t"
334 "pld [%[block]] \n\t"
335 "pld [%[block], #32] \n\t"
339 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
), [line_size
]"+r"(stride
)
341 : "r4", "r5", "r12", "memory");
344 void DEF(avg
, pixels16_x2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
346 int stride
= line_size
;
347 // [wr0 wr1 wr2 wr3] for previous line
348 // [wr4 wr5 wr6 wr7] for current line
349 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
350 __asm__
__volatile__(
351 "pld [%[pixels]] \n\t"
352 "pld [%[pixels], #32] \n\t"
353 "pld [%[block]] \n\t"
354 "pld [%[block], #32] \n\t"
355 "and r12, %[pixels], #7 \n\t"
356 "bic %[pixels], %[pixels], #7 \n\t"
357 "tmcr wcgr1, r12 \n\t"
358 "add r12, r12, #1 \n\t"
359 "add r4, %[pixels], %[line_size]\n\t"
360 "tmcr wcgr2, r12 \n\t"
361 "add r5, %[block], %[line_size] \n\t"
362 "mov %[line_size], %[line_size], lsl #1 \n\t"
367 "wldrd wr10, [%[pixels]] \n\t"
369 "wldrd wr11, [%[pixels], #8] \n\t"
370 "wldrd wr12, [%[pixels], #16] \n\t"
371 "add %[pixels], %[pixels], %[line_size] \n\t"
372 "wldrd wr13, [r4] \n\t"
373 "pld [%[pixels]] \n\t"
374 "wldrd wr14, [r4, #8] \n\t"
375 "pld [%[pixels], #32] \n\t"
376 "wldrd wr15, [r4, #16] \n\t"
377 "add r4, r4, %[line_size] \n\t"
378 "walignr1 wr0, wr10, wr11 \n\t"
381 "walignr1 wr1, wr11, wr12 \n\t"
382 "walignr1 wr2, wr13, wr14 \n\t"
383 "walignr1 wr3, wr14, wr15 \n\t"
384 "wmoveq wr4, wr11 \n\t"
385 "wmoveq wr5, wr12 \n\t"
386 "wmoveq wr6, wr14 \n\t"
387 "wmoveq wr7, wr15 \n\t"
388 "walignr2ne wr4, wr10, wr11 \n\t"
389 "walignr2ne wr5, wr11, wr12 \n\t"
390 "walignr2ne wr6, wr13, wr14 \n\t"
391 "walignr2ne wr7, wr14, wr15 \n\t"
392 "wldrd wr10, [%[block]] \n\t"
393 WAVG2B
" wr0, wr0, wr4 \n\t"
394 "wldrd wr11, [%[block], #8] \n\t"
395 WAVG2B
" wr1, wr1, wr5 \n\t"
396 "wldrd wr12, [r5] \n\t"
397 WAVG2B
" wr2, wr2, wr6 \n\t"
398 "wldrd wr13, [r5, #8] \n\t"
399 WAVG2B
" wr3, wr3, wr7 \n\t"
400 WAVG2B
" wr0, wr0, wr10 \n\t"
401 WAVG2B
" wr1, wr1, wr11 \n\t"
402 WAVG2B
" wr2, wr2, wr12 \n\t"
403 WAVG2B
" wr3, wr3, wr13 \n\t"
404 "wstrd wr0, [%[block]] \n\t"
405 "subs %[h], %[h], #2 \n\t"
406 "wstrd wr1, [%[block], #8] \n\t"
407 "add %[block], %[block], %[line_size] \n\t"
408 "wstrd wr2, [r5] \n\t"
409 "pld [%[block]] \n\t"
410 "wstrd wr3, [r5, #8] \n\t"
411 "add r5, r5, %[line_size] \n\t"
412 "pld [%[block], #32] \n\t"
416 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
), [line_size
]"+r"(stride
)
418 :"r4", "r5", "r12", "memory");
421 void DEF(avg
, pixels8_y2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
423 int stride
= line_size
;
424 // [wr0 wr1 wr2 wr3] for previous line
425 // [wr4 wr5 wr6 wr7] for current line
426 __asm__
__volatile__(
427 "pld [%[pixels]] \n\t"
428 "pld [%[pixels], #32] \n\t"
429 "and r12, %[pixels], #7 \n\t"
430 "tmcr wcgr1, r12 \n\t"
431 "bic %[pixels], %[pixels], #7 \n\t"
433 "wldrd wr10, [%[pixels]] \n\t"
434 "wldrd wr11, [%[pixels], #8] \n\t"
435 "pld [%[block]] \n\t"
436 "add %[pixels], %[pixels], %[line_size] \n\t"
437 "walignr1 wr0, wr10, wr11 \n\t"
438 "pld [%[pixels]] \n\t"
439 "pld [%[pixels], #32] \n\t"
442 "wldrd wr10, [%[pixels]] \n\t"
443 "wldrd wr11, [%[pixels], #8] \n\t"
444 "add %[pixels], %[pixels], %[line_size] \n\t"
445 "pld [%[pixels]] \n\t"
446 "pld [%[pixels], #32] \n\t"
447 "walignr1 wr4, wr10, wr11 \n\t"
448 "wldrd wr10, [%[block]] \n\t"
449 WAVG2B
" wr8, wr0, wr4 \n\t"
450 WAVG2B
" wr8, wr8, wr10 \n\t"
451 "wstrd wr8, [%[block]] \n\t"
452 "add %[block], %[block], %[line_size] \n\t"
454 "wldrd wr10, [%[pixels]] \n\t"
455 "wldrd wr11, [%[pixels], #8] \n\t"
456 "pld [%[block]] \n\t"
457 "add %[pixels], %[pixels], %[line_size] \n\t"
458 "pld [%[pixels]] \n\t"
459 "pld [%[pixels], #32] \n\t"
460 "walignr1 wr0, wr10, wr11 \n\t"
461 "wldrd wr10, [%[block]] \n\t"
462 WAVG2B
" wr8, wr0, wr4 \n\t"
463 WAVG2B
" wr8, wr8, wr10 \n\t"
464 "wstrd wr8, [%[block]] \n\t"
465 "add %[block], %[block], %[line_size] \n\t"
467 "subs %[h], %[h], #2 \n\t"
468 "pld [%[block]] \n\t"
470 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
), [line_size
]"+r"(stride
)
472 : "cc", "memory", "r12");
475 void DEF(put
, pixels16_y2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
477 int stride
= line_size
;
478 // [wr0 wr1 wr2 wr3] for previous line
479 // [wr4 wr5 wr6 wr7] for current line
480 __asm__
__volatile__(
481 "pld [%[pixels]] \n\t"
482 "pld [%[pixels], #32] \n\t"
483 "and r12, %[pixels], #7 \n\t"
484 "tmcr wcgr1, r12 \n\t"
485 "bic %[pixels], %[pixels], #7 \n\t"
487 "wldrd wr10, [%[pixels]] \n\t"
488 "wldrd wr11, [%[pixels], #8] \n\t"
489 "wldrd wr12, [%[pixels], #16] \n\t"
490 "add %[pixels], %[pixels], %[line_size] \n\t"
491 "pld [%[pixels]] \n\t"
492 "pld [%[pixels], #32] \n\t"
493 "walignr1 wr0, wr10, wr11 \n\t"
494 "walignr1 wr1, wr11, wr12 \n\t"
497 "wldrd wr10, [%[pixels]] \n\t"
498 "wldrd wr11, [%[pixels], #8] \n\t"
499 "wldrd wr12, [%[pixels], #16] \n\t"
500 "add %[pixels], %[pixels], %[line_size] \n\t"
501 "pld [%[pixels]] \n\t"
502 "pld [%[pixels], #32] \n\t"
503 "walignr1 wr4, wr10, wr11 \n\t"
504 "walignr1 wr5, wr11, wr12 \n\t"
505 WAVG2B
" wr8, wr0, wr4 \n\t"
506 WAVG2B
" wr9, wr1, wr5 \n\t"
507 "wstrd wr8, [%[block]] \n\t"
508 "wstrd wr9, [%[block], #8] \n\t"
509 "add %[block], %[block], %[line_size] \n\t"
511 "wldrd wr10, [%[pixels]] \n\t"
512 "wldrd wr11, [%[pixels], #8] \n\t"
513 "wldrd wr12, [%[pixels], #16] \n\t"
514 "add %[pixels], %[pixels], %[line_size] \n\t"
515 "pld [%[pixels]] \n\t"
516 "pld [%[pixels], #32] \n\t"
517 "walignr1 wr0, wr10, wr11 \n\t"
518 "walignr1 wr1, wr11, wr12 \n\t"
519 WAVG2B
" wr8, wr0, wr4 \n\t"
520 WAVG2B
" wr9, wr1, wr5 \n\t"
521 "wstrd wr8, [%[block]] \n\t"
522 "wstrd wr9, [%[block], #8] \n\t"
523 "add %[block], %[block], %[line_size] \n\t"
525 "subs %[h], %[h], #2 \n\t"
527 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
), [line_size
]"+r"(stride
)
529 : "r4", "r5", "r12", "memory");
532 void DEF(avg
, pixels16_y2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
534 int stride
= line_size
;
535 // [wr0 wr1 wr2 wr3] for previous line
536 // [wr4 wr5 wr6 wr7] for current line
537 __asm__
__volatile__(
538 "pld [%[pixels]] \n\t"
539 "pld [%[pixels], #32] \n\t"
540 "and r12, %[pixels], #7 \n\t"
541 "tmcr wcgr1, r12 \n\t"
542 "bic %[pixels], %[pixels], #7 \n\t"
544 "wldrd wr10, [%[pixels]] \n\t"
545 "wldrd wr11, [%[pixels], #8] \n\t"
546 "pld [%[block]] \n\t"
547 "wldrd wr12, [%[pixels], #16] \n\t"
548 "add %[pixels], %[pixels], %[line_size] \n\t"
549 "pld [%[pixels]] \n\t"
550 "pld [%[pixels], #32] \n\t"
551 "walignr1 wr0, wr10, wr11 \n\t"
552 "walignr1 wr1, wr11, wr12 \n\t"
555 "wldrd wr10, [%[pixels]] \n\t"
556 "wldrd wr11, [%[pixels], #8] \n\t"
557 "wldrd wr12, [%[pixels], #16] \n\t"
558 "add %[pixels], %[pixels], %[line_size] \n\t"
559 "pld [%[pixels]] \n\t"
560 "pld [%[pixels], #32] \n\t"
561 "walignr1 wr4, wr10, wr11 \n\t"
562 "walignr1 wr5, wr11, wr12 \n\t"
563 "wldrd wr10, [%[block]] \n\t"
564 "wldrd wr11, [%[block], #8] \n\t"
565 WAVG2B
" wr8, wr0, wr4 \n\t"
566 WAVG2B
" wr9, wr1, wr5 \n\t"
567 WAVG2B
" wr8, wr8, wr10 \n\t"
568 WAVG2B
" wr9, wr9, wr11 \n\t"
569 "wstrd wr8, [%[block]] \n\t"
570 "wstrd wr9, [%[block], #8] \n\t"
571 "add %[block], %[block], %[line_size] \n\t"
573 "wldrd wr10, [%[pixels]] \n\t"
574 "wldrd wr11, [%[pixels], #8] \n\t"
575 "pld [%[block]] \n\t"
576 "wldrd wr12, [%[pixels], #16] \n\t"
577 "add %[pixels], %[pixels], %[line_size] \n\t"
578 "pld [%[pixels]] \n\t"
579 "pld [%[pixels], #32] \n\t"
580 "walignr1 wr0, wr10, wr11 \n\t"
581 "walignr1 wr1, wr11, wr12 \n\t"
582 "wldrd wr10, [%[block]] \n\t"
583 "wldrd wr11, [%[block], #8] \n\t"
584 WAVG2B
" wr8, wr0, wr4 \n\t"
585 WAVG2B
" wr9, wr1, wr5 \n\t"
586 WAVG2B
" wr8, wr8, wr10 \n\t"
587 WAVG2B
" wr9, wr9, wr11 \n\t"
588 "wstrd wr8, [%[block]] \n\t"
589 "wstrd wr9, [%[block], #8] \n\t"
590 "add %[block], %[block], %[line_size] \n\t"
592 "subs %[h], %[h], #2 \n\t"
593 "pld [%[block]] \n\t"
595 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
), [line_size
]"+r"(stride
)
597 : "r4", "r5", "r12", "memory");
600 void DEF(put
, pixels8_xy2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
602 // [wr0 wr1 wr2 wr3] for previous line
603 // [wr4 wr5 wr6 wr7] for current line
604 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
605 __asm__
__volatile__(
606 "pld [%[pixels]] \n\t"
608 "pld [%[pixels], #32] \n\t"
609 "tmcr wcgr0, r12 \n\t" /* for shift value */
610 "and r12, %[pixels], #7 \n\t"
611 "bic %[pixels], %[pixels], #7 \n\t"
612 "tmcr wcgr1, r12 \n\t"
614 // [wr0 wr1 wr2 wr3] <= *
616 "wldrd wr12, [%[pixels]] \n\t"
617 "add r12, r12, #1 \n\t"
618 "wldrd wr13, [%[pixels], #8] \n\t"
619 "tmcr wcgr2, r12 \n\t"
620 "add %[pixels], %[pixels], %[line_size] \n\t"
622 "pld [%[pixels]] \n\t"
623 "pld [%[pixels], #32] \n\t"
624 "walignr1 wr2, wr12, wr13 \n\t"
625 "wmoveq wr10, wr13 \n\t"
626 "walignr2ne wr10, wr12, wr13 \n\t"
627 "wunpckelub wr0, wr2 \n\t"
628 "wunpckehub wr1, wr2 \n\t"
629 "wunpckelub wr8, wr10 \n\t"
630 "wunpckehub wr9, wr10 \n\t"
631 "waddhus wr0, wr0, wr8 \n\t"
632 "waddhus wr1, wr1, wr9 \n\t"
636 // [wr4 wr5 wr6 wr7] <= *
637 "wldrd wr12, [%[pixels]] \n\t"
639 "wldrd wr13, [%[pixels], #8] \n\t"
640 "add %[pixels], %[pixels], %[line_size] \n\t"
641 "walignr1 wr6, wr12, wr13 \n\t"
642 "pld [%[pixels]] \n\t"
643 "pld [%[pixels], #32] \n\t"
644 "wmoveq wr10, wr13 \n\t"
645 "walignr2ne wr10, wr12, wr13 \n\t"
646 "wunpckelub wr4, wr6 \n\t"
647 "wunpckehub wr5, wr6 \n\t"
648 "wunpckelub wr8, wr10 \n\t"
649 "wunpckehub wr9, wr10 \n\t"
650 "waddhus wr4, wr4, wr8 \n\t"
651 "waddhus wr5, wr5, wr9 \n\t"
652 "waddhus wr8, wr0, wr4 \n\t"
653 "waddhus wr9, wr1, wr5 \n\t"
654 "waddhus wr8, wr8, wr15 \n\t"
655 "waddhus wr9, wr9, wr15 \n\t"
656 "wsrlhg wr8, wr8, wcgr0 \n\t"
657 "wsrlhg wr9, wr9, wcgr0 \n\t"
658 "wpackhus wr8, wr8, wr9 \n\t"
659 "wstrd wr8, [%[block]] \n\t"
660 "add %[block], %[block], %[line_size] \n\t"
662 // [wr0 wr1 wr2 wr3] <= *
664 "wldrd wr12, [%[pixels]] \n\t"
665 "wldrd wr13, [%[pixels], #8] \n\t"
666 "add %[pixels], %[pixels], %[line_size] \n\t"
667 "walignr1 wr2, wr12, wr13 \n\t"
668 "pld [%[pixels]] \n\t"
669 "pld [%[pixels], #32] \n\t"
670 "wmoveq wr10, wr13 \n\t"
671 "walignr2ne wr10, wr12, wr13 \n\t"
672 "wunpckelub wr0, wr2 \n\t"
673 "wunpckehub wr1, wr2 \n\t"
674 "wunpckelub wr8, wr10 \n\t"
675 "wunpckehub wr9, wr10 \n\t"
676 "waddhus wr0, wr0, wr8 \n\t"
677 "waddhus wr1, wr1, wr9 \n\t"
678 "waddhus wr8, wr0, wr4 \n\t"
679 "waddhus wr9, wr1, wr5 \n\t"
680 "waddhus wr8, wr8, wr15 \n\t"
681 "waddhus wr9, wr9, wr15 \n\t"
682 "wsrlhg wr8, wr8, wcgr0 \n\t"
683 "wsrlhg wr9, wr9, wcgr0 \n\t"
684 "wpackhus wr8, wr8, wr9 \n\t"
685 "subs %[h], %[h], #2 \n\t"
686 "wstrd wr8, [%[block]] \n\t"
687 "add %[block], %[block], %[line_size] \n\t"
689 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
)
690 : [line_size
]"r"(line_size
)
694 void DEF(put
, pixels16_xy2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
696 // [wr0 wr1 wr2 wr3] for previous line
697 // [wr4 wr5 wr6 wr7] for current line
698 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
699 __asm__
__volatile__(
700 "pld [%[pixels]] \n\t"
702 "pld [%[pixels], #32] \n\t"
703 "tmcr wcgr0, r12 \n\t" /* for shift value */
705 "and r12, %[pixels], #7 \n\t"
706 "bic %[pixels], %[pixels], #7 \n\t"
707 "tmcr wcgr1, r12 \n\t"
708 "add r12, r12, #1 \n\t"
709 "tmcr wcgr2, r12 \n\t"
711 // [wr0 wr1 wr2 wr3] <= *
713 "wldrd wr12, [%[pixels]] \n\t"
715 "wldrd wr13, [%[pixels], #8] \n\t"
716 "wldrd wr14, [%[pixels], #16] \n\t"
717 "add %[pixels], %[pixels], %[line_size] \n\t"
718 "pld [%[pixels]] \n\t"
719 "walignr1 wr2, wr12, wr13 \n\t"
720 "pld [%[pixels], #32] \n\t"
721 "walignr1 wr3, wr13, wr14 \n\t"
722 "wmoveq wr10, wr13 \n\t"
723 "wmoveq wr11, wr14 \n\t"
724 "walignr2ne wr10, wr12, wr13 \n\t"
725 "walignr2ne wr11, wr13, wr14 \n\t"
726 "wunpckelub wr0, wr2 \n\t"
727 "wunpckehub wr1, wr2 \n\t"
728 "wunpckelub wr2, wr3 \n\t"
729 "wunpckehub wr3, wr3 \n\t"
730 "wunpckelub wr8, wr10 \n\t"
731 "wunpckehub wr9, wr10 \n\t"
732 "wunpckelub wr10, wr11 \n\t"
733 "wunpckehub wr11, wr11 \n\t"
734 "waddhus wr0, wr0, wr8 \n\t"
735 "waddhus wr1, wr1, wr9 \n\t"
736 "waddhus wr2, wr2, wr10 \n\t"
737 "waddhus wr3, wr3, wr11 \n\t"
741 // [wr4 wr5 wr6 wr7] <= *
742 "wldrd wr12, [%[pixels]] \n\t"
744 "wldrd wr13, [%[pixels], #8] \n\t"
745 "wldrd wr14, [%[pixels], #16] \n\t"
746 "add %[pixels], %[pixels], %[line_size] \n\t"
747 "walignr1 wr6, wr12, wr13 \n\t"
748 "pld [%[pixels]] \n\t"
749 "pld [%[pixels], #32] \n\t"
750 "walignr1 wr7, wr13, wr14 \n\t"
751 "wmoveq wr10, wr13 \n\t"
752 "wmoveq wr11, wr14 \n\t"
753 "walignr2ne wr10, wr12, wr13 \n\t"
754 "walignr2ne wr11, wr13, wr14 \n\t"
755 "wunpckelub wr4, wr6 \n\t"
756 "wunpckehub wr5, wr6 \n\t"
757 "wunpckelub wr6, wr7 \n\t"
758 "wunpckehub wr7, wr7 \n\t"
759 "wunpckelub wr8, wr10 \n\t"
760 "wunpckehub wr9, wr10 \n\t"
761 "wunpckelub wr10, wr11 \n\t"
762 "wunpckehub wr11, wr11 \n\t"
763 "waddhus wr4, wr4, wr8 \n\t"
764 "waddhus wr5, wr5, wr9 \n\t"
765 "waddhus wr6, wr6, wr10 \n\t"
766 "waddhus wr7, wr7, wr11 \n\t"
767 "waddhus wr8, wr0, wr4 \n\t"
768 "waddhus wr9, wr1, wr5 \n\t"
769 "waddhus wr10, wr2, wr6 \n\t"
770 "waddhus wr11, wr3, wr7 \n\t"
771 "waddhus wr8, wr8, wr15 \n\t"
772 "waddhus wr9, wr9, wr15 \n\t"
773 "waddhus wr10, wr10, wr15 \n\t"
774 "waddhus wr11, wr11, wr15 \n\t"
775 "wsrlhg wr8, wr8, wcgr0 \n\t"
776 "wsrlhg wr9, wr9, wcgr0 \n\t"
777 "wsrlhg wr10, wr10, wcgr0 \n\t"
778 "wsrlhg wr11, wr11, wcgr0 \n\t"
779 "wpackhus wr8, wr8, wr9 \n\t"
780 "wpackhus wr9, wr10, wr11 \n\t"
781 "wstrd wr8, [%[block]] \n\t"
782 "wstrd wr9, [%[block], #8] \n\t"
783 "add %[block], %[block], %[line_size] \n\t"
785 // [wr0 wr1 wr2 wr3] <= *
787 "wldrd wr12, [%[pixels]] \n\t"
788 "wldrd wr13, [%[pixels], #8] \n\t"
789 "wldrd wr14, [%[pixels], #16] \n\t"
790 "add %[pixels], %[pixels], %[line_size] \n\t"
791 "walignr1 wr2, wr12, wr13 \n\t"
792 "pld [%[pixels]] \n\t"
793 "pld [%[pixels], #32] \n\t"
794 "walignr1 wr3, wr13, wr14 \n\t"
795 "wmoveq wr10, wr13 \n\t"
796 "wmoveq wr11, wr14 \n\t"
797 "walignr2ne wr10, wr12, wr13 \n\t"
798 "walignr2ne wr11, wr13, wr14 \n\t"
799 "wunpckelub wr0, wr2 \n\t"
800 "wunpckehub wr1, wr2 \n\t"
801 "wunpckelub wr2, wr3 \n\t"
802 "wunpckehub wr3, wr3 \n\t"
803 "wunpckelub wr8, wr10 \n\t"
804 "wunpckehub wr9, wr10 \n\t"
805 "wunpckelub wr10, wr11 \n\t"
806 "wunpckehub wr11, wr11 \n\t"
807 "waddhus wr0, wr0, wr8 \n\t"
808 "waddhus wr1, wr1, wr9 \n\t"
809 "waddhus wr2, wr2, wr10 \n\t"
810 "waddhus wr3, wr3, wr11 \n\t"
811 "waddhus wr8, wr0, wr4 \n\t"
812 "waddhus wr9, wr1, wr5 \n\t"
813 "waddhus wr10, wr2, wr6 \n\t"
814 "waddhus wr11, wr3, wr7 \n\t"
815 "waddhus wr8, wr8, wr15 \n\t"
816 "waddhus wr9, wr9, wr15 \n\t"
817 "waddhus wr10, wr10, wr15 \n\t"
818 "waddhus wr11, wr11, wr15 \n\t"
819 "wsrlhg wr8, wr8, wcgr0 \n\t"
820 "wsrlhg wr9, wr9, wcgr0 \n\t"
821 "wsrlhg wr10, wr10, wcgr0 \n\t"
822 "wsrlhg wr11, wr11, wcgr0 \n\t"
823 "wpackhus wr8, wr8, wr9 \n\t"
824 "wpackhus wr9, wr10, wr11 \n\t"
825 "wstrd wr8, [%[block]] \n\t"
826 "wstrd wr9, [%[block], #8] \n\t"
827 "add %[block], %[block], %[line_size] \n\t"
829 "subs %[h], %[h], #2 \n\t"
831 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
)
832 : [line_size
]"r"(line_size
)
836 void DEF(avg
, pixels8_xy2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
838 // [wr0 wr1 wr2 wr3] for previous line
839 // [wr4 wr5 wr6 wr7] for current line
840 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
841 __asm__
__volatile__(
842 "pld [%[block]] \n\t"
843 "pld [%[block], #32] \n\t"
844 "pld [%[pixels]] \n\t"
846 "pld [%[pixels], #32] \n\t"
847 "tmcr wcgr0, r12 \n\t" /* for shift value */
848 "and r12, %[pixels], #7 \n\t"
849 "bic %[pixels], %[pixels], #7 \n\t"
850 "tmcr wcgr1, r12 \n\t"
852 // [wr0 wr1 wr2 wr3] <= *
854 "wldrd wr12, [%[pixels]] \n\t"
855 "add r12, r12, #1 \n\t"
856 "wldrd wr13, [%[pixels], #8] \n\t"
857 "tmcr wcgr2, r12 \n\t"
858 "add %[pixels], %[pixels], %[line_size] \n\t"
860 "pld [%[pixels]] \n\t"
861 "pld [%[pixels], #32] \n\t"
862 "walignr1 wr2, wr12, wr13 \n\t"
863 "wmoveq wr10, wr13 \n\t"
864 "walignr2ne wr10, wr12, wr13 \n\t"
865 "wunpckelub wr0, wr2 \n\t"
866 "wunpckehub wr1, wr2 \n\t"
867 "wunpckelub wr8, wr10 \n\t"
868 "wunpckehub wr9, wr10 \n\t"
869 "waddhus wr0, wr0, wr8 \n\t"
870 "waddhus wr1, wr1, wr9 \n\t"
874 // [wr4 wr5 wr6 wr7] <= *
875 "wldrd wr12, [%[pixels]] \n\t"
877 "wldrd wr13, [%[pixels], #8] \n\t"
878 "add %[pixels], %[pixels], %[line_size] \n\t"
879 "walignr1 wr6, wr12, wr13 \n\t"
880 "pld [%[pixels]] \n\t"
881 "pld [%[pixels], #32] \n\t"
882 "wmoveq wr10, wr13 \n\t"
883 "walignr2ne wr10, wr12, wr13 \n\t"
884 "wunpckelub wr4, wr6 \n\t"
885 "wunpckehub wr5, wr6 \n\t"
886 "wunpckelub wr8, wr10 \n\t"
887 "wunpckehub wr9, wr10 \n\t"
888 "waddhus wr4, wr4, wr8 \n\t"
889 "waddhus wr5, wr5, wr9 \n\t"
890 "waddhus wr8, wr0, wr4 \n\t"
891 "waddhus wr9, wr1, wr5 \n\t"
892 "waddhus wr8, wr8, wr15 \n\t"
893 "waddhus wr9, wr9, wr15 \n\t"
894 "wldrd wr12, [%[block]] \n\t"
895 "wsrlhg wr8, wr8, wcgr0 \n\t"
896 "wsrlhg wr9, wr9, wcgr0 \n\t"
897 "wpackhus wr8, wr8, wr9 \n\t"
898 WAVG2B
" wr8, wr8, wr12 \n\t"
899 "wstrd wr8, [%[block]] \n\t"
900 "add %[block], %[block], %[line_size] \n\t"
901 "wldrd wr12, [%[pixels]] \n\t"
902 "pld [%[block]] \n\t"
903 "pld [%[block], #32] \n\t"
905 // [wr0 wr1 wr2 wr3] <= *
907 "wldrd wr13, [%[pixels], #8] \n\t"
908 "add %[pixels], %[pixels], %[line_size] \n\t"
909 "walignr1 wr2, wr12, wr13 \n\t"
910 "pld [%[pixels]] \n\t"
911 "pld [%[pixels], #32] \n\t"
912 "wmoveq wr10, wr13 \n\t"
913 "walignr2ne wr10, wr12, wr13 \n\t"
914 "wunpckelub wr0, wr2 \n\t"
915 "wunpckehub wr1, wr2 \n\t"
916 "wunpckelub wr8, wr10 \n\t"
917 "wunpckehub wr9, wr10 \n\t"
918 "waddhus wr0, wr0, wr8 \n\t"
919 "waddhus wr1, wr1, wr9 \n\t"
920 "waddhus wr8, wr0, wr4 \n\t"
921 "waddhus wr9, wr1, wr5 \n\t"
922 "waddhus wr8, wr8, wr15 \n\t"
923 "waddhus wr9, wr9, wr15 \n\t"
924 "wldrd wr12, [%[block]] \n\t"
925 "wsrlhg wr8, wr8, wcgr0 \n\t"
926 "wsrlhg wr9, wr9, wcgr0 \n\t"
927 "wpackhus wr8, wr8, wr9 \n\t"
928 "subs %[h], %[h], #2 \n\t"
929 WAVG2B
" wr8, wr8, wr12 \n\t"
930 "wstrd wr8, [%[block]] \n\t"
931 "add %[block], %[block], %[line_size] \n\t"
932 "pld [%[block]] \n\t"
933 "pld [%[block], #32] \n\t"
935 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
)
936 : [line_size
]"r"(line_size
)
940 void DEF(avg
, pixels16_xy2
)(uint8_t *block
, const uint8_t *pixels
, const int line_size
, int h
)
942 // [wr0 wr1 wr2 wr3] for previous line
943 // [wr4 wr5 wr6 wr7] for current line
944 SET_RND(wr15
); // =2 for rnd and =1 for no_rnd version
945 __asm__
__volatile__(
946 "pld [%[block]] \n\t"
947 "pld [%[block], #32] \n\t"
948 "pld [%[pixels]] \n\t"
950 "pld [%[pixels], #32] \n\t"
951 "tmcr wcgr0, r12 \n\t" /* for shift value */
953 "and r12, %[pixels], #7 \n\t"
954 "bic %[pixels], %[pixels], #7 \n\t"
955 "tmcr wcgr1, r12 \n\t"
956 "add r12, r12, #1 \n\t"
957 "tmcr wcgr2, r12 \n\t"
959 // [wr0 wr1 wr2 wr3] <= *
961 "wldrd wr12, [%[pixels]] \n\t"
963 "wldrd wr13, [%[pixels], #8] \n\t"
964 "wldrd wr14, [%[pixels], #16] \n\t"
965 "add %[pixels], %[pixels], %[line_size] \n\t"
966 "pld [%[pixels]] \n\t"
967 "walignr1 wr2, wr12, wr13 \n\t"
968 "pld [%[pixels], #32] \n\t"
969 "walignr1 wr3, wr13, wr14 \n\t"
970 "wmoveq wr10, wr13 \n\t"
971 "wmoveq wr11, wr14 \n\t"
972 "walignr2ne wr10, wr12, wr13 \n\t"
973 "walignr2ne wr11, wr13, wr14 \n\t"
974 "wunpckelub wr0, wr2 \n\t"
975 "wunpckehub wr1, wr2 \n\t"
976 "wunpckelub wr2, wr3 \n\t"
977 "wunpckehub wr3, wr3 \n\t"
978 "wunpckelub wr8, wr10 \n\t"
979 "wunpckehub wr9, wr10 \n\t"
980 "wunpckelub wr10, wr11 \n\t"
981 "wunpckehub wr11, wr11 \n\t"
982 "waddhus wr0, wr0, wr8 \n\t"
983 "waddhus wr1, wr1, wr9 \n\t"
984 "waddhus wr2, wr2, wr10 \n\t"
985 "waddhus wr3, wr3, wr11 \n\t"
989 // [wr4 wr5 wr6 wr7] <= *
990 "wldrd wr12, [%[pixels]] \n\t"
992 "wldrd wr13, [%[pixels], #8] \n\t"
993 "wldrd wr14, [%[pixels], #16] \n\t"
994 "add %[pixels], %[pixels], %[line_size] \n\t"
995 "walignr1 wr6, wr12, wr13 \n\t"
996 "pld [%[pixels]] \n\t"
997 "pld [%[pixels], #32] \n\t"
998 "walignr1 wr7, wr13, wr14 \n\t"
999 "wmoveq wr10, wr13 \n\t"
1000 "wmoveq wr11, wr14 \n\t"
1001 "walignr2ne wr10, wr12, wr13 \n\t"
1002 "walignr2ne wr11, wr13, wr14 \n\t"
1003 "wunpckelub wr4, wr6 \n\t"
1004 "wunpckehub wr5, wr6 \n\t"
1005 "wunpckelub wr6, wr7 \n\t"
1006 "wunpckehub wr7, wr7 \n\t"
1007 "wunpckelub wr8, wr10 \n\t"
1008 "wunpckehub wr9, wr10 \n\t"
1009 "wunpckelub wr10, wr11 \n\t"
1010 "wunpckehub wr11, wr11 \n\t"
1011 "waddhus wr4, wr4, wr8 \n\t"
1012 "waddhus wr5, wr5, wr9 \n\t"
1013 "waddhus wr6, wr6, wr10 \n\t"
1014 "waddhus wr7, wr7, wr11 \n\t"
1015 "waddhus wr8, wr0, wr4 \n\t"
1016 "waddhus wr9, wr1, wr5 \n\t"
1017 "waddhus wr10, wr2, wr6 \n\t"
1018 "waddhus wr11, wr3, wr7 \n\t"
1019 "waddhus wr8, wr8, wr15 \n\t"
1020 "waddhus wr9, wr9, wr15 \n\t"
1021 "waddhus wr10, wr10, wr15 \n\t"
1022 "waddhus wr11, wr11, wr15 \n\t"
1023 "wsrlhg wr8, wr8, wcgr0 \n\t"
1024 "wsrlhg wr9, wr9, wcgr0 \n\t"
1025 "wldrd wr12, [%[block]] \n\t"
1026 "wldrd wr13, [%[block], #8] \n\t"
1027 "wsrlhg wr10, wr10, wcgr0 \n\t"
1028 "wsrlhg wr11, wr11, wcgr0 \n\t"
1029 "wpackhus wr8, wr8, wr9 \n\t"
1030 "wpackhus wr9, wr10, wr11 \n\t"
1031 WAVG2B
" wr8, wr8, wr12 \n\t"
1032 WAVG2B
" wr9, wr9, wr13 \n\t"
1033 "wstrd wr8, [%[block]] \n\t"
1034 "wstrd wr9, [%[block], #8] \n\t"
1035 "add %[block], %[block], %[line_size] \n\t"
1037 // [wr0 wr1 wr2 wr3] <= *
1038 // [wr4 wr5 wr6 wr7]
1039 "wldrd wr12, [%[pixels]] \n\t"
1040 "pld [%[block]] \n\t"
1041 "wldrd wr13, [%[pixels], #8] \n\t"
1042 "pld [%[block], #32] \n\t"
1043 "wldrd wr14, [%[pixels], #16] \n\t"
1044 "add %[pixels], %[pixels], %[line_size] \n\t"
1045 "walignr1 wr2, wr12, wr13 \n\t"
1046 "pld [%[pixels]] \n\t"
1047 "pld [%[pixels], #32] \n\t"
1048 "walignr1 wr3, wr13, wr14 \n\t"
1049 "wmoveq wr10, wr13 \n\t"
1050 "wmoveq wr11, wr14 \n\t"
1051 "walignr2ne wr10, wr12, wr13 \n\t"
1052 "walignr2ne wr11, wr13, wr14 \n\t"
1053 "wunpckelub wr0, wr2 \n\t"
1054 "wunpckehub wr1, wr2 \n\t"
1055 "wunpckelub wr2, wr3 \n\t"
1056 "wunpckehub wr3, wr3 \n\t"
1057 "wunpckelub wr8, wr10 \n\t"
1058 "wunpckehub wr9, wr10 \n\t"
1059 "wunpckelub wr10, wr11 \n\t"
1060 "wunpckehub wr11, wr11 \n\t"
1061 "waddhus wr0, wr0, wr8 \n\t"
1062 "waddhus wr1, wr1, wr9 \n\t"
1063 "waddhus wr2, wr2, wr10 \n\t"
1064 "waddhus wr3, wr3, wr11 \n\t"
1065 "waddhus wr8, wr0, wr4 \n\t"
1066 "waddhus wr9, wr1, wr5 \n\t"
1067 "waddhus wr10, wr2, wr6 \n\t"
1068 "waddhus wr11, wr3, wr7 \n\t"
1069 "waddhus wr8, wr8, wr15 \n\t"
1070 "waddhus wr9, wr9, wr15 \n\t"
1071 "waddhus wr10, wr10, wr15 \n\t"
1072 "waddhus wr11, wr11, wr15 \n\t"
1073 "wsrlhg wr8, wr8, wcgr0 \n\t"
1074 "wsrlhg wr9, wr9, wcgr0 \n\t"
1075 "wldrd wr12, [%[block]] \n\t"
1076 "wldrd wr13, [%[block], #8] \n\t"
1077 "wsrlhg wr10, wr10, wcgr0 \n\t"
1078 "wsrlhg wr11, wr11, wcgr0 \n\t"
1079 "wpackhus wr8, wr8, wr9 \n\t"
1080 "wpackhus wr9, wr10, wr11 \n\t"
1081 WAVG2B
" wr8, wr8, wr12 \n\t"
1082 WAVG2B
" wr9, wr9, wr13 \n\t"
1083 "wstrd wr8, [%[block]] \n\t"
1084 "wstrd wr9, [%[block], #8] \n\t"
1085 "add %[block], %[block], %[line_size] \n\t"
1086 "subs %[h], %[h], #2 \n\t"
1087 "pld [%[block]] \n\t"
1088 "pld [%[block], #32] \n\t"
1090 : [h
]"+r"(h
), [pixels
]"+r"(pixels
), [block
]"+r"(block
)
1091 : [line_size
]"r"(line_size
)