Initial WebM release
[libvpx.git] / vpx_scale / dm642 / gen_scalers_c64.c
blob2126a753432dd3c84d77d74a9312942fb0cbf2b3
1 /*
2 * Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license and patent
5 * grant that can be found in the LICENSE file in the root of the source
6 * tree. All contributing project authors may be found in the AUTHORS
7 * file in the root of the source tree.
8 */
11 /****************************************************************************
13 * Module Title : gen_scalers.c
15 * Description : Generic image scaling functions.
17 ***************************************************************************/
19 /****************************************************************************
20 * Header Files
21 ****************************************************************************/
22 #include "vpx_scale/vpxscale.h"
24 /****************************************************************************
25 * Imports
26 ****************************************************************************/
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_4_5_scale_c64
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Number of source pixels to
 *                                                scale (used as loop bound).
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling up by 4 to 5.
 *
 *  SPECIAL NOTES : Pixels are consumed four at a time with a word load, and
 *                  five output bytes are written per group.  The low byte of
 *                  a loaded word is treated as the first pixel of the group
 *                  (des[0] is taken from src_current & 0xff), so the top
 *                  byte is the last pixel.  Lines narrower than one group
 *                  are left untouched.
 *
 ****************************************************************************/
static
void horizontal_line_4_5_scale_c64
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int ba, cb, dc, ed;
    unsigned char *restrict des = dest;
    unsigned int *restrict src = (unsigned int *)source;
    unsigned int const_51_205, const_102_154,
             const_205_51, const_154_102;

    unsigned int src_current, src_next;

    (void) dest_width;

    // Nothing sensible can be produced from less than one full group, and
    // the loop bound below must never be computed from a wrapped value.
    if (source_width < 4)
        return;

    // Constants that are to be used for the filtering.  For
    // best speed we are going to want to right shift by 16.
    // In the generic version they were shift by 8, so put
    // an extra 8 in now so that 16 will come out later.
    const_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // 5 points are needed to filter to give 5 output points.
    // A load can pull up 4 at a time, and one needs to be
    // "borrowed" from the next set of data.  So instead of
    // loading those 5 points each time, "steal" a point from
    // the next set and only load up 4 each time through.
    src_current = _mem4(src);

    // One iteration per group that has a following group; the final group
    // is handled after the loop.
    for (i = 4; i < source_width; i += 4)
    {
        // Advance BEFORE loading: the word wanted here is the one after
        // src_current.  A post-increment would reload the current word.
        src_next = _mem4(++src);

        // Reorder the data so that it is ready for the
        // dot product.
        ba = _unpklu4(src_current);
        cb = _unpkhu4(_rotl(src_current, 8));
        dc = _unpkhu4(src_current);
        ed = _unpkhu4(_shrmb(src_next, src_current));

        // Use the dot product with round and shift.
        des [0] = src_current & 0xff;
        des [1] = _dotprsu2(ba, const_205_51);
        des [2] = _dotprsu2(cb, const_154_102);
        des [3] = _dotprsu2(dc, const_102_154);
        des [4] = _dotprsu2(ed, const_51_205);

        des += 5;

        // Reuse loaded values next time around.
        src_current = src_next;
    }

    // Filter the last set of points.  Normally a point from the next set
    // would be used, but there is no next set, so just fill.
    ba = _unpklu4(src_current);
    cb = _unpkhu4(_rotl(src_current, 8));
    dc = _unpkhu4(src_current);

    des [0] = src_current & 0xff;
    des [1] = _dotprsu2(ba, const_205_51);
    des [2] = _dotprsu2(cb, const_154_102);
    des [3] = _dotprsu2(dc, const_102_154);
    // Fill with the LAST pixel of the group (top byte of the word), not the
    // first one.
    des [4] = (unsigned char)(src_current >> 24);
}
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_4_5_scale_c64
 *
 *  INPUTS        : unsigned char *dest     : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5.  For each
 *                  column, rows 0-3 and row 5 are read and rows 1-4 are
 *                  rewritten in place.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band (row 5).
 *                  NOTE(review): the column loads run one column ahead of
 *                  the stores, so column dest_width of every input row is
 *                  read (and discarded) on the final iteration.
 *
 ****************************************************************************/
static
void vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    // Filter weights, pre-shifted so the dot product's shift by 16 applies.
    const unsigned int taps_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const unsigned int taps_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const unsigned int taps_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const unsigned int taps_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Byte offsets of the rows touched, relative to the current column.
    const unsigned int row2 = dest_pitch * 2;
    const unsigned int row3 = dest_pitch * 3;
    const unsigned int row4 = dest_pitch * 4;
    const unsigned int row5 = dest_pitch * 5;

    unsigned char *restrict in  = dest;
    unsigned char *restrict out = dest;
    unsigned int p0, p1, p2, p3, p5;
    unsigned int col;

    // Prime the pipeline with column 0 so that each loop pass can load the
    // next column while filtering the current one.
    p0 = in [0];
    p1 = in [dest_pitch];
    p2 = in [row2];
    p3 = in [row3];
    p5 = in [row5];
    in++;

    for (col = 0; col < dest_width; col++)
    {
        // Pair up vertically adjacent pixels of the current column.
        const unsigned int pair_10 = _pack2(p1, p0);
        const unsigned int pair_21 = _pack2(p2, p1);
        const unsigned int pair_32 = _pack2(p3, p2);
        const unsigned int pair_53 = _pack2(p5, p3);

        // Fetch the next column before the current one is overwritten.
        p0 = in [0];
        p1 = in [dest_pitch];
        p2 = in [row2];
        p3 = in [row3];
        p5 = in [row5];
        in++;

        out [dest_pitch] = _dotprsu2(pair_10, taps_205_51);
        out [row2]       = _dotprsu2(pair_21, taps_154_102);
        out [row3]       = _dotprsu2(pair_32, taps_102_154);
        out [row4]       = _dotprsu2(pair_53, taps_51_205);

        out++;
    }
}
/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_c64
 *
 *  INPUTS        : unsigned char *dest     : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales last vertical band of pixels by scale 4 to 5.  For
 *                  each column, rows 0-3 are read, rows 1-3 are rewritten in
 *                  place and row 4 receives a copy of the column's original
 *                  row 3 pixel.
 *
 *  SPECIAL NOTES : The routine does not have available the first line of
 *                  the band below the current band, since this is the
 *                  last band.
 *                  NOTE(review): the column loads run one column ahead of
 *                  the stores, so column dest_width of every input row is
 *                  read (and discarded) on the final iteration.
 *
 ****************************************************************************/
static
void last_vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int i;
    unsigned int a, b, c, d;
    unsigned int ba, cb, dc;
    unsigned char *restrict src = dest;
    unsigned char *restrict des = dest;
    unsigned int const_102_154, const_205_51, const_154_102;

    const_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Pre-load column 0; the loop then loads one column ahead.
    a = src [0];
    b = src [dest_pitch];
    c = src [dest_pitch*2];
    d = src [dest_pitch*3];
    src ++;

    for (i = 0; i < dest_width; ++i)
    {
        ba = _pack2(b, a);
        cb = _pack2(c, b);
        dc = _pack2(d, c);

        // No line below to interpolate with, so row 4 repeats row 3 of THIS
        // column.  Store it now, while d still belongs to the current
        // column; the loads below replace d with the next column's pixel.
        des [dest_pitch*4] = (unsigned char) d;

        a = src [0];
        b = src [dest_pitch];
        c = src [dest_pitch*2];
        d = src [dest_pitch*3];
        src ++;

        des [dest_pitch]   = _dotprsu2(ba, const_205_51);
        des [dest_pitch*2] = _dotprsu2(cb, const_154_102);
        des [dest_pitch*3] = _dotprsu2(dc, const_102_154);

        des++;
    }
}
/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_3_5_scale_c64
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width   : Number of source pixels to
 *                                                scale (used as loop bound).
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width     : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling up by 3 to 5.
 *
 *  SPECIAL NOTES : Source pixels are consumed three at a time and five
 *                  output bytes are written per group.  The low byte of a
 *                  loaded word is treated as the first pixel of the group
 *                  (des[0] is taken from src_current & 0xff).  Lines
 *                  narrower than one group are left untouched.
 *
 ****************************************************************************/
static
void horizontal_line_3_5_scale_c64
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    unsigned int i;
    unsigned int ba, cb, dc;
    unsigned int src_current;
    unsigned char *restrict des = dest;
    unsigned char *restrict src = (unsigned char *)source;
    unsigned int const_51_205, const_102_154,
             const_205_51, const_154_102;

    (void) dest_width;

    // Guard the loop bound: with an unsigned width, "source_width - 3"
    // would wrap for lines shorter than one group.
    if (source_width < 3)
        return;

    const_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // One iteration per group that has a following group; the fourth byte
    // of each load is the first pixel of that following group.
    for (i = 3; i < source_width; i += 3)
    {
        src_current = _mem4(src);

        // Reorder the data so that it is ready for the
        // dot product.
        ba = _unpklu4(src_current);
        cb = _unpkhu4(_rotl(src_current, 8));
        dc = _unpkhu4(src_current);

        des [0] = src_current & 0xff;
        des [1] = _dotprsu2(ba, const_154_102);
        des [2] = _dotprsu2(cb, const_51_205);
        des [3] = _dotprsu2(cb, const_205_51);
        des [4] = _dotprsu2(dc, const_102_154);

        src += 3;
        des += 5;
    }

    // Last group: only three source pixels remain, so build the word from
    // individual bytes instead of a 4-byte load that would read one byte
    // past the end of the line.  Same byte order as the loads above.
    src_current = (unsigned int) src [0]
                  | ((unsigned int) src [1] << 8)
                  | ((unsigned int) src [2] << 16);

    ba = _unpklu4(src_current);
    cb = _unpkhu4(_rotl(src_current, 8));

    des [0] = src_current & 0xff;
    des [1] = _dotprsu2(ba, const_154_102);
    des [2] = _dotprsu2(cb, const_51_205);
    des [3] = _dotprsu2(cb, const_205_51);
    // No next group to blend with: repeat the third (last) pixel.
    des [4] = (src_current >> 16) & 0xff;
}
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_3_5_scale_c64
 *
 *  INPUTS        : unsigned char *dest     : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5.  For each
 *                  column, rows 0-2 and row 5 are read and rows 1-4 are
 *                  rewritten in place.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band (row 5).
 *                  NOTE(review): the column loads run one column ahead of
 *                  the stores, so column dest_width of every input row is
 *                  read (and discarded) on the final iteration.
 *
 ****************************************************************************/
static
void vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    // Filter weights, pre-shifted so the dot product's shift by 16 applies.
    const unsigned int taps_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const unsigned int taps_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const unsigned int taps_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
    const unsigned int taps_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Byte offsets of the rows touched, relative to the current column.
    const unsigned int row2 = dest_pitch * 2;
    const unsigned int row3 = dest_pitch * 3;
    const unsigned int row4 = dest_pitch * 4;
    const unsigned int row5 = dest_pitch * 5;

    unsigned char *restrict in  = dest;
    unsigned char *restrict out = dest;
    unsigned int p0, p1, p2, p5;
    unsigned int col;

    // Prime the pipeline with column 0.
    p0 = in [0];
    p1 = in [dest_pitch];
    p2 = in [row2];
    p5 = in [row5];
    in++;

    for (col = 0; col < dest_width; col++)
    {
        // Pair up the pixels of the current column for the dot products.
        const unsigned int pair_10 = _pack2(p1, p0);
        const unsigned int pair_21 = _pack2(p2, p1);
        const unsigned int pair_52 = _pack2(p5, p2);

        // Fetch the next column before the current one is overwritten.
        p0 = in [0];
        p1 = in [dest_pitch];
        p2 = in [row2];
        p5 = in [row5];
        in++;

        out [dest_pitch] = _dotprsu2(pair_10, taps_154_102);
        out [row2]       = _dotprsu2(pair_21, taps_51_205);
        out [row3]       = _dotprsu2(pair_21, taps_205_51);
        out [row4]       = _dotprsu2(pair_52, taps_102_154);

        out++;
    }
}
/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_3_5_scale_c64
 *
 *  INPUTS        : unsigned char *dest     : Pointer to destination data.
 *                  unsigned int dest_pitch : Stride of destination data.
 *                  unsigned int dest_width : Width of destination data.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 5.  For
 *                  each column, rows 0-2 are read, rows 1-3 are rewritten in
 *                  place and row 4 receives a copy of the column's original
 *                  row 2 pixel.
 *
 *  SPECIAL NOTES : The routine does not have available the first line of
 *                  the band below the current band, since this is the
 *                  last band.
 *                  NOTE(review): the column loads run one column ahead of
 *                  the stores, so column dest_width of every input row is
 *                  read (and discarded) on the final iteration.
 *
 ****************************************************************************/
static
void last_vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    unsigned int i;
    unsigned int a, b, c;
    unsigned int ba, cb;
    unsigned char *restrict src = dest;
    unsigned char *restrict des = dest;
    unsigned int const_51_205, const_205_51, const_154_102;

    const_51_205  = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
    const_205_51  = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);

    // Pre-load column 0; the loop then loads one column ahead.
    a = src [0];
    b = src [dest_pitch];
    c = src [dest_pitch*2];
    src ++;

    for (i = 0; i < dest_width; ++i)
    {
        ba = _pack2(b, a);
        cb = _pack2(c, b);

        // No line below to interpolate with, so row 4 repeats row 2 of THIS
        // column.  Store it now, while c still belongs to the current
        // column; the loads below replace c with the next column's pixel.
        des [dest_pitch*4] = (unsigned char)(c);

        a = src [0];
        b = src [dest_pitch];
        c = src [dest_pitch*2];
        src ++;

        des [dest_pitch]   = _dotprsu2(ba, const_154_102);
        des [dest_pitch*2] = _dotprsu2(cb, const_51_205);
        des [dest_pitch*3] = _dotprsu2(cb, const_205_51);

        des++;
    }
}
447 /****************************************************************************
449 * ROUTINE : horizontal_line_1_2_scale_c64
451 * INPUTS : const unsigned char *source : Pointer to source data.
452 * unsigned int source_width : Stride of source.
453 * unsigned char *dest : Pointer to destination data.
454 * unsigned int dest_width : Stride of destination (NOT USED).
456 * OUTPUTS : None.
458 * RETURNS : void
460 * FUNCTION : Copies horizontal line of pixels from source to
461 * destination scaling up by 1 to 2.
463 * SPECIAL NOTES : source width must be a multiple of 4.
465 ****************************************************************************/
466 void horizontal_line_1_2_scale_c64
468 const unsigned char *source,
469 unsigned int source_width,
470 unsigned char *dest,
471 unsigned int dest_width
474 unsigned int i;
475 unsigned char *restrict des = dest;
476 unsigned char *restrict src = (unsigned char *)source;
477 unsigned int src7_4i, src4_1i, src3_0i;
478 unsigned int a4_0i, ahi, alo;
479 double src7_0d, src3_0d;
480 const unsigned int k01 = 0x01010101;
482 for (i = 0; i < source_width / 4; i += 1)
484 // Load up the data from src. Here a wide load is
485 // used to get 8 bytes at once, only 5 will be used
486 // for the actual computation.
487 src7_0d = _memd8(src);
488 src3_0i = _lo(src7_0d);
489 src7_4i = _hi(src7_0d);
491 // Need to average between points. Shift byte 5 into
492 // the lower word. This will result in bytes 5-1
493 // averaged with 4-0.
494 src4_1i = _shrmb(src7_4i, src3_0i);
495 a4_0i = _avgu4(src4_1i, src3_0i);
497 // Expand the data out. Could do an unpack, however
498 // all but the multiply units are getting pretty hard
499 // here the multiply unit can take some of the computations.
500 src3_0d = _mpyu4(src3_0i, k01);
502 // The averages need to be unpacked so that they are in 16
503 // bit form and will be able to be interleaved with the
504 // original data
505 ahi = _unpkhu4(a4_0i);
506 alo = _unpklu4(a4_0i);
508 ahi = _swap4(ahi);
509 alo = _swap4(alo);
511 // Mix the average result in with the orginal data.
512 ahi = _hi(src3_0d) | ahi;
513 alo = _lo(src3_0d) | alo;
515 _memd8(des) = _itod(ahi, alo);
517 des += 8;
518 src += 4;
523 /****************************************************************************
525 * ROUTINE : vertical_band_1_2_scale_c64
527 * INPUTS : unsigned char *dest : Pointer to destination data.
528 * unsigned int dest_pitch : Stride of destination data.
529 * unsigned int dest_width : Width of destination data.
531 * OUTPUTS : None.
533 * RETURNS : void
535 * FUNCTION : Scales vertical band of pixels by scale 1 to 2. The
536 * height of the band scaled is 1-pixel.
538 * SPECIAL NOTES : The routine uses the first line of the band below
539 * the current band.
540 * Destination width must be a multiple of 4. Because the
541 * intput must be, therefore the output must be.
543 ****************************************************************************/
544 static
545 void vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
547 unsigned int i;
548 unsigned int a, b;
549 unsigned int *restrict line_a = (unsigned int *)dest;
550 unsigned int *restrict line_b = (unsigned int *)(dest + (dest_pitch * 2));
551 unsigned int *restrict des = (unsigned int *)(dest + dest_pitch);
553 for (i = 0; i < dest_width / 4; i++)
555 a = _mem4(line_a++);
556 b = _mem4(line_b++);
558 _mem4(des++) = _avgu4(a, b);
562 /****************************************************************************
564 * ROUTINE : last_vertical_band_1_2_scale_c64
566 * INPUTS : unsigned char *dest : Pointer to destination data.
567 * unsigned int dest_pitch : Stride of destination data.
568 * unsigned int dest_width : Width of destination data.
570 * OUTPUTS : None.
572 * RETURNS : void
574 * FUNCTION : Scales last vertical band of pixels by scale 1 to 2. The
575 * height of the band scaled is 1-pixel.
577 * SPECIAL NOTES : The routine does not have available the first line of
578 * the band below the current band, since this is the
579 * last band. Again, width must be a multiple of 4.
581 ****************************************************************************/
582 static
583 void last_vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
585 unsigned int i;
586 unsigned int *restrict src = (unsigned int *)dest;
587 unsigned int *restrict des = (unsigned int *)(dest + dest_pitch);
589 for (i = 0; i < dest_width / 4; ++i)
591 _mem4(des++) = _mem4(src++);
595 void
596 register_generic_scalers(void)
598 vp8_horizontal_line_1_2_scale = horizontal_line_1_2_scale_c64;
599 vp8_vertical_band_1_2_scale = vertical_band_1_2_scale_c64;
600 vp8_last_vertical_band_1_2_scale = last_vertical_band_1_2_scale_c64;
601 vp8_horizontal_line_3_5_scale = horizontal_line_3_5_scale_c64;
602 vp8_vertical_band_3_5_scale = vertical_band_3_5_scale_c64;
603 vp8_last_vertical_band_3_5_scale = last_vertical_band_3_5_scale_c64;
604 vp8_horizontal_line_4_5_scale = horizontal_line_4_5_scale_c64;
605 vp8_vertical_band_4_5_scale = vertical_band_4_5_scale_c64;
606 vp8_last_vertical_band_4_5_scale = last_vertical_band_4_5_scale_c64;