Add VCS links
[debian-dgen.git] / scale2x / scale2x.c
blob2fd436837da1d57c25c279f5a173ea54e04e30aa
1 /*
2 * This file is part of the Scale2x project.
4 * Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 * This file contains a C and MMX implementation of the Scale2x effect.
24 * You can find a high-level description of the effect at:
26 * http://scale2x.sourceforge.net/
28 * Alternatively at the previous license terms, you are allowed to use this
29 * code in your program with these conditions:
30 * - the program is not used in commercial activities.
31 * - the whole source code of the program is released with the binary.
32 * - derivative works of the program are allowed.
35 #if HAVE_CONFIG_H
36 #include <config.h>
37 #endif
39 #include "scale2x.h"
41 #include <assert.h>
43 /***************************************************************************/
44 /* Scale2x C implementation */
46 /**
47 * Define the macro USE_SCALE_RANDOMWRITE to enable
48 * an optimized version which writes memory in random order.
49 * This version is a little faster if you write in system memory.
50 * But it's a lot slower if you write in video memory.
51 * So, enable it only if you are sure to never write directly in video memory.
53 /* #define USE_SCALE_RANDOMWRITE */
55 #ifdef USE_SCALE_RANDOMWRITE
57 static inline void scale2x_8_def_whole(scale2x_uint8* restrict dst0, scale2x_uint8* restrict dst1, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
59 assert(count >= 2);
61 /* first pixel */
62 if (src0[0] != src2[0] && src1[0] != src1[1]) {
63 dst0[0] = src1[0] == src0[0] ? src0[0] : src1[0];
64 dst0[1] = src1[1] == src0[0] ? src0[0] : src1[0];
65 dst1[0] = src1[0] == src2[0] ? src2[0] : src1[0];
66 dst1[1] = src1[1] == src2[0] ? src2[0] : src1[0];
67 } else {
68 dst0[0] = src1[0];
69 dst0[1] = src1[0];
70 dst1[0] = src1[0];
71 dst1[1] = src1[0];
73 ++src0;
74 ++src1;
75 ++src2;
76 dst0 += 2;
77 dst1 += 2;
79 /* central pixels */
80 count -= 2;
81 while (count) {
82 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
83 dst0[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
84 dst0[1] = src1[1] == src0[0] ? src0[0] : src1[0];
85 dst1[0] = src1[-1] == src2[0] ? src2[0] : src1[0];
86 dst1[1] = src1[1] == src2[0] ? src2[0] : src1[0];
87 } else {
88 dst0[0] = src1[0];
89 dst0[1] = src1[0];
90 dst1[0] = src1[0];
91 dst1[1] = src1[0];
94 ++src0;
95 ++src1;
96 ++src2;
97 dst0 += 2;
98 dst1 += 2;
99 --count;
102 /* last pixel */
103 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
104 dst0[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
105 dst0[1] = src1[0] == src0[0] ? src0[0] : src1[0];
106 dst1[0] = src1[-1] == src2[0] ? src2[0] : src1[0];
107 dst1[1] = src1[0] == src2[0] ? src2[0] : src1[0];
108 } else {
109 dst0[0] = src1[0];
110 dst0[1] = src1[0];
111 dst1[0] = src1[0];
112 dst1[1] = src1[0];
116 #else /* USE_SCALE_RANDOMWRITE */
118 static inline void scale2x_8_def_border(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
120 assert(count >= 2);
122 /* first pixel */
123 if (src0[0] != src2[0] && src1[0] != src1[1]) {
124 dst[0] = src1[0] == src0[0] ? src0[0] : src1[0];
125 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
126 } else {
127 dst[0] = src1[0];
128 dst[1] = src1[0];
130 ++src0;
131 ++src1;
132 ++src2;
133 dst += 2;
135 /* central pixels */
136 count -= 2;
137 while (count) {
138 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
139 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
140 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
141 } else {
142 dst[0] = src1[0];
143 dst[1] = src1[0];
146 ++src0;
147 ++src1;
148 ++src2;
149 dst += 2;
150 --count;
153 /* last pixel */
154 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
155 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
156 dst[1] = src1[0] == src0[0] ? src0[0] : src1[0];
157 } else {
158 dst[0] = src1[0];
159 dst[1] = src1[0];
163 static inline void scale2x_8_def_center(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
165 assert(count >= 2);
167 /* first pixel */
168 if (src0[0] != src2[0] && src1[0] != src1[1]) {
169 dst[0] = src1[0];
170 dst[1] = (src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1]) ? src1[1] : src1[0];
171 } else {
172 dst[0] = src1[0];
173 dst[1] = src1[0];
175 ++src0;
176 ++src1;
177 ++src2;
178 dst += 2;
180 /* central pixels */
181 count -= 2;
182 while (count) {
183 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
184 dst[0] = (src1[-1] == src0[0] && src1[0] != src2[-1]) || (src1[-1] == src2[0] && src1[0] != src0[-1]) ? src1[-1] : src1[0];
185 dst[1] = (src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1]) ? src1[1] : src1[0];
186 } else {
187 dst[0] = src1[0];
188 dst[1] = src1[0];
191 ++src0;
192 ++src1;
193 ++src2;
194 dst += 2;
195 --count;
198 /* last pixel */
199 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
200 dst[0] = (src1[-1] == src0[0] && src1[0] != src2[-1]) || (src1[-1] == src2[0] && src1[0] != src0[-1]) ? src1[-1] : src1[0];
201 dst[1] = src1[0];
202 } else {
203 dst[0] = src1[0];
204 dst[1] = src1[0];
208 #endif /* USE_SCALE_RANDOMWRITE */
210 #ifdef USE_SCALE_RANDOMWRITE
212 static inline void scale2x_16_def_whole(scale2x_uint16* restrict dst0, scale2x_uint16* restrict dst1, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
214 assert(count >= 2);
216 /* first pixel */
217 if (src0[0] != src2[0] && src1[0] != src1[1]) {
218 dst0[0] = src1[0] == src0[0] ? src0[0] : src1[0];
219 dst0[1] = src1[1] == src0[0] ? src0[0] : src1[0];
220 dst1[0] = src1[0] == src2[0] ? src2[0] : src1[0];
221 dst1[1] = src1[1] == src2[0] ? src2[0] : src1[0];
222 } else {
223 dst0[0] = src1[0];
224 dst0[1] = src1[0];
225 dst1[0] = src1[0];
226 dst1[1] = src1[0];
228 ++src0;
229 ++src1;
230 ++src2;
231 dst0 += 2;
232 dst1 += 2;
234 /* central pixels */
235 count -= 2;
236 while (count) {
237 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
238 dst0[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
239 dst0[1] = src1[1] == src0[0] ? src0[0] : src1[0];
240 dst1[0] = src1[-1] == src2[0] ? src2[0] : src1[0];
241 dst1[1] = src1[1] == src2[0] ? src2[0] : src1[0];
242 } else {
243 dst0[0] = src1[0];
244 dst0[1] = src1[0];
245 dst1[0] = src1[0];
246 dst1[1] = src1[0];
249 ++src0;
250 ++src1;
251 ++src2;
252 dst0 += 2;
253 dst1 += 2;
254 --count;
257 /* last pixel */
258 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
259 dst0[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
260 dst0[1] = src1[0] == src0[0] ? src0[0] : src1[0];
261 dst1[0] = src1[-1] == src2[0] ? src2[0] : src1[0];
262 dst1[1] = src1[0] == src2[0] ? src2[0] : src1[0];
263 } else {
264 dst0[0] = src1[0];
265 dst0[1] = src1[0];
266 dst1[0] = src1[0];
267 dst1[1] = src1[0];
271 #else /* USE_SCALE_RANDOMWRITE */
273 static inline void scale2x_16_def_border(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
275 assert(count >= 2);
277 /* first pixel */
278 if (src0[0] != src2[0] && src1[0] != src1[1]) {
279 dst[0] = src1[0] == src0[0] ? src0[0] : src1[0];
280 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
281 } else {
282 dst[0] = src1[0];
283 dst[1] = src1[0];
285 ++src0;
286 ++src1;
287 ++src2;
288 dst += 2;
290 /* central pixels */
291 count -= 2;
292 while (count) {
293 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
294 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
295 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
296 } else {
297 dst[0] = src1[0];
298 dst[1] = src1[0];
301 ++src0;
302 ++src1;
303 ++src2;
304 dst += 2;
305 --count;
308 /* last pixel */
309 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
310 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
311 dst[1] = src1[0] == src0[0] ? src0[0] : src1[0];
312 } else {
313 dst[0] = src1[0];
314 dst[1] = src1[0];
318 static inline void scale2x_16_def_center(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
320 assert(count >= 2);
322 /* first pixel */
323 if (src0[0] != src2[0] && src1[0] != src1[1]) {
324 dst[0] = src1[0];
325 dst[1] = (src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1]) ? src1[1] : src1[0];
326 } else {
327 dst[0] = src1[0];
328 dst[1] = src1[0];
330 ++src0;
331 ++src1;
332 ++src2;
333 dst += 2;
335 /* central pixels */
336 count -= 2;
337 while (count) {
338 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
339 dst[0] = (src1[-1] == src0[0] && src1[0] != src2[-1]) || (src1[-1] == src2[0] && src1[0] != src0[-1]) ? src1[-1] : src1[0];
340 dst[1] = (src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1]) ? src1[1] : src1[0];
341 } else {
342 dst[0] = src1[0];
343 dst[1] = src1[0];
346 ++src0;
347 ++src1;
348 ++src2;
349 dst += 2;
350 --count;
353 /* last pixel */
354 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
355 dst[0] = (src1[-1] == src0[0] && src1[0] != src2[-1]) || (src1[-1] == src2[0] && src1[0] != src0[-1]) ? src1[-1] : src1[0];
356 dst[1] = src1[0];
357 } else {
358 dst[0] = src1[0];
359 dst[1] = src1[0];
363 #endif /* USE_SCALE_RANDOMWRITE */
365 #ifdef USE_SCALE_RANDOMWRITE
367 static inline void scale2x_32_def_whole(scale2x_uint32* restrict dst0, scale2x_uint32* restrict dst1, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
369 assert(count >= 2);
371 /* first pixel */
372 if (src0[0] != src2[0] && src1[0] != src1[1]) {
373 dst0[0] = src1[0] == src0[0] ? src0[0] : src1[0];
374 dst0[1] = src1[1] == src0[0] ? src0[0] : src1[0];
375 dst1[0] = src1[0] == src2[0] ? src2[0] : src1[0];
376 dst1[1] = src1[1] == src2[0] ? src2[0] : src1[0];
377 } else {
378 dst0[0] = src1[0];
379 dst0[1] = src1[0];
380 dst1[0] = src1[0];
381 dst1[1] = src1[0];
383 ++src0;
384 ++src1;
385 ++src2;
386 dst0 += 2;
387 dst1 += 2;
389 /* central pixels */
390 count -= 2;
391 while (count) {
392 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
393 dst0[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
394 dst0[1] = src1[1] == src0[0] ? src0[0] : src1[0];
395 dst1[0] = src1[-1] == src2[0] ? src2[0] : src1[0];
396 dst1[1] = src1[1] == src2[0] ? src2[0] : src1[0];
397 } else {
398 dst0[0] = src1[0];
399 dst0[1] = src1[0];
400 dst1[0] = src1[0];
401 dst1[1] = src1[0];
404 ++src0;
405 ++src1;
406 ++src2;
407 dst0 += 2;
408 dst1 += 2;
409 --count;
412 /* last pixel */
413 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
414 dst0[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
415 dst0[1] = src1[0] == src0[0] ? src0[0] : src1[0];
416 dst1[0] = src1[-1] == src2[0] ? src2[0] : src1[0];
417 dst1[1] = src1[0] == src2[0] ? src2[0] : src1[0];
418 } else {
419 dst0[0] = src1[0];
420 dst0[1] = src1[0];
421 dst1[0] = src1[0];
422 dst1[1] = src1[0];
426 #else /* USE_SCALE_RANDOMWRITE */
428 static inline void scale2x_32_def_border(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
430 assert(count >= 2);
432 /* first pixel */
433 if (src0[0] != src2[0] && src1[0] != src1[1]) {
434 dst[0] = src1[0] == src0[0] ? src0[0] : src1[0];
435 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
436 } else {
437 dst[0] = src1[0];
438 dst[1] = src1[0];
440 ++src0;
441 ++src1;
442 ++src2;
443 dst += 2;
445 /* central pixels */
446 count -= 2;
447 while (count) {
448 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
449 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
450 dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
451 } else {
452 dst[0] = src1[0];
453 dst[1] = src1[0];
456 ++src0;
457 ++src1;
458 ++src2;
459 dst += 2;
460 --count;
463 /* last pixel */
464 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
465 dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
466 dst[1] = src1[0] == src0[0] ? src0[0] : src1[0];
467 } else {
468 dst[0] = src1[0];
469 dst[1] = src1[0];
473 static inline void scale2x_32_def_center(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
475 assert(count >= 2);
477 /* first pixel */
478 if (src0[0] != src2[0] && src1[0] != src1[1]) {
479 dst[0] = src1[0];
480 dst[1] = (src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1]) ? src1[1] : src1[0];
481 } else {
482 dst[0] = src1[0];
483 dst[1] = src1[0];
485 ++src0;
486 ++src1;
487 ++src2;
488 dst += 2;
490 /* central pixels */
491 count -= 2;
492 while (count) {
493 if (src0[0] != src2[0] && src1[-1] != src1[1]) {
494 dst[0] = (src1[-1] == src0[0] && src1[0] != src2[-1]) || (src1[-1] == src2[0] && src1[0] != src0[-1]) ? src1[-1] : src1[0];
495 dst[1] = (src1[1] == src0[0] && src1[0] != src2[1]) || (src1[1] == src2[0] && src1[0] != src0[1]) ? src1[1] : src1[0];
496 } else {
497 dst[0] = src1[0];
498 dst[1] = src1[0];
501 ++src0;
502 ++src1;
503 ++src2;
504 dst += 2;
505 --count;
508 /* last pixel */
509 if (src0[0] != src2[0] && src1[-1] != src1[0]) {
510 dst[0] = (src1[-1] == src0[0] && src1[0] != src2[-1]) || (src1[-1] == src2[0] && src1[0] != src0[-1]) ? src1[-1] : src1[0];
511 dst[1] = src1[0];
512 } else {
513 dst[0] = src1[0];
514 dst[1] = src1[0];
518 #endif /* USE_SCALE_RANDOMWRITE */
521 * Scale by a factor of 2 a row of pixels of 8 bits.
522 * The function is implemented in C.
523 * The pixels over the left and right borders are assumed of the same color of
524 * the pixels on the border.
525 * Note that the implementation is optimized to write data sequentially to
526 * maximize the bandwidth on video memory.
527 * \param src0 Pointer at the first pixel of the previous row.
528 * \param src1 Pointer at the first pixel of the current row.
529 * \param src2 Pointer at the first pixel of the next row.
530 * \param count Length in pixels of the src0, src1 and src2 rows.
531 * It must be at least 2.
532 * \param dst0 First destination row, double length in pixels.
533 * \param dst1 Second destination row, double length in pixels.
535 void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
537 #ifdef USE_SCALE_RANDOMWRITE
538 scale2x_8_def_whole(dst0, dst1, src0, src1, src2, count);
539 #else
540 scale2x_8_def_border(dst0, src0, src1, src2, count);
541 scale2x_8_def_border(dst1, src2, src1, src0, count);
542 #endif
546 * Scale by a factor of 2 a row of pixels of 16 bits.
547 * This function operates like scale2x_8_def() but for 16 bits pixels.
548 * \param src0 Pointer at the first pixel of the previous row.
549 * \param src1 Pointer at the first pixel of the current row.
550 * \param src2 Pointer at the first pixel of the next row.
551 * \param count Length in pixels of the src0, src1 and src2 rows.
552 * It must be at least 2.
553 * \param dst0 First destination row, double length in pixels.
554 * \param dst1 Second destination row, double length in pixels.
556 void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
558 #ifdef USE_SCALE_RANDOMWRITE
559 scale2x_16_def_whole(dst0, dst1, src0, src1, src2, count);
560 #else
561 scale2x_16_def_border(dst0, src0, src1, src2, count);
562 scale2x_16_def_border(dst1, src2, src1, src0, count);
563 #endif
567 * Scale by a factor of 2 a row of pixels of 32 bits.
568 * This function operates like scale2x_8_def() but for 32 bits pixels.
569 * \param src0 Pointer at the first pixel of the previous row.
570 * \param src1 Pointer at the first pixel of the current row.
571 * \param src2 Pointer at the first pixel of the next row.
572 * \param count Length in pixels of the src0, src1 and src2 rows.
573 * It must be at least 2.
574 * \param dst0 First destination row, double length in pixels.
575 * \param dst1 Second destination row, double length in pixels.
577 void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
579 #ifdef USE_SCALE_RANDOMWRITE
580 scale2x_32_def_whole(dst0, dst1, src0, src1, src2, count);
581 #else
582 scale2x_32_def_border(dst0, src0, src1, src2, count);
583 scale2x_32_def_border(dst1, src2, src1, src0, count);
584 #endif
588 * Scale by a factor of 2x3 a row of pixels of 8 bits.
589 * \note Like scale2x_8_def();
591 void scale2x3_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
593 #ifdef USE_SCALE_RANDOMWRITE
594 scale2x_8_def_whole(dst0, dst2, src0, src1, src2, count);
595 scale2x_8_def_center(dst1, src0, src1, src2, count);
596 #else
597 scale2x_8_def_border(dst0, src0, src1, src2, count);
598 scale2x_8_def_center(dst1, src0, src1, src2, count);
599 scale2x_8_def_border(dst2, src2, src1, src0, count);
600 #endif
604 * Scale by a factor of 2x3 a row of pixels of 16 bits.
605 * \note Like scale2x_16_def();
607 void scale2x3_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
609 #ifdef USE_SCALE_RANDOMWRITE
610 scale2x_16_def_whole(dst0, dst2, src0, src1, src2, count);
611 scale2x_16_def_center(dst1, src0, src1, src2, count);
612 #else
613 scale2x_16_def_border(dst0, src0, src1, src2, count);
614 scale2x_16_def_center(dst1, src0, src1, src2, count);
615 scale2x_16_def_border(dst2, src2, src1, src0, count);
616 #endif
620 * Scale by a factor of 2x3 a row of pixels of 32 bits.
621 * \note Like scale2x_32_def();
623 void scale2x3_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
625 #ifdef USE_SCALE_RANDOMWRITE
626 scale2x_32_def_whole(dst0, dst2, src0, src1, src2, count);
627 scale2x_32_def_center(dst1, src0, src1, src2, count);
628 #else
629 scale2x_32_def_border(dst0, src0, src1, src2, count);
630 scale2x_32_def_center(dst1, src0, src1, src2, count);
631 scale2x_32_def_border(dst2, src2, src1, src0, count);
632 #endif
636 * Scale by a factor of 2x4 a row of pixels of 8 bits.
637 * \note Like scale2x_8_def();
639 void scale2x4_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
641 #ifdef USE_SCALE_RANDOMWRITE
642 scale2x_8_def_whole(dst0, dst3, src0, src1, src2, count);
643 scale2x_8_def_center(dst1, src0, src1, src2, count);
644 scale2x_8_def_center(dst2, src0, src1, src2, count);
645 #else
646 scale2x_8_def_border(dst0, src0, src1, src2, count);
647 scale2x_8_def_center(dst1, src0, src1, src2, count);
648 scale2x_8_def_center(dst2, src0, src1, src2, count);
649 scale2x_8_def_border(dst3, src2, src1, src0, count);
650 #endif
654 * Scale by a factor of 2x4 a row of pixels of 16 bits.
655 * \note Like scale2x_16_def();
657 void scale2x4_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
659 #ifdef USE_SCALE_RANDOMWRITE
660 scale2x_16_def_whole(dst0, dst3, src0, src1, src2, count);
661 scale2x_16_def_center(dst1, src0, src1, src2, count);
662 scale2x_16_def_center(dst2, src0, src1, src2, count);
663 #else
664 scale2x_16_def_border(dst0, src0, src1, src2, count);
665 scale2x_16_def_center(dst1, src0, src1, src2, count);
666 scale2x_16_def_center(dst2, src0, src1, src2, count);
667 scale2x_16_def_border(dst3, src2, src1, src0, count);
668 #endif
672 * Scale by a factor of 2x4 a row of pixels of 32 bits.
673 * \note Like scale2x_32_def();
675 void scale2x4_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
677 #ifdef USE_SCALE_RANDOMWRITE
678 scale2x_32_def_whole(dst0, dst3, src0, src1, src2, count);
679 scale2x_32_def_center(dst1, src0, src1, src2, count);
680 scale2x_32_def_center(dst2, src0, src1, src2, count);
681 #else
682 scale2x_32_def_border(dst0, src0, src1, src2, count);
683 scale2x_32_def_center(dst1, src0, src1, src2, count);
684 scale2x_32_def_center(dst2, src0, src1, src2, count);
685 scale2x_32_def_border(dst3, src2, src1, src0, count);
686 #endif
689 /***************************************************************************/
690 /* Scale2x MMX implementation */
692 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
695 * Apply the Scale2x effect at a single row.
696 * This function must be called only by the other scale2x functions.
698 * Considering the pixel map :
700 * ABC (src0)
701 * DEF (src1)
702 * GHI (src2)
704 * this function computes 2 new pixels to replace the source pixel E,
705 * as in this map :
707 * ab (dst)
709 * with these variables :
711 * &current -> E
712 * &current_left -> D
713 * &current_right -> F
714 * &current_upper -> B
715 * &current_lower -> H
717 * %0 -> current_upper
718 * %1 -> current
719 * %2 -> current_lower
720 * %3 -> dst
721 * %4 -> counter
723 * %mm0 -> *current_left
724 * %mm1 -> *current_next
725 * %mm2 -> tmp0
726 * %mm3 -> tmp1
727 * %mm4 -> tmp2
728 * %mm5 -> tmp3
729 * %mm6 -> *current_upper
730 * %mm7 -> *current
732 static inline void scale2x_8_mmx_border(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
734 assert(count >= 16);
735 assert(count % 8 == 0);
737 /* always do the first and last run */
738 count -= 2*8;
740 __asm__ __volatile__(
741 /* first run */
742 /* set the current, current_pre, current_next registers */
743 "movq 0(%1), %%mm0\n"
744 "movq 0(%1), %%mm7\n"
745 "movq 8(%1), %%mm1\n"
746 "psllq $56, %%mm0\n"
747 "psllq $56, %%mm1\n"
748 "psrlq $56, %%mm0\n"
749 "movq %%mm7, %%mm2\n"
750 "movq %%mm7, %%mm3\n"
751 "psllq $8, %%mm2\n"
752 "psrlq $8, %%mm3\n"
753 "por %%mm2, %%mm0\n"
754 "por %%mm3, %%mm1\n"
756 /* current_upper */
757 "movq (%0), %%mm6\n"
759 /* compute the upper-left pixel for dst on %%mm2 */
760 /* compute the upper-right pixel for dst on %%mm4 */
761 "movq %%mm0, %%mm2\n"
762 "movq %%mm1, %%mm4\n"
763 "movq %%mm0, %%mm3\n"
764 "movq %%mm1, %%mm5\n"
765 "pcmpeqb %%mm6, %%mm2\n"
766 "pcmpeqb %%mm6, %%mm4\n"
767 "pcmpeqb (%2), %%mm3\n"
768 "pcmpeqb (%2), %%mm5\n"
769 "pandn %%mm2, %%mm3\n"
770 "pandn %%mm4, %%mm5\n"
771 "movq %%mm0, %%mm2\n"
772 "movq %%mm1, %%mm4\n"
773 "pcmpeqb %%mm1, %%mm2\n"
774 "pcmpeqb %%mm0, %%mm4\n"
775 "pandn %%mm3, %%mm2\n"
776 "pandn %%mm5, %%mm4\n"
777 "movq %%mm2, %%mm3\n"
778 "movq %%mm4, %%mm5\n"
779 "pand %%mm6, %%mm2\n"
780 "pand %%mm6, %%mm4\n"
781 "pandn %%mm7, %%mm3\n"
782 "pandn %%mm7, %%mm5\n"
783 "por %%mm3, %%mm2\n"
784 "por %%mm5, %%mm4\n"
786 /* set *dst */
787 "movq %%mm2, %%mm3\n"
788 "punpcklbw %%mm4, %%mm2\n"
789 "punpckhbw %%mm4, %%mm3\n"
790 "movq %%mm2, (%3)\n"
791 "movq %%mm3, 8(%3)\n"
793 /* next */
794 "add $8, %0\n"
795 "add $8, %1\n"
796 "add $8, %2\n"
797 "add $16, %3\n"
799 /* central runs */
800 "shr $3, %4\n"
801 "jz 1f\n"
803 "0:\n"
805 /* set the current, current_pre, current_next registers */
806 "movq -8(%1), %%mm0\n"
807 "movq (%1), %%mm7\n"
808 "movq 8(%1), %%mm1\n"
809 "psrlq $56, %%mm0\n"
810 "psllq $56, %%mm1\n"
811 "movq %%mm7, %%mm2\n"
812 "movq %%mm7, %%mm3\n"
813 "psllq $8, %%mm2\n"
814 "psrlq $8, %%mm3\n"
815 "por %%mm2, %%mm0\n"
816 "por %%mm3, %%mm1\n"
818 /* current_upper */
819 "movq (%0), %%mm6\n"
821 /* compute the upper-left pixel for dst on %%mm2 */
822 /* compute the upper-right pixel for dst on %%mm4 */
823 "movq %%mm0, %%mm2\n"
824 "movq %%mm1, %%mm4\n"
825 "movq %%mm0, %%mm3\n"
826 "movq %%mm1, %%mm5\n"
827 "pcmpeqb %%mm6, %%mm2\n"
828 "pcmpeqb %%mm6, %%mm4\n"
829 "pcmpeqb (%2), %%mm3\n"
830 "pcmpeqb (%2), %%mm5\n"
831 "pandn %%mm2, %%mm3\n"
832 "pandn %%mm4, %%mm5\n"
833 "movq %%mm0, %%mm2\n"
834 "movq %%mm1, %%mm4\n"
835 "pcmpeqb %%mm1, %%mm2\n"
836 "pcmpeqb %%mm0, %%mm4\n"
837 "pandn %%mm3, %%mm2\n"
838 "pandn %%mm5, %%mm4\n"
839 "movq %%mm2, %%mm3\n"
840 "movq %%mm4, %%mm5\n"
841 "pand %%mm6, %%mm2\n"
842 "pand %%mm6, %%mm4\n"
843 "pandn %%mm7, %%mm3\n"
844 "pandn %%mm7, %%mm5\n"
845 "por %%mm3, %%mm2\n"
846 "por %%mm5, %%mm4\n"
848 /* set *dst */
849 "movq %%mm2, %%mm3\n"
850 "punpcklbw %%mm4, %%mm2\n"
851 "punpckhbw %%mm4, %%mm3\n"
852 "movq %%mm2, (%3)\n"
853 "movq %%mm3, 8(%3)\n"
855 /* next */
856 "add $8, %0\n"
857 "add $8, %1\n"
858 "add $8, %2\n"
859 "add $16, %3\n"
861 "dec %4\n"
862 "jnz 0b\n"
863 "1:\n"
865 /* final run */
866 /* set the current, current_pre, current_next registers */
867 "movq (%1), %%mm1\n"
868 "movq (%1), %%mm7\n"
869 "movq -8(%1), %%mm0\n"
870 "psrlq $56, %%mm1\n"
871 "psrlq $56, %%mm0\n"
872 "psllq $56, %%mm1\n"
873 "movq %%mm7, %%mm2\n"
874 "movq %%mm7, %%mm3\n"
875 "psllq $8, %%mm2\n"
876 "psrlq $8, %%mm3\n"
877 "por %%mm2, %%mm0\n"
878 "por %%mm3, %%mm1\n"
880 /* current_upper */
881 "movq (%0), %%mm6\n"
883 /* compute the upper-left pixel for dst on %%mm2 */
884 /* compute the upper-right pixel for dst on %%mm4 */
885 "movq %%mm0, %%mm2\n"
886 "movq %%mm1, %%mm4\n"
887 "movq %%mm0, %%mm3\n"
888 "movq %%mm1, %%mm5\n"
889 "pcmpeqb %%mm6, %%mm2\n"
890 "pcmpeqb %%mm6, %%mm4\n"
891 "pcmpeqb (%2), %%mm3\n"
892 "pcmpeqb (%2), %%mm5\n"
893 "pandn %%mm2, %%mm3\n"
894 "pandn %%mm4, %%mm5\n"
895 "movq %%mm0, %%mm2\n"
896 "movq %%mm1, %%mm4\n"
897 "pcmpeqb %%mm1, %%mm2\n"
898 "pcmpeqb %%mm0, %%mm4\n"
899 "pandn %%mm3, %%mm2\n"
900 "pandn %%mm5, %%mm4\n"
901 "movq %%mm2, %%mm3\n"
902 "movq %%mm4, %%mm5\n"
903 "pand %%mm6, %%mm2\n"
904 "pand %%mm6, %%mm4\n"
905 "pandn %%mm7, %%mm3\n"
906 "pandn %%mm7, %%mm5\n"
907 "por %%mm3, %%mm2\n"
908 "por %%mm5, %%mm4\n"
910 /* set *dst */
911 "movq %%mm2, %%mm3\n"
912 "punpcklbw %%mm4, %%mm2\n"
913 "punpckhbw %%mm4, %%mm3\n"
914 "movq %%mm2, (%3)\n"
915 "movq %%mm3, 8(%3)\n"
917 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
919 : "cc"
923 static inline void scale2x_16_mmx_border(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
925 assert(count >= 8);
926 assert(count % 4 == 0);
928 /* always do the first and last run */
929 count -= 2*4;
931 __asm__ __volatile__(
932 /* first run */
933 /* set the current, current_pre, current_next registers */
934 "movq 0(%1), %%mm0\n"
935 "movq 0(%1), %%mm7\n"
936 "movq 8(%1), %%mm1\n"
937 "psllq $48, %%mm0\n"
938 "psllq $48, %%mm1\n"
939 "psrlq $48, %%mm0\n"
940 "movq %%mm7, %%mm2\n"
941 "movq %%mm7, %%mm3\n"
942 "psllq $16, %%mm2\n"
943 "psrlq $16, %%mm3\n"
944 "por %%mm2, %%mm0\n"
945 "por %%mm3, %%mm1\n"
947 /* current_upper */
948 "movq (%0), %%mm6\n"
950 /* compute the upper-left pixel for dst on %%mm2 */
951 /* compute the upper-right pixel for dst on %%mm4 */
952 "movq %%mm0, %%mm2\n"
953 "movq %%mm1, %%mm4\n"
954 "movq %%mm0, %%mm3\n"
955 "movq %%mm1, %%mm5\n"
956 "pcmpeqw %%mm6, %%mm2\n"
957 "pcmpeqw %%mm6, %%mm4\n"
958 "pcmpeqw (%2), %%mm3\n"
959 "pcmpeqw (%2), %%mm5\n"
960 "pandn %%mm2, %%mm3\n"
961 "pandn %%mm4, %%mm5\n"
962 "movq %%mm0, %%mm2\n"
963 "movq %%mm1, %%mm4\n"
964 "pcmpeqw %%mm1, %%mm2\n"
965 "pcmpeqw %%mm0, %%mm4\n"
966 "pandn %%mm3, %%mm2\n"
967 "pandn %%mm5, %%mm4\n"
968 "movq %%mm2, %%mm3\n"
969 "movq %%mm4, %%mm5\n"
970 "pand %%mm6, %%mm2\n"
971 "pand %%mm6, %%mm4\n"
972 "pandn %%mm7, %%mm3\n"
973 "pandn %%mm7, %%mm5\n"
974 "por %%mm3, %%mm2\n"
975 "por %%mm5, %%mm4\n"
977 /* set *dst */
978 "movq %%mm2, %%mm3\n"
979 "punpcklwd %%mm4, %%mm2\n"
980 "punpckhwd %%mm4, %%mm3\n"
981 "movq %%mm2, (%3)\n"
982 "movq %%mm3, 8(%3)\n"
984 /* next */
985 "add $8, %0\n"
986 "add $8, %1\n"
987 "add $8, %2\n"
988 "add $16, %3\n"
990 /* central runs */
991 "shr $2, %4\n"
992 "jz 1f\n"
994 "0:\n"
996 /* set the current, current_pre, current_next registers */
997 "movq -8(%1), %%mm0\n"
998 "movq (%1), %%mm7\n"
999 "movq 8(%1), %%mm1\n"
1000 "psrlq $48, %%mm0\n"
1001 "psllq $48, %%mm1\n"
1002 "movq %%mm7, %%mm2\n"
1003 "movq %%mm7, %%mm3\n"
1004 "psllq $16, %%mm2\n"
1005 "psrlq $16, %%mm3\n"
1006 "por %%mm2, %%mm0\n"
1007 "por %%mm3, %%mm1\n"
1009 /* current_upper */
1010 "movq (%0), %%mm6\n"
1012 /* compute the upper-left pixel for dst on %%mm2 */
1013 /* compute the upper-right pixel for dst on %%mm4 */
1014 "movq %%mm0, %%mm2\n"
1015 "movq %%mm1, %%mm4\n"
1016 "movq %%mm0, %%mm3\n"
1017 "movq %%mm1, %%mm5\n"
1018 "pcmpeqw %%mm6, %%mm2\n"
1019 "pcmpeqw %%mm6, %%mm4\n"
1020 "pcmpeqw (%2), %%mm3\n"
1021 "pcmpeqw (%2), %%mm5\n"
1022 "pandn %%mm2, %%mm3\n"
1023 "pandn %%mm4, %%mm5\n"
1024 "movq %%mm0, %%mm2\n"
1025 "movq %%mm1, %%mm4\n"
1026 "pcmpeqw %%mm1, %%mm2\n"
1027 "pcmpeqw %%mm0, %%mm4\n"
1028 "pandn %%mm3, %%mm2\n"
1029 "pandn %%mm5, %%mm4\n"
1030 "movq %%mm2, %%mm3\n"
1031 "movq %%mm4, %%mm5\n"
1032 "pand %%mm6, %%mm2\n"
1033 "pand %%mm6, %%mm4\n"
1034 "pandn %%mm7, %%mm3\n"
1035 "pandn %%mm7, %%mm5\n"
1036 "por %%mm3, %%mm2\n"
1037 "por %%mm5, %%mm4\n"
1039 /* set *dst */
1040 "movq %%mm2, %%mm3\n"
1041 "punpcklwd %%mm4, %%mm2\n"
1042 "punpckhwd %%mm4, %%mm3\n"
1043 "movq %%mm2, (%3)\n"
1044 "movq %%mm3, 8(%3)\n"
1046 /* next */
1047 "add $8, %0\n"
1048 "add $8, %1\n"
1049 "add $8, %2\n"
1050 "add $16, %3\n"
1052 "dec %4\n"
1053 "jnz 0b\n"
1054 "1:\n"
1056 /* final run */
1057 /* set the current, current_pre, current_next registers */
1058 "movq (%1), %%mm1\n"
1059 "movq (%1), %%mm7\n"
1060 "movq -8(%1), %%mm0\n"
1061 "psrlq $48, %%mm1\n"
1062 "psrlq $48, %%mm0\n"
1063 "psllq $48, %%mm1\n"
1064 "movq %%mm7, %%mm2\n"
1065 "movq %%mm7, %%mm3\n"
1066 "psllq $16, %%mm2\n"
1067 "psrlq $16, %%mm3\n"
1068 "por %%mm2, %%mm0\n"
1069 "por %%mm3, %%mm1\n"
1071 /* current_upper */
1072 "movq (%0), %%mm6\n"
1074 /* compute the upper-left pixel for dst on %%mm2 */
1075 /* compute the upper-right pixel for dst on %%mm4 */
1076 "movq %%mm0, %%mm2\n"
1077 "movq %%mm1, %%mm4\n"
1078 "movq %%mm0, %%mm3\n"
1079 "movq %%mm1, %%mm5\n"
1080 "pcmpeqw %%mm6, %%mm2\n"
1081 "pcmpeqw %%mm6, %%mm4\n"
1082 "pcmpeqw (%2), %%mm3\n"
1083 "pcmpeqw (%2), %%mm5\n"
1084 "pandn %%mm2, %%mm3\n"
1085 "pandn %%mm4, %%mm5\n"
1086 "movq %%mm0, %%mm2\n"
1087 "movq %%mm1, %%mm4\n"
1088 "pcmpeqw %%mm1, %%mm2\n"
1089 "pcmpeqw %%mm0, %%mm4\n"
1090 "pandn %%mm3, %%mm2\n"
1091 "pandn %%mm5, %%mm4\n"
1092 "movq %%mm2, %%mm3\n"
1093 "movq %%mm4, %%mm5\n"
1094 "pand %%mm6, %%mm2\n"
1095 "pand %%mm6, %%mm4\n"
1096 "pandn %%mm7, %%mm3\n"
1097 "pandn %%mm7, %%mm5\n"
1098 "por %%mm3, %%mm2\n"
1099 "por %%mm5, %%mm4\n"
1101 /* set *dst */
1102 "movq %%mm2, %%mm3\n"
1103 "punpcklwd %%mm4, %%mm2\n"
1104 "punpckhwd %%mm4, %%mm3\n"
1105 "movq %%mm2, (%3)\n"
1106 "movq %%mm3, 8(%3)\n"
1108 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
1110 : "cc"
1114 static inline void scale2x_32_mmx_border(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1116 assert(count >= 4);
1117 assert(count % 2 == 0);
1119 /* always do the first and last run */
1120 count -= 2*2;
1122 __asm__ __volatile__(
1123 /* first run */
1124 /* set the current, current_pre, current_next registers */
1125 "movq 0(%1), %%mm0\n"
1126 "movq 0(%1), %%mm7\n"
1127 "movq 8(%1), %%mm1\n"
1128 "psllq $32, %%mm0\n"
1129 "psllq $32, %%mm1\n"
1130 "psrlq $32, %%mm0\n"
1131 "movq %%mm7, %%mm2\n"
1132 "movq %%mm7, %%mm3\n"
1133 "psllq $32, %%mm2\n"
1134 "psrlq $32, %%mm3\n"
1135 "por %%mm2, %%mm0\n"
1136 "por %%mm3, %%mm1\n"
1138 /* current_upper */
1139 "movq (%0), %%mm6\n"
1141 /* compute the upper-left pixel for dst on %%mm2 */
1142 /* compute the upper-right pixel for dst on %%mm4 */
1143 "movq %%mm0, %%mm2\n"
1144 "movq %%mm1, %%mm4\n"
1145 "movq %%mm0, %%mm3\n"
1146 "movq %%mm1, %%mm5\n"
1147 "pcmpeqd %%mm6, %%mm2\n"
1148 "pcmpeqd %%mm6, %%mm4\n"
1149 "pcmpeqd (%2), %%mm3\n"
1150 "pcmpeqd (%2), %%mm5\n"
1151 "pandn %%mm2, %%mm3\n"
1152 "pandn %%mm4, %%mm5\n"
1153 "movq %%mm0, %%mm2\n"
1154 "movq %%mm1, %%mm4\n"
1155 "pcmpeqd %%mm1, %%mm2\n"
1156 "pcmpeqd %%mm0, %%mm4\n"
1157 "pandn %%mm3, %%mm2\n"
1158 "pandn %%mm5, %%mm4\n"
1159 "movq %%mm2, %%mm3\n"
1160 "movq %%mm4, %%mm5\n"
1161 "pand %%mm6, %%mm2\n"
1162 "pand %%mm6, %%mm4\n"
1163 "pandn %%mm7, %%mm3\n"
1164 "pandn %%mm7, %%mm5\n"
1165 "por %%mm3, %%mm2\n"
1166 "por %%mm5, %%mm4\n"
1168 /* set *dst */
1169 "movq %%mm2, %%mm3\n"
1170 "punpckldq %%mm4, %%mm2\n"
1171 "punpckhdq %%mm4, %%mm3\n"
1172 "movq %%mm2, (%3)\n"
1173 "movq %%mm3, 8(%3)\n"
1175 /* next */
1176 "add $8, %0\n"
1177 "add $8, %1\n"
1178 "add $8, %2\n"
1179 "add $16, %3\n"
1181 /* central runs */
1182 "shr $1, %4\n"
1183 "jz 1f\n"
1185 "0:\n"
1187 /* set the current, current_pre, current_next registers */
1188 "movq -8(%1), %%mm0\n"
1189 "movq (%1), %%mm7\n"
1190 "movq 8(%1), %%mm1\n"
1191 "psrlq $32, %%mm0\n"
1192 "psllq $32, %%mm1\n"
1193 "movq %%mm7, %%mm2\n"
1194 "movq %%mm7, %%mm3\n"
1195 "psllq $32, %%mm2\n"
1196 "psrlq $32, %%mm3\n"
1197 "por %%mm2, %%mm0\n"
1198 "por %%mm3, %%mm1\n"
1200 /* current_upper */
1201 "movq (%0), %%mm6\n"
1203 /* compute the upper-left pixel for dst on %%mm2 */
1204 /* compute the upper-right pixel for dst on %%mm4 */
1205 "movq %%mm0, %%mm2\n"
1206 "movq %%mm1, %%mm4\n"
1207 "movq %%mm0, %%mm3\n"
1208 "movq %%mm1, %%mm5\n"
1209 "pcmpeqd %%mm6, %%mm2\n"
1210 "pcmpeqd %%mm6, %%mm4\n"
1211 "pcmpeqd (%2), %%mm3\n"
1212 "pcmpeqd (%2), %%mm5\n"
1213 "pandn %%mm2, %%mm3\n"
1214 "pandn %%mm4, %%mm5\n"
1215 "movq %%mm0, %%mm2\n"
1216 "movq %%mm1, %%mm4\n"
1217 "pcmpeqd %%mm1, %%mm2\n"
1218 "pcmpeqd %%mm0, %%mm4\n"
1219 "pandn %%mm3, %%mm2\n"
1220 "pandn %%mm5, %%mm4\n"
1221 "movq %%mm2, %%mm3\n"
1222 "movq %%mm4, %%mm5\n"
1223 "pand %%mm6, %%mm2\n"
1224 "pand %%mm6, %%mm4\n"
1225 "pandn %%mm7, %%mm3\n"
1226 "pandn %%mm7, %%mm5\n"
1227 "por %%mm3, %%mm2\n"
1228 "por %%mm5, %%mm4\n"
1230 /* set *dst */
1231 "movq %%mm2, %%mm3\n"
1232 "punpckldq %%mm4, %%mm2\n"
1233 "punpckhdq %%mm4, %%mm3\n"
1234 "movq %%mm2, (%3)\n"
1235 "movq %%mm3, 8(%3)\n"
1237 /* next */
1238 "add $8, %0\n"
1239 "add $8, %1\n"
1240 "add $8, %2\n"
1241 "add $16, %3\n"
1243 "dec %4\n"
1244 "jnz 0b\n"
1245 "1:\n"
1247 /* final run */
1248 /* set the current, current_pre, current_next registers */
1249 "movq (%1), %%mm1\n"
1250 "movq (%1), %%mm7\n"
1251 "movq -8(%1), %%mm0\n"
1252 "psrlq $32, %%mm1\n"
1253 "psrlq $32, %%mm0\n"
1254 "psllq $32, %%mm1\n"
1255 "movq %%mm7, %%mm2\n"
1256 "movq %%mm7, %%mm3\n"
1257 "psllq $32, %%mm2\n"
1258 "psrlq $32, %%mm3\n"
1259 "por %%mm2, %%mm0\n"
1260 "por %%mm3, %%mm1\n"
1262 /* current_upper */
1263 "movq (%0), %%mm6\n"
1265 /* compute the upper-left pixel for dst on %%mm2 */
1266 /* compute the upper-right pixel for dst on %%mm4 */
1267 "movq %%mm0, %%mm2\n"
1268 "movq %%mm1, %%mm4\n"
1269 "movq %%mm0, %%mm3\n"
1270 "movq %%mm1, %%mm5\n"
1271 "pcmpeqd %%mm6, %%mm2\n"
1272 "pcmpeqd %%mm6, %%mm4\n"
1273 "pcmpeqd (%2), %%mm3\n"
1274 "pcmpeqd (%2), %%mm5\n"
1275 "pandn %%mm2, %%mm3\n"
1276 "pandn %%mm4, %%mm5\n"
1277 "movq %%mm0, %%mm2\n"
1278 "movq %%mm1, %%mm4\n"
1279 "pcmpeqd %%mm1, %%mm2\n"
1280 "pcmpeqd %%mm0, %%mm4\n"
1281 "pandn %%mm3, %%mm2\n"
1282 "pandn %%mm5, %%mm4\n"
1283 "movq %%mm2, %%mm3\n"
1284 "movq %%mm4, %%mm5\n"
1285 "pand %%mm6, %%mm2\n"
1286 "pand %%mm6, %%mm4\n"
1287 "pandn %%mm7, %%mm3\n"
1288 "pandn %%mm7, %%mm5\n"
1289 "por %%mm3, %%mm2\n"
1290 "por %%mm5, %%mm4\n"
1292 /* set *dst */
1293 "movq %%mm2, %%mm3\n"
1294 "punpckldq %%mm4, %%mm2\n"
1295 "punpckhdq %%mm4, %%mm3\n"
1296 "movq %%mm2, (%3)\n"
1297 "movq %%mm3, 8(%3)\n"
1299 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
1301 : "cc"
1306 * Scale by a factor of 2 a row of pixels of 8 bits.
1307 * This is a very fast MMX implementation.
1308 * The implementation uses a combination of cmp/and/not operations to
1309 * completly remove the need of conditional jumps. This trick give the
1310 * major speed improvement.
1311 * Also, using the 8 bytes MMX registers more than one pixel are computed
1312 * at the same time.
1313 * Before calling this function you must ensure that the currenct CPU supports
1314 * the MMX instruction set. After calling it you must be sure to call the EMMS
1315 * instruction before any floating-point operation.
1316 * The pixels over the left and right borders are assumed of the same color of
1317 * the pixels on the border.
1318 * Note that the implementation is optimized to write data sequentially to
1319 * maximize the bandwidth on video memory.
1320 * \param src0 Pointer at the first pixel of the previous row.
1321 * \param src1 Pointer at the first pixel of the current row.
1322 * \param src2 Pointer at the first pixel of the next row.
1323 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1324 * be at least 16 and a multiple of 8.
1325 * \param dst0 First destination row, double length in pixels.
1326 * \param dst1 Second destination row, double length in pixels.
1328 void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1330 if (count % 8 != 0 || count < 16) {
1331 scale2x_8_def(dst0, dst1, src0, src1, src2, count);
1332 } else {
1333 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1334 scale2x_8_mmx_border(dst1, src2, src1, src0, count);
1339 * Scale by a factor of 2 a row of pixels of 16 bits.
1340 * This function operates like scale2x_8_mmx() but for 16 bits pixels.
1341 * \param src0 Pointer at the first pixel of the previous row.
1342 * \param src1 Pointer at the first pixel of the current row.
1343 * \param src2 Pointer at the first pixel of the next row.
1344 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1345 * be at least 8 and a multiple of 4.
1346 * \param dst0 First destination row, double length in pixels.
1347 * \param dst1 Second destination row, double length in pixels.
1349 void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1351 if (count % 4 != 0 || count < 8) {
1352 scale2x_16_def(dst0, dst1, src0, src1, src2, count);
1353 } else {
1354 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1355 scale2x_16_mmx_border(dst1, src2, src1, src0, count);
1360 * Scale by a factor of 2 a row of pixels of 32 bits.
1361 * This function operates like scale2x_8_mmx() but for 32 bits pixels.
1362 * \param src0 Pointer at the first pixel of the previous row.
1363 * \param src1 Pointer at the first pixel of the current row.
1364 * \param src2 Pointer at the first pixel of the next row.
1365 * \param count Length in pixels of the src0, src1 and src2 rows. It must
1366 * be at least 4 and a multiple of 2.
1367 * \param dst0 First destination row, double length in pixels.
1368 * \param dst1 Second destination row, double length in pixels.
1370 void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1372 if (count % 2 != 0 || count < 4) {
1373 scale2x_32_def(dst0, dst1, src0, src1, src2, count);
1374 } else {
1375 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1376 scale2x_32_mmx_border(dst1, src2, src1, src0, count);
1381 * Scale by a factor of 2x3 a row of pixels of 8 bits.
1382 * This function operates like scale2x_8_mmx() but with an expansion
1383 * factor of 2x3 instead of 2x2.
1385 void scale2x3_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1387 if (count % 8 != 0 || count < 16) {
1388 scale2x3_8_def(dst0, dst1, dst2, src0, src1, src2, count);
1389 } else {
1390 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1391 scale2x_8_def_center(dst1, src0, src1, src2, count);
1392 scale2x_8_mmx_border(dst2, src2, src1, src0, count);
1397 * Scale by a factor of 2x3 a row of pixels of 16 bits.
1398 * This function operates like scale2x_16_mmx() but with an expansion
1399 * factor of 2x3 instead of 2x2.
1401 void scale2x3_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1403 if (count % 4 != 0 || count < 8) {
1404 scale2x3_16_def(dst0, dst1, dst2, src0, src1, src2, count);
1405 } else {
1406 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1407 scale2x_16_def_center(dst1, src0, src1, src2, count);
1408 scale2x_16_mmx_border(dst2, src2, src1, src0, count);
1413 * Scale by a factor of 2x3 a row of pixels of 32 bits.
1414 * This function operates like scale2x_32_mmx() but with an expansion
1415 * factor of 2x3 instead of 2x2.
1417 void scale2x3_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1419 if (count % 2 != 0 || count < 4) {
1420 scale2x3_32_def(dst0, dst1, dst2, src0, src1, src2, count);
1421 } else {
1422 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1423 scale2x_32_def_center(dst1, src0, src1, src2, count);
1424 scale2x_32_mmx_border(dst2, src2, src1, src0, count);
1429 * Scale by a factor of 2x4 a row of pixels of 8 bits.
1430 * This function operates like scale2x_8_mmx() but with an expansion
1431 * factor of 2x4 instead of 2x2.
1433 void scale2x4_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1435 if (count % 8 != 0 || count < 16) {
1436 scale2x4_8_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1437 } else {
1438 scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1439 scale2x_8_def_center(dst1, src0, src1, src2, count);
1440 scale2x_8_def_center(dst2, src0, src1, src2, count);
1441 scale2x_8_mmx_border(dst3, src2, src1, src0, count);
1446 * Scale by a factor of 2x4 a row of pixels of 16 bits.
1447 * This function operates like scale2x_16_mmx() but with an expansion
1448 * factor of 2x4 instead of 2x2.
1450 void scale2x4_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1452 if (count % 4 != 0 || count < 8) {
1453 scale2x4_16_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1454 } else {
1455 scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1456 scale2x_16_def_center(dst1, src0, src1, src2, count);
1457 scale2x_16_def_center(dst2, src0, src1, src2, count);
1458 scale2x_16_mmx_border(dst3, src2, src1, src0, count);
1463 * Scale by a factor of 2x4 a row of pixels of 32 bits.
1464 * This function operates like scale2x_32_mmx() but with an expansion
1465 * factor of 2x4 instead of 2x2.
1467 void scale2x4_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1469 if (count % 2 != 0 || count < 4) {
1470 scale2x4_32_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1471 } else {
1472 scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1473 scale2x_32_def_center(dst1, src0, src1, src2, count);
1474 scale2x_32_def_center(dst2, src0, src1, src2, count);
1475 scale2x_32_mmx_border(dst3, src2, src1, src0, count);
1479 #endif