configure.in: default profiling to off, as it's broken
[rofl0r-VisualBoyAdvance.git] / src / admame.cpp
blobbdde667fc248ec0ced9ef1fc36d839536ae65b81
1 /*
2 * This file is part of the Advance project.
4 * Copyright (C) 1999-2002 Andrea Mazzoleni
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 * In addition, as a special exception, Andrea Mazzoleni
21 * gives permission to link the code of this program with
22 * the MAME library (or with modified versions of MAME that use the
23 * same license as MAME), and distribute linked combinations including
24 * the two. You must obey the GNU General Public License in all
25 * respects for all of the code used other than MAME. If you modify
26 * this file, you may extend this exception to your version of the
27 * file, but you are not obligated to do so. If you do not wish to
28 * do so, delete this exception statement from your version.
32 * Alternatively at the previous license terms, you are allowed to use this
33 * code in your program with these conditions:
34 * - the program is not used in commercial activities.
35 * - the whole source code of the program is released with the binary.
38 #include "System.h"
40 #ifdef MMX
41 extern "C" bool cpu_mmx;
42 #endif
/*
 * Scale2x, portable C version, 16-bit pixels: expand one source row into ONE
 * destination row of 2*count pixels.
 *
 *   dst   - output row, must have room for 2*count pixels
 *   src0  - row on the "upper" side of the row being scaled
 *   src1  - row being scaled
 *   src2  - row on the "lower" side of the row being scaled
 *   count - number of pixels in each source row
 *
 * Callers obtain the second destination row by calling again with src0 and
 * src2 swapped.
 *
 * A row of 0 pixels writes nothing; a row of 1 pixel is simply replicated,
 * because it has no horizontal neighbours to compare against.
 */
static void internal_scale2x_16_def(u16 *dst, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
  unsigned i;
  unsigned last;

  /* Guard the degenerate widths: "count - 2" below is unsigned and would
     wrap around, and the first-pixel rule reads src1[1]. */
  if (count == 0)
    return;
  if (count == 1) {
    dst[0] = src1[0];
    dst[1] = src1[0];
    return;
  }

  /* first pixel: there is no left neighbour, so the left half is a copy */
  dst[0] = src1[0];
  if (src1[1] == src0[0] && src2[0] != src0[0])
    dst[1] = src0[0];
  else
    dst[1] = src1[0];

  /* central pixels */
  last = count - 1;
  for (i = 1; i < last; ++i) {
    const u16 up = src0[i];
    const u16 down = src2[i];
    const u16 left = src1[i - 1];
    const u16 mid = src1[i];
    const u16 right = src1[i + 1];

    if (up != down && left != right) {
      dst[2 * i] = (left == up) ? up : mid;
      dst[2 * i + 1] = (right == up) ? up : mid;
    } else {
      dst[2 * i] = mid;
      dst[2 * i + 1] = mid;
    }
  }

  /* last pixel: there is no right neighbour, so the right half is a copy */
  if (src1[last - 1] == src0[last] && src2[last] != src0[last])
    dst[2 * last] = src0[last];
  else
    dst[2 * last] = src1[last];
  dst[2 * last + 1] = src1[last];
}
/*
 * Scale2x, portable C version, 32-bit pixels: expand one source row into ONE
 * destination row of 2*count pixels.
 *
 *   dst   - output row, must have room for 2*count pixels
 *   src0  - row on the "upper" side of the row being scaled
 *   src1  - row being scaled
 *   src2  - row on the "lower" side of the row being scaled
 *   count - number of pixels in each source row
 *
 * Callers obtain the second destination row by calling again with src0 and
 * src2 swapped.
 *
 * A row of 0 pixels writes nothing; a row of 1 pixel is simply replicated,
 * because it has no horizontal neighbours to compare against.
 */
static void internal_scale2x_32_def(u32* dst,
                                    const u32* src0,
                                    const u32* src1,
                                    const u32* src2,
                                    unsigned count)
{
  unsigned i;
  unsigned last;

  /* Guard the degenerate widths: "count - 2" below is unsigned and would
     wrap around, and the first-pixel rule reads src1[1]. */
  if (count == 0)
    return;
  if (count == 1) {
    dst[0] = src1[0];
    dst[1] = src1[0];
    return;
  }

  /* first pixel: there is no left neighbour, so the left half is a copy */
  dst[0] = src1[0];
  if (src1[1] == src0[0] && src2[0] != src0[0])
    dst[1] = src0[0];
  else
    dst[1] = src1[0];

  /* central pixels */
  last = count - 1;
  for (i = 1; i < last; ++i) {
    const u32 up = src0[i];
    const u32 down = src2[i];
    const u32 left = src1[i - 1];
    const u32 mid = src1[i];
    const u32 right = src1[i + 1];

    if (up != down && left != right) {
      dst[2 * i] = (left == up) ? up : mid;
      dst[2 * i + 1] = (right == up) ? up : mid;
    } else {
      dst[2 * i] = mid;
      dst[2 * i + 1] = mid;
    }
  }

  /* last pixel: there is no right neighbour, so the right half is a copy */
  if (src1[last - 1] == src0[last] && src2[last] != src0[last])
    dst[2 * last] = src0[last];
  else
    dst[2 * last] = src1[last];
  dst[2 * last + 1] = src1[last];
}
125 #ifdef MMX
/*
 * MMX version of the Scale2x row kernel for 16-bit pixels: expands one source
 * row (src1, with src0/src2 as its vertical neighbours) into ONE destination
 * row of 2*count pixels.
 *
 * The row is handled in "runs" of four pixels (one 8-byte movq from each
 * source row, 16 bytes stored to dst per run):
 *   - a first run, where the left neighbour of pixel 0 is pixel 0 itself,
 *   - (count - 8) / 4 central runs,
 *   - a final run, where the right neighbour of the last pixel is itself.
 * The first and final runs are always executed, which is why count is reduced
 * by 2*4 before the central-run counter is derived from it.
 *
 * Register roles in every run:
 *   mm7 = current four pixels           mm6 = the four pixels from src0
 *   mm0 = left neighbours of mm7        mm1 = right neighbours of mm7
 * The pixels from src2 are compared straight from memory.
 *
 * NOTE(review): presumably requires count >= 8 and a multiple of 4 (compare
 * the commented-out assert in internal_scale2x_16_mmx) - smaller values make
 * the unsigned subtraction below wrap. Confirm against callers.
 *
 * The routine executes emms before returning.
 */
126 static void internal_scale2x_16_mmx_single(u16* dst, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
127 /* always do the first and last run */
128 count -= 2*4;
130 #ifdef __GNUC__
/* Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count */
131 __asm__ __volatile__(
132 /* first run */
133 /* set the current, current_pre, current_next registers */
/* mm0 starts as word 0 only (shift up then down by 48), then receives the
   current pixels shifted one word up: left neighbours with pixel 0 duplicated.
   mm1 gets word 0 of the NEXT quad in its top word plus the current pixels
   shifted one word down: right neighbours. */
134 "movq 0(%1), %%mm0\n"
135 "movq 0(%1),%%mm7\n"
136 "movq 8(%1),%%mm1\n"
137 "psllq $48,%%mm0\n"
138 "psllq $48,%%mm1\n"
139 "psrlq $48, %%mm0\n"
140 "movq %%mm7,%%mm2\n"
141 "movq %%mm7,%%mm3\n"
142 "psllq $16,%%mm2\n"
143 "psrlq $16,%%mm3\n"
144 "por %%mm2,%%mm0\n"
145 "por %%mm3,%%mm1\n"
147 /* current_upper */
148 "movq (%0),%%mm6\n"
150 /* compute the upper-left pixel for dst on %%mm2 */
151 /* compute the upper-right pixel for dst on %%mm4 */
/* Per word: mask = (neighbour == upper) && (neighbour != lower) &&
   (left != right); result = mask ? upper : current.
   mm2/mm3 work on the left neighbours, mm4/mm5 on the right neighbours. */
152 "movq %%mm0,%%mm2\n"
153 "movq %%mm1,%%mm4\n"
154 "movq %%mm0,%%mm3\n"
155 "movq %%mm1,%%mm5\n"
156 "pcmpeqw %%mm6,%%mm2\n"
157 "pcmpeqw %%mm6,%%mm4\n"
158 "pcmpeqw (%2),%%mm3\n"
159 "pcmpeqw (%2),%%mm5\n"
160 "pandn %%mm2,%%mm3\n"
161 "pandn %%mm4,%%mm5\n"
162 "movq %%mm0,%%mm2\n"
163 "movq %%mm1,%%mm4\n"
164 "pcmpeqw %%mm1,%%mm2\n"
165 "pcmpeqw %%mm0,%%mm4\n"
166 "pandn %%mm3,%%mm2\n"
167 "pandn %%mm5,%%mm4\n"
168 "movq %%mm2,%%mm3\n"
169 "movq %%mm4,%%mm5\n"
170 "pand %%mm6,%%mm2\n"
171 "pand %%mm6,%%mm4\n"
172 "pandn %%mm7,%%mm3\n"
173 "pandn %%mm7,%%mm5\n"
174 "por %%mm3,%%mm2\n"
175 "por %%mm5,%%mm4\n"
177 /* set *dst */
/* interleave the left-half and right-half results word by word: 8 output
   pixels, stored as two 8-byte writes */
178 "movq %%mm2,%%mm3\n"
179 "punpcklwd %%mm4,%%mm2\n"
180 "punpckhwd %%mm4,%%mm3\n"
181 "movq %%mm2,(%3)\n"
182 "movq %%mm3,8(%3)\n"
184 /* next */
/* NOTE(review): addl/shrl/decl are 32-bit forms applied to pointer operands,
   which presumably limits this branch to 32-bit x86 builds - confirm. */
185 "addl $8,%0\n"
186 "addl $8,%1\n"
187 "addl $8,%2\n"
188 "addl $16,%3\n"
190 /* central runs */
/* count (already reduced by 8) / 4 = number of central runs; none -> skip */
191 "shrl $2,%4\n"
192 "jz 1f\n"
194 "0:\n"
196 /* set the current, current_pre, current_next registers */
/* here the left neighbours take their lowest word from the top word of the
   PREVIOUS quad (-8(%1) shifted down by 48) */
197 "movq -8(%1),%%mm0\n"
198 "movq (%1),%%mm7\n"
199 "movq 8(%1),%%mm1\n"
200 "psrlq $48,%%mm0\n"
201 "psllq $48,%%mm1\n"
202 "movq %%mm7,%%mm2\n"
203 "movq %%mm7,%%mm3\n"
204 "psllq $16,%%mm2\n"
205 "psrlq $16,%%mm3\n"
206 "por %%mm2,%%mm0\n"
207 "por %%mm3,%%mm1\n"
209 /* current_upper */
210 "movq (%0),%%mm6\n"
212 /* compute the upper-left pixel for dst on %%mm2 */
213 /* compute the upper-right pixel for dst on %%mm4 */
214 "movq %%mm0,%%mm2\n"
215 "movq %%mm1,%%mm4\n"
216 "movq %%mm0,%%mm3\n"
217 "movq %%mm1,%%mm5\n"
218 "pcmpeqw %%mm6,%%mm2\n"
219 "pcmpeqw %%mm6,%%mm4\n"
220 "pcmpeqw (%2),%%mm3\n"
221 "pcmpeqw (%2),%%mm5\n"
222 "pandn %%mm2,%%mm3\n"
223 "pandn %%mm4,%%mm5\n"
224 "movq %%mm0,%%mm2\n"
225 "movq %%mm1,%%mm4\n"
226 "pcmpeqw %%mm1,%%mm2\n"
227 "pcmpeqw %%mm0,%%mm4\n"
228 "pandn %%mm3,%%mm2\n"
229 "pandn %%mm5,%%mm4\n"
230 "movq %%mm2,%%mm3\n"
231 "movq %%mm4,%%mm5\n"
232 "pand %%mm6,%%mm2\n"
233 "pand %%mm6,%%mm4\n"
234 "pandn %%mm7,%%mm3\n"
235 "pandn %%mm7,%%mm5\n"
236 "por %%mm3,%%mm2\n"
237 "por %%mm5,%%mm4\n"
239 /* set *dst */
240 "movq %%mm2,%%mm3\n"
241 "punpcklwd %%mm4,%%mm2\n"
242 "punpckhwd %%mm4,%%mm3\n"
243 "movq %%mm2,(%3)\n"
244 "movq %%mm3,8(%3)\n"
246 /* next */
247 "addl $8,%0\n"
248 "addl $8,%1\n"
249 "addl $8,%2\n"
250 "addl $16,%3\n"
252 "decl %4\n"
253 "jnz 0b\n"
254 "1:\n"
256 /* final run */
257 /* set the current, current_pre, current_next registers */
/* mm1 keeps only the top word of the current quad (shift down then up by 48),
   so the right neighbour of the last pixel is the last pixel itself; nothing
   is read beyond the current quad */
258 "movq (%1),%%mm1\n"
259 "movq (%1),%%mm7\n"
260 "movq -8(%1),%%mm0\n"
261 "psrlq $48,%%mm1\n"
262 "psrlq $48,%%mm0\n"
263 "psllq $48,%%mm1\n"
264 "movq %%mm7,%%mm2\n"
265 "movq %%mm7,%%mm3\n"
266 "psllq $16,%%mm2\n"
267 "psrlq $16,%%mm3\n"
268 "por %%mm2,%%mm0\n"
269 "por %%mm3,%%mm1\n"
271 /* current_upper */
272 "movq (%0),%%mm6\n"
274 /* compute the upper-left pixel for dst on %%mm2 */
275 /* compute the upper-right pixel for dst on %%mm4 */
276 "movq %%mm0,%%mm2\n"
277 "movq %%mm1,%%mm4\n"
278 "movq %%mm0,%%mm3\n"
279 "movq %%mm1,%%mm5\n"
280 "pcmpeqw %%mm6,%%mm2\n"
281 "pcmpeqw %%mm6,%%mm4\n"
282 "pcmpeqw (%2),%%mm3\n"
283 "pcmpeqw (%2),%%mm5\n"
284 "pandn %%mm2,%%mm3\n"
285 "pandn %%mm4,%%mm5\n"
286 "movq %%mm0,%%mm2\n"
287 "movq %%mm1,%%mm4\n"
288 "pcmpeqw %%mm1,%%mm2\n"
289 "pcmpeqw %%mm0,%%mm4\n"
290 "pandn %%mm3,%%mm2\n"
291 "pandn %%mm5,%%mm4\n"
292 "movq %%mm2,%%mm3\n"
293 "movq %%mm4,%%mm5\n"
294 "pand %%mm6,%%mm2\n"
295 "pand %%mm6,%%mm4\n"
296 "pandn %%mm7,%%mm3\n"
297 "pandn %%mm7,%%mm5\n"
298 "por %%mm3,%%mm2\n"
299 "por %%mm5,%%mm4\n"
301 /* set *dst */
302 "movq %%mm2,%%mm3\n"
303 "punpcklwd %%mm4,%%mm2\n"
304 "punpckhwd %%mm4,%%mm3\n"
305 "movq %%mm2,(%3)\n"
306 "movq %%mm3,8(%3)\n"
/* leave MMX state before returning */
307 "emms\n"
/* NOTE(review): only "cc" is listed as clobbered; the MMX registers and the
   memory written through dst are not declared - confirm this is acceptable
   for the compilers in use. */
309 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
311 : "cc"
313 #else
/* Intel-syntax counterpart for compilers without GNU inline asm:
   eax = src0, ebx = src1, ecx = src2, edx = dst, esi = count */
314 __asm {
315 mov eax, src0;
316 mov ebx, src1;
317 mov ecx, src2;
318 mov edx, dst;
319 mov esi, count;
321 /* first run */
322 /* set the current, current_pre, current_next registers */
323 movq mm0, qword ptr [ebx];
324 movq mm7, qword ptr [ebx];
325 movq mm1, qword ptr [ebx + 8];
326 psllq mm0, 48;
327 psllq mm1, 48;
328 psrlq mm0, 48;
329 movq mm2, mm7;
330 movq mm3, mm7;
331 psllq mm2, 16;
332 psrlq mm3, 16;
333 por mm0, mm2;
334 por mm1, mm3;
336 /* current_upper */
337 movq mm6, qword ptr [eax];
339 /* compute the upper-left pixel for dst on %%mm2 */
340 /* compute the upper-right pixel for dst on %%mm4 */
341 movq mm2, mm0;
342 movq mm4, mm1;
343 movq mm3, mm0;
344 movq mm5, mm1;
345 pcmpeqw mm2, mm6;
346 pcmpeqw mm4, mm6;
347 pcmpeqw mm3, qword ptr [ecx];
348 pcmpeqw mm5, qword ptr [ecx];
349 pandn mm3,mm2;
350 pandn mm5,mm4;
351 movq mm2,mm0;
352 movq mm4,mm1;
353 pcmpeqw mm2,mm1;
354 pcmpeqw mm4,mm0;
355 pandn mm2,mm3;
356 pandn mm4,mm5;
357 movq mm3,mm2;
358 movq mm5,mm4;
359 pand mm2,mm6;
360 pand mm4,mm6;
361 pandn mm3,mm7;
362 pandn mm5,mm7;
363 por mm2,mm3;
364 por mm4,mm5;
366 /* set *dst0 */
367 movq mm3,mm2;
368 punpcklwd mm2,mm4;
369 punpckhwd mm3,mm4;
370 movq qword ptr [edx], mm2;
371 movq qword ptr [edx + 8], mm3;
373 /* next */
374 add eax, 8;
375 add ebx, 8;
376 add ecx, 8;
377 add edx, 16;
379 /* central runs */
380 shr esi, 2;
381 jz label1;
382 align 4;
383 label0:
385 /* set the current, current_pre, current_next registers */
386 movq mm0, qword ptr [ebx-8];
387 movq mm7, qword ptr [ebx];
388 movq mm1, qword ptr [ebx+8];
389 psrlq mm0,48;
390 psllq mm1,48;
391 movq mm2,mm7;
392 movq mm3,mm7;
393 psllq mm2,16;
394 psrlq mm3,16;
395 por mm0,mm2;
396 por mm1,mm3;
398 /* current_upper */
399 movq mm6, qword ptr [eax];
401 /* compute the upper-left pixel for dst on %%mm2 */
402 /* compute the upper-right pixel for dst on %%mm4 */
403 movq mm2,mm0;
404 movq mm4,mm1;
405 movq mm3,mm0;
406 movq mm5,mm1;
407 pcmpeqw mm2,mm6;
408 pcmpeqw mm4,mm6;
409 pcmpeqw mm3, qword ptr [ecx];
410 pcmpeqw mm5, qword ptr [ecx];
411 pandn mm3,mm2;
412 pandn mm5,mm4;
413 movq mm2,mm0;
414 movq mm4,mm1;
415 pcmpeqw mm2,mm1;
416 pcmpeqw mm4,mm0;
417 pandn mm2,mm3;
418 pandn mm4,mm5;
419 movq mm3,mm2;
420 movq mm5,mm4;
421 pand mm2,mm6;
422 pand mm4,mm6;
423 pandn mm3,mm7;
424 pandn mm5,mm7;
425 por mm2,mm3;
426 por mm4,mm5;
428 /* set *dst */
429 movq mm3,mm2;
430 punpcklwd mm2,mm4;
431 punpckhwd mm3,mm4;
432 movq qword ptr [edx], mm2;
433 movq qword ptr [edx+8], mm3;
435 /* next */
436 add eax,8;
437 add ebx,8;
438 add ecx,8;
439 add edx,16;
441 dec esi;
442 jnz label0;
443 label1:
445 /* final run */
446 /* set the current, current_pre, current_next registers */
447 movq mm1, qword ptr [ebx];
448 movq mm7, qword ptr [ebx];
449 movq mm0, qword ptr [ebx-8];
450 psrlq mm1,48;
451 psrlq mm0,48;
452 psllq mm1,48;
453 movq mm2,mm7;
454 movq mm3,mm7;
455 psllq mm2,16;
456 psrlq mm3,16;
457 por mm0,mm2;
458 por mm1,mm3;
460 /* current_upper */
461 movq mm6, qword ptr [eax];
463 /* compute the upper-left pixel for dst on %%mm2 */
464 /* compute the upper-right pixel for dst on %%mm4 */
465 movq mm2,mm0;
466 movq mm4,mm1;
467 movq mm3,mm0;
468 movq mm5,mm1;
469 pcmpeqw mm2,mm6;
470 pcmpeqw mm4,mm6;
471 pcmpeqw mm3, qword ptr [ecx];
472 pcmpeqw mm5, qword ptr [ecx];
473 pandn mm3,mm2;
474 pandn mm5,mm4;
475 movq mm2,mm0;
476 movq mm4,mm1;
477 pcmpeqw mm2,mm1;
478 pcmpeqw mm4,mm0;
479 pandn mm2,mm3;
480 pandn mm4,mm5;
481 movq mm3,mm2;
482 movq mm5,mm4;
483 pand mm2,mm6;
484 pand mm4,mm6;
485 pandn mm3,mm7;
486 pandn mm5,mm7;
487 por mm2,mm3;
488 por mm4,mm5;
490 /* set *dst */
491 movq mm3,mm2;
492 punpcklwd mm2,mm4;
493 punpckhwd mm3,mm4;
494 movq qword ptr [edx], mm2;
495 movq qword ptr [edx+8], mm3;
/* write the advanced pointers and the run counter back to the C variables */
497 mov src0, eax;
498 mov src1, ebx;
499 mov src2, ecx;
500 mov dst, edx;
501 mov count, esi;
503 emms;
505 #endif
/*
 * MMX version of the Scale2x row kernel for 32-bit pixels: expands one source
 * row (src1, with src0/src2 as its vertical neighbours) into ONE destination
 * row of 2*count pixels.
 *
 * The row is handled in "runs" of two pixels (one 8-byte movq from each
 * source row, 16 bytes stored to dst per run):
 *   - a first run, where the left neighbour of pixel 0 is pixel 0 itself,
 *   - (count - 4) / 2 central runs,
 *   - a final run, where the right neighbour of the last pixel is itself.
 * The first and final runs are always executed, which is why count is reduced
 * by 2*2 before the central-run counter is derived from it.
 *
 * Register roles in every run:
 *   mm7 = current two pixels            mm6 = the two pixels from src0
 *   mm0 = left neighbours of mm7        mm1 = right neighbours of mm7
 * The pixels from src2 are compared straight from memory.
 *
 * NOTE(review): presumably requires count >= 4 and an even count (compare
 * the commented-out assert in internal_scale2x_32_mmx) - smaller values make
 * the unsigned subtraction below wrap. Confirm against callers.
 *
 * The routine executes emms before returning.
 */
508 static void internal_scale2x_32_mmx_single(u32* dst, const u32* src0, const u32* src1, const u32* src2, unsigned count) {
509 /* always do the first and last run */
510 count -= 2*2;
512 #ifdef __GNUC__
/* Operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count */
513 __asm__ __volatile__(
514 /* first run */
515 /* set the current, current_pre, current_next registers */
/* mm0 starts as pixel 0 only (shift up then down by 32), then receives the
   current pair shifted one pixel up: left neighbours with pixel 0 duplicated.
   mm1 gets pixel 0 of the NEXT pair in its high half plus the current pair
   shifted one pixel down: right neighbours. */
516 "movq 0(%1),%%mm0\n"
517 "movq 0(%1),%%mm7\n"
518 "movq 8(%1),%%mm1\n"
519 "psllq $32,%%mm0\n"
520 "psllq $32,%%mm1\n"
521 "psrlq $32,%%mm0\n"
522 "movq %%mm7,%%mm2\n"
523 "movq %%mm7,%%mm3\n"
524 "psllq $32,%%mm2\n"
525 "psrlq $32,%%mm3\n"
526 "por %%mm2,%%mm0\n"
527 "por %%mm3,%%mm1\n"
529 /* current_upper */
530 "movq (%0),%%mm6\n"
532 /* compute the upper-left pixel for dst on %%mm2 */
533 /* compute the upper-right pixel for dst on %%mm4 */
/* Per dword: mask = (neighbour == upper) && (neighbour != lower) &&
   (left != right); result = mask ? upper : current.
   mm2/mm3 work on the left neighbours, mm4/mm5 on the right neighbours. */
534 "movq %%mm0,%%mm2\n"
535 "movq %%mm1,%%mm4\n"
536 "movq %%mm0,%%mm3\n"
537 "movq %%mm1,%%mm5\n"
538 "pcmpeqd %%mm6,%%mm2\n"
539 "pcmpeqd %%mm6,%%mm4\n"
540 "pcmpeqd (%2),%%mm3\n"
541 "pcmpeqd (%2),%%mm5\n"
542 "pandn %%mm2,%%mm3\n"
543 "pandn %%mm4,%%mm5\n"
544 "movq %%mm0,%%mm2\n"
545 "movq %%mm1,%%mm4\n"
546 "pcmpeqd %%mm1,%%mm2\n"
547 "pcmpeqd %%mm0,%%mm4\n"
548 "pandn %%mm3,%%mm2\n"
549 "pandn %%mm5,%%mm4\n"
550 "movq %%mm2,%%mm3\n"
551 "movq %%mm4,%%mm5\n"
552 "pand %%mm6,%%mm2\n"
553 "pand %%mm6,%%mm4\n"
554 "pandn %%mm7,%%mm3\n"
555 "pandn %%mm7,%%mm5\n"
556 "por %%mm3,%%mm2\n"
557 "por %%mm5,%%mm4\n"
559 /* set *dst */
/* interleave the left-half and right-half results dword by dword: 4 output
   pixels, stored as two 8-byte writes */
560 "movq %%mm2,%%mm3\n"
561 "punpckldq %%mm4,%%mm2\n"
562 "punpckhdq %%mm4,%%mm3\n"
563 "movq %%mm2,(%3)\n"
564 "movq %%mm3, 8(%3)\n"
566 /* next */
/* NOTE(review): addl/shrl/decl are 32-bit forms applied to pointer operands,
   which presumably limits this branch to 32-bit x86 builds - confirm. */
567 "addl $8,%0\n"
568 "addl $8,%1\n"
569 "addl $8,%2\n"
570 "addl $16,%3\n"
572 /* central runs */
/* count (already reduced by 4) / 2 = number of central runs; none -> skip */
573 "shrl $1,%4\n"
574 "jz 1f\n"
576 "0:\n"
578 /* set the current, current_pre, current_next registers */
/* here the left neighbours take their low pixel from the high pixel of the
   PREVIOUS pair (-8(%1) shifted down by 32) */
579 "movq -8(%1),%%mm0\n"
580 "movq (%1),%%mm7\n"
581 "movq 8(%1),%%mm1\n"
582 "psrlq $32,%%mm0\n"
583 "psllq $32,%%mm1\n"
584 "movq %%mm7,%%mm2\n"
585 "movq %%mm7,%%mm3\n"
586 "psllq $32,%%mm2\n"
587 "psrlq $32,%%mm3\n"
588 "por %%mm2,%%mm0\n"
589 "por %%mm3,%%mm1\n"
591 /* current_upper */
592 "movq (%0),%%mm6\n"
594 /* compute the upper-left pixel for dst on %%mm2 */
595 /* compute the upper-right pixel for dst on %%mm4 */
596 "movq %%mm0,%%mm2\n"
597 "movq %%mm1,%%mm4\n"
598 "movq %%mm0,%%mm3\n"
599 "movq %%mm1,%%mm5\n"
600 "pcmpeqd %%mm6,%%mm2\n"
601 "pcmpeqd %%mm6,%%mm4\n"
602 "pcmpeqd (%2),%%mm3\n"
603 "pcmpeqd (%2),%%mm5\n"
604 "pandn %%mm2,%%mm3\n"
605 "pandn %%mm4,%%mm5\n"
606 "movq %%mm0,%%mm2\n"
607 "movq %%mm1,%%mm4\n"
608 "pcmpeqd %%mm1,%%mm2\n"
609 "pcmpeqd %%mm0,%%mm4\n"
610 "pandn %%mm3,%%mm2\n"
611 "pandn %%mm5,%%mm4\n"
612 "movq %%mm2,%%mm3\n"
613 "movq %%mm4,%%mm5\n"
614 "pand %%mm6,%%mm2\n"
615 "pand %%mm6,%%mm4\n"
616 "pandn %%mm7,%%mm3\n"
617 "pandn %%mm7,%%mm5\n"
618 "por %%mm3,%%mm2\n"
619 "por %%mm5,%%mm4\n"
621 /* set *dst */
622 "movq %%mm2,%%mm3\n"
623 "punpckldq %%mm4,%%mm2\n"
624 "punpckhdq %%mm4,%%mm3\n"
625 "movq %%mm2,(%3)\n"
626 "movq %%mm3,8(%3)\n"
628 /* next */
629 "addl $8,%0\n"
630 "addl $8,%1\n"
631 "addl $8,%2\n"
632 "addl $16,%3\n"
634 "decl %4\n"
635 "jnz 0b\n"
636 "1:\n"
638 /* final run */
639 /* set the current, current_pre, current_next registers */
/* mm1 keeps only the high pixel of the current pair (shift down then up by
   32), so the right neighbour of the last pixel is the last pixel itself;
   nothing is read beyond the current pair */
640 "movq (%1),%%mm1\n"
641 "movq (%1),%%mm7\n"
642 "movq -8(%1), %%mm0\n"
643 "psrlq $32,%%mm1\n"
644 "psrlq $32,%%mm0\n"
645 "psllq $32,%%mm1\n"
646 "movq %%mm7,%%mm2\n"
647 "movq %%mm7,%%mm3\n"
648 "psllq $32,%%mm2\n"
649 "psrlq $32,%%mm3\n"
650 "por %%mm2,%%mm0\n"
651 "por %%mm3,%%mm1\n"
653 /* current_upper */
654 "movq (%0),%%mm6\n"
656 /* compute the upper-left pixel for dst on %%mm2 */
657 /* compute the upper-right pixel for dst on %%mm4 */
658 "movq %%mm0,%%mm2\n"
659 "movq %%mm1,%%mm4\n"
660 "movq %%mm0,%%mm3\n"
661 "movq %%mm1,%%mm5\n"
662 "pcmpeqd %%mm6,%%mm2\n"
663 "pcmpeqd %%mm6,%%mm4\n"
664 "pcmpeqd (%2),%%mm3\n"
665 "pcmpeqd (%2),%%mm5\n"
666 "pandn %%mm2,%%mm3\n"
667 "pandn %%mm4,%%mm5\n"
668 "movq %%mm0,%%mm2\n"
669 "movq %%mm1,%%mm4\n"
670 "pcmpeqd %%mm1,%%mm2\n"
671 "pcmpeqd %%mm0,%%mm4\n"
672 "pandn %%mm3,%%mm2\n"
673 "pandn %%mm5,%%mm4\n"
674 "movq %%mm2,%%mm3\n"
675 "movq %%mm4,%%mm5\n"
676 "pand %%mm6,%%mm2\n"
677 "pand %%mm6,%%mm4\n"
678 "pandn %%mm7,%%mm3\n"
679 "pandn %%mm7,%%mm5\n"
680 "por %%mm3,%%mm2\n"
681 "por %%mm5,%%mm4\n"
683 /* set *dst */
684 "movq %%mm2,%%mm3\n"
685 "punpckldq %%mm4,%%mm2\n"
686 "punpckhdq %%mm4,%%mm3\n"
687 "movq %%mm2,(%3)\n"
688 "movq %%mm3,8(%3)\n"
/* leave MMX state before returning */
689 "emms\n"
/* NOTE(review): only "cc" is listed as clobbered; the MMX registers and the
   memory written through dst are not declared - confirm this is acceptable
   for the compilers in use. */
691 : "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
693 : "cc"
695 #else
/* Intel-syntax counterpart for compilers without GNU inline asm:
   eax = src0, ebx = src1, ecx = src2, edx = dst, esi = count */
696 __asm {
697 mov eax, src0;
698 mov ebx, src1;
699 mov ecx, src2;
700 mov edx, dst;
701 mov esi, count;
703 /* first run */
704 /* set the current, current_pre, current_next registers */
705 movq mm0,qword ptr [ebx];
706 movq mm7,qword ptr [ebx];
707 movq mm1,qword ptr [ebx + 8];
708 psllq mm0,32;
709 psllq mm1,32;
710 psrlq mm0,32;
711 movq mm2,mm7;
712 movq mm3,mm7;
713 psllq mm2,32;
714 psrlq mm3,32;
715 por mm0,mm2;
716 por mm1,mm3;
718 /* current_upper */
719 movq mm6,qword ptr [eax];
721 /* compute the upper-left pixel for dst on %%mm2 */
722 /* compute the upper-right pixel for dst on %%mm4 */
723 movq mm2,mm0;
724 movq mm4,mm1;
725 movq mm3,mm0;
726 movq mm5,mm1;
727 pcmpeqd mm2,mm6;
728 pcmpeqd mm4,mm6;
729 pcmpeqd mm3,qword ptr [ecx];
730 pcmpeqd mm5,qword ptr [ecx];
731 pandn mm3,mm2;
732 pandn mm5,mm4;
733 movq mm2,mm0;
734 movq mm4,mm1;
735 pcmpeqd mm2,mm1;
736 pcmpeqd mm4,mm0;
737 pandn mm2,mm3;
738 pandn mm4,mm5;
739 movq mm3,mm2;
740 movq mm5,mm4;
741 pand mm2,mm6;
742 pand mm4,mm6;
743 pandn mm3,mm7;
744 pandn mm5,mm7;
745 por mm2,mm3;
746 por mm4,mm5;
748 /* set *dst */
749 movq mm3,mm2;
750 punpckldq mm2,mm4;
751 punpckhdq mm3,mm4;
752 movq qword ptr [edx],mm2;
753 movq qword ptr [edx+8],mm3;
755 /* next */
756 add eax,8;
757 add ebx,8;
758 add ecx,8;
759 add edx,16;
761 /* central runs */
762 shr esi,1;
763 jz label1;
764 label0:
766 /* set the current, current_pre, current_next registers */
767 movq mm0,qword ptr [ebx-8];
768 movq mm7,qword ptr [ebx];
769 movq mm1,qword ptr [ebx+8];
770 psrlq mm0,32;
771 psllq mm1,32;
772 movq mm2,mm7;
773 movq mm3,mm7;
774 psllq mm2,32;
775 psrlq mm3,32;
776 por mm0,mm2;
777 por mm1,mm3;
779 /* current_upper */
780 movq mm6,qword ptr[eax];
782 /* compute the upper-left pixel for dst on %%mm2 */
783 /* compute the upper-right pixel for dst on %%mm4 */
784 movq mm2,mm0;
785 movq mm4,mm1;
786 movq mm3,mm0;
787 movq mm5,mm1;
788 pcmpeqd mm2,mm6;
789 pcmpeqd mm4,mm6;
790 pcmpeqd mm3,qword ptr[ecx];
791 pcmpeqd mm5,qword ptr[ecx];
792 pandn mm3,mm2;
793 pandn mm5,mm4;
794 movq mm2,mm0;
795 movq mm4,mm1;
796 pcmpeqd mm2,mm1;
797 pcmpeqd mm4,mm0;
798 pandn mm2,mm3;
799 pandn mm4,mm5;
800 movq mm3,mm2;
801 movq mm5,mm4;
802 pand mm2,mm6;
803 pand mm4,mm6;
804 pandn mm3,mm7;
805 pandn mm5,mm7;
806 por mm2,mm3;
807 por mm4,mm5;
809 /* set *dst */
810 movq mm3,mm2;
811 punpckldq mm2,mm4;
812 punpckhdq mm3,mm4;
813 movq qword ptr [edx],mm2;
814 movq qword ptr [edx+8],mm3;
816 /* next */
817 add eax,8;
818 add ebx,8;
819 add ecx,8;
820 add edx,16;
822 dec esi;
823 jnz label0;
824 label1:
826 /* final run */
827 /* set the current, current_pre, current_next registers */
828 movq mm1,qword ptr [ebx];
829 movq mm7,qword ptr [ebx];
830 movq mm0,qword ptr [ebx-8];
831 psrlq mm1,32;
832 psrlq mm0,32;
833 psllq mm1,32;
834 movq mm2,mm7;
835 movq mm3,mm7;
836 psllq mm2,32;
837 psrlq mm3,32;
838 por mm0,mm2;
839 por mm1,mm3;
841 /* current_upper */
842 movq mm6,qword ptr [eax];
844 /* compute the upper-left pixel for dst on %%mm2 */
845 /* compute the upper-right pixel for dst on %%mm4 */
846 movq mm2,mm0;
847 movq mm4,mm1;
848 movq mm3,mm0;
849 movq mm5,mm1;
850 pcmpeqd mm2,mm6;
851 pcmpeqd mm4,mm6;
852 pcmpeqd mm3,qword ptr [ecx];
853 pcmpeqd mm5,qword ptr [ecx];
854 pandn mm3,mm2;
855 pandn mm5,mm4;
856 movq mm2,mm0;
857 movq mm4,mm1;
858 pcmpeqd mm2,mm1;
859 pcmpeqd mm4,mm0;
860 pandn mm2,mm3;
861 pandn mm4,mm5;
862 movq mm3,mm2;
863 movq mm5,mm4;
864 pand mm2,mm6;
865 pand mm4,mm6;
866 pandn mm3,mm7;
867 pandn mm5,mm7;
868 por mm2,mm3;
869 por mm4,mm5;
871 /* set *dst */
872 movq mm3,mm2;
873 punpckldq mm2,mm4;
874 punpckhdq mm3,mm4;
875 movq qword ptr [edx],mm2;
876 movq qword ptr [edx+8],mm3;
/* write the advanced pointers and the run counter back to the C variables */
878 mov src0, eax;
879 mov src1, ebx;
880 mov src2, ecx;
881 mov dst, edx;
882 mov count, esi;
884 emms;
886 #endif
889 static void internal_scale2x_16_mmx(u16* dst0, u16* dst1, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
890 // assert( count >= 2*4 );
891 internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
892 internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
895 static void internal_scale2x_32_mmx(u32* dst0, u32* dst1, const u32* src0, const u32* src1, const u32* src2, unsigned count) {
896 // assert( count >= 2*2 );
897 internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
898 internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
900 #endif
902 void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
903 u8 *dstPtr, u32 dstPitch, int width, int height)
905 u16 *dst0 = (u16 *)dstPtr;
906 u16 *dst1 = dst0 + (dstPitch >> 1);
908 u16 *src0 = (u16 *)srcPtr;
909 u16 *src1 = src0 + (srcPitch >> 1);
910 u16 *src2 = src1 + (srcPitch >> 1);
911 #ifdef MMX
912 if(cpu_mmx) {
913 internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);
915 int count = height;
917 count -= 2;
918 while(count) {
919 dst0 += dstPitch;
920 dst1 += dstPitch;
921 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);
922 src0 = src1;
923 src1 = src2;
924 src2 += srcPitch >> 1;
925 --count;
927 dst0 += dstPitch;
928 dst1 += dstPitch;
929 internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);
930 } else {
931 #endif
932 internal_scale2x_16_def(dst0, src0, src0, src1, width);
933 internal_scale2x_16_def(dst1, src1, src0, src0, width);
935 int count = height;
937 count -= 2;
938 while(count) {
939 dst0 += dstPitch;
940 dst1 += dstPitch;
941 internal_scale2x_16_def(dst0, src0, src1, src2, width);
942 internal_scale2x_16_def(dst1, src2, src1, src0, width);
943 src0 = src1;
944 src1 = src2;
945 src2 += srcPitch >> 1;
946 --count;
948 dst0 += dstPitch;
949 dst1 += dstPitch;
950 internal_scale2x_16_def(dst0, src0, src1, src1, width);
951 internal_scale2x_16_def(dst1, src1, src1, src0, width);
952 #ifdef MMX
954 #endif
957 void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
958 u8 *dstPtr, u32 dstPitch, int width, int height)
960 u32 *dst0 = (u32 *)dstPtr;
961 u32 *dst1 = dst0 + (dstPitch >> 2);
963 u32 *src0 = (u32 *)srcPtr;
964 u32 *src1 = src0 + (srcPitch >> 2);
965 u32 *src2 = src1 + (srcPitch >> 2);
966 #ifdef MMX
967 if(cpu_mmx) {
968 internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width);
970 int count = height;
972 count -= 2;
973 while(count) {
974 dst0 += dstPitch >> 1;
975 dst1 += dstPitch >> 1;
976 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width);
977 src0 = src1;
978 src1 = src2;
979 src2 += srcPitch >> 2;
980 --count;
982 dst0 += dstPitch >> 1;
983 dst1 += dstPitch >> 1;
984 internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width);
985 } else {
986 #endif
987 internal_scale2x_32_def(dst0, src0, src0, src1, width);
988 internal_scale2x_32_def(dst1, src1, src0, src0, width);
990 int count = height;
992 count -= 2;
993 while(count) {
994 dst0 += dstPitch >> 1;
995 dst1 += dstPitch >> 1;
996 internal_scale2x_32_def(dst0, src0, src1, src2, width);
997 internal_scale2x_32_def(dst1, src2, src1, src0, width);
998 src0 = src1;
999 src1 = src2;
1000 src2 += srcPitch >> 2;
1001 --count;
1003 dst0 += dstPitch >> 1;
1004 dst1 += dstPitch >> 1;
1005 internal_scale2x_32_def(dst0, src0, src1, src1, width);
1006 internal_scale2x_32_def(dst1, src1, src1, src0, width);
1007 #ifdef MMX
1009 #endif