Remove unused ProjectConfiguration
[xy_vsfilter.git] / src / dsutil / vd.cpp
blob25e402c3ff4df94a8240aa4a630eb5a519639beb
1 // VirtualDub - Video processing and capture application
2 // Copyright (C) 1998-2001 Avery Lee
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; if not, write to the Free Software
16 // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 // Notes:
19 // - BitBltFromI420ToRGB is from VirtualDub
20 // - The core assembly function of CCpuID is from DVD2AVI
21 // - sse2 yv12 to yuy2 conversion by Haali
22 // (- vd.cpp/h should be renamed to something more sensible already :)
25 #include "stdafx.h"
26 #include "vd.h"
28 #pragma warning(disable : 4799) // no emms... blahblahblah
// Probes the CPU's SIMD capabilities once via CPUID and caches them in
// m_flags. Bits set here: 1 when MMX is present, 2 and 4 when SSE is
// present, 8 when SSE2 is present, 16 when 3DNow! is present; bit 2 is
// also set when AMD's MMX extensions (integer SSE) are reported.
// (Which enum name maps to which bit lives in CCpuID::flag_t in vd.h —
// the code below only uses the numeric values.)
CCpuID::CCpuID()
{
	DWORD flags = 0;

	__asm
	{
		mov eax, 1
		cpuid
		test edx, 0x00800000 // STD MMX
		jz TEST_SSE
		or [flags], 1
	TEST_SSE:
		test edx, 0x02000000 // STD SSE
		jz TEST_SSE2
		or [flags], 2
		or [flags], 4
	TEST_SSE2:
		test edx, 0x04000000 // SSE2
		jz TEST_3DNOW
		or [flags], 8
	TEST_3DNOW:
		mov eax, 0x80000001 // NOTE(review): extended-leaf support (0x80000000) is not probed first — assumed available
		cpuid
		test edx, 0x80000000 // 3D NOW
		jz TEST_SSEMMX
		or [flags], 16
	TEST_SSEMMX:
		test edx, 0x00400000 // SSE MMX (AMD MMX extensions)
		jz TEST_END
		or [flags], 2
	TEST_END:
	}

	m_flags = (flag_t)flags;
}
// Global capability record, filled at static-init time; consulted by every
// blitter below to pick an SSE2/MMX/C code path.
CCpuID g_cpuid;
// memcpy with SIMD acceleration where size and alignment permit:
// - SSE path: both pointers 16-byte aligned and len >= 128 — copies
//   128 bytes per iteration with non-temporal stores (bypasses the cache),
//   then a byte-at-a-time tail for len % 128.
// - MMX path: both pointers 8-byte aligned and len >= 64 — 64 bytes per
//   iteration, byte tail for len % 64.
// - Otherwise plain memcpy.
void memcpy_accel(void* dst, const void* src, size_t len)
{
	if((g_cpuid.m_flags & CCpuID::flag_t::ssefpu) && len >= 128 
		&& !((DWORD)src&15) && !((DWORD)dst&15))
	{
		__asm
		{
			mov esi, dword ptr [src]
			mov edi, dword ptr [dst]
			mov ecx, len
			shr ecx, 7 // 128 bytes per iteration (len >= 128 guarantees ecx >= 1)
		memcpy_accel_sse_loop:
			prefetchnta [esi+16*8] // pull the next 128-byte chunk toward the core
			movaps xmm0, [esi]
			movaps xmm1, [esi+16*1]
			movaps xmm2, [esi+16*2]
			movaps xmm3, [esi+16*3]
			movaps xmm4, [esi+16*4]
			movaps xmm5, [esi+16*5]
			movaps xmm6, [esi+16*6]
			movaps xmm7, [esi+16*7]
			movntps [edi], xmm0 // non-temporal stores: don't pollute the cache
			movntps [edi+16*1], xmm1
			movntps [edi+16*2], xmm2
			movntps [edi+16*3], xmm3
			movntps [edi+16*4], xmm4
			movntps [edi+16*5], xmm5
			movntps [edi+16*6], xmm6
			movntps [edi+16*7], xmm7
			add esi, 128
			add edi, 128
			dec ecx
			jne memcpy_accel_sse_loop
			mov ecx, len // byte tail: remaining len % 128
			and ecx, 127
			cmp ecx, 0
			je memcpy_accel_sse_end
		memcpy_accel_sse_loop2:
			mov dl, byte ptr[esi]
			mov byte ptr[edi], dl
			inc esi
			inc edi
			dec ecx
			jne memcpy_accel_sse_loop2
		memcpy_accel_sse_end:
			emms
			sfence // make the non-temporal stores globally visible
		}
	}
	else if((g_cpuid.m_flags & CCpuID::flag_t::mmx) && len >= 64
		&& !((DWORD)src&7) && !((DWORD)dst&7))
	{
		__asm
		{
			mov esi, dword ptr [src]
			mov edi, dword ptr [dst]
			mov ecx, len
			shr ecx, 6 // 64 bytes per iteration
		memcpy_accel_mmx_loop:
			movq mm0, qword ptr [esi]
			movq mm1, qword ptr [esi+8*1]
			movq mm2, qword ptr [esi+8*2]
			movq mm3, qword ptr [esi+8*3]
			movq mm4, qword ptr [esi+8*4]
			movq mm5, qword ptr [esi+8*5]
			movq mm6, qword ptr [esi+8*6]
			movq mm7, qword ptr [esi+8*7]
			movq qword ptr [edi], mm0
			movq qword ptr [edi+8*1], mm1
			movq qword ptr [edi+8*2], mm2
			movq qword ptr [edi+8*3], mm3
			movq qword ptr [edi+8*4], mm4
			movq qword ptr [edi+8*5], mm5
			movq qword ptr [edi+8*6], mm6
			movq qword ptr [edi+8*7], mm7
			add esi, 64
			add edi, 64
			loop memcpy_accel_mmx_loop
			mov ecx, len // byte tail: remaining len % 64
			and ecx, 63
			cmp ecx, 0
			je memcpy_accel_mmx_end
		memcpy_accel_mmx_loop2:
			mov dl, byte ptr [esi]
			mov byte ptr [edi], dl
			inc esi
			inc edi
			dec ecx
			jne memcpy_accel_mmx_loop2
		memcpy_accel_mmx_end:
			emms // leave the FPU usable again after touching MMX state
		}
	}
	else
	{
		memcpy(dst, src, len);
	}
}
167 bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
169 if((w&1)) return(false);
171 if(w > 0 && w == srcpitch && w == dstpitch)
173 memcpy_accel(dsty, srcy, h*srcpitch);
174 memcpy_accel(dstu, srcu, h/2*srcpitch/2);
175 memcpy_accel(dstv, srcv, h/2*srcpitch/2);
177 else
179 int pitch = min(abs(srcpitch), abs(dstpitch));
181 for(int y = 0; y < h; y++, srcy += srcpitch, dsty += dstpitch)
182 memcpy_accel(dsty, srcy, pitch);
184 srcpitch >>= 1;
185 dstpitch >>= 1;
187 pitch = min(abs(srcpitch), abs(dstpitch));
189 for(int y = 0; y < h; y+=2, srcu += srcpitch, dstu += dstpitch)
190 memcpy_accel(dstu, srcu, pitch);
192 for(int y = 0; y < h; y+=2, srcv += srcpitch, dstv += dstpitch)
193 memcpy_accel(dstv, srcv, pitch);
196 return true;
199 bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch)
201 if(w > 0 && w == srcpitch && w == dstpitch)
203 memcpy_accel(dst, src, h*srcpitch);
205 else
207 int pitch = min(abs(srcpitch), abs(dstpitch));
209 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
210 memcpy_accel(dst, src, pitch);
213 return(true);
216 extern "C" void asm_YUVtoRGB32_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
217 extern "C" void asm_YUVtoRGB24_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
218 extern "C" void asm_YUVtoRGB16_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
219 extern "C" void asm_YUVtoRGB32_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
220 extern "C" void asm_YUVtoRGB24_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
221 extern "C" void asm_YUVtoRGB16_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
222 extern "C" void asm_YUVtoRGB32_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
223 extern "C" void asm_YUVtoRGB24_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
224 extern "C" void asm_YUVtoRGB16_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
226 bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)
228 if(w<=0 || h<=0 || (w&1) || (h&1))
229 return(false);
231 void (*asm_YUVtoRGB_row)(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width) = NULL;;
233 if((g_cpuid.m_flags & CCpuID::ssefpu) && !(w&7))
235 switch(dbpp)
237 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_ISSE*/; break; // TODO: fix _ISSE (555->565)
238 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_ISSE; break;
239 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_ISSE; break;
242 else if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
244 switch(dbpp)
246 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_MMX*/; break; // TODO: fix _MMX (555->565)
247 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_MMX; break;
248 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_MMX; break;
251 else
253 switch(dbpp)
255 case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break;
256 case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row; break;
257 case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row; break;
261 if(!asm_YUVtoRGB_row)
262 return(false);
266 asm_YUVtoRGB_row(dst + dstpitch, dst, srcy + srcpitch, srcy, srcu, srcv, w/2);
268 dst += 2*dstpitch;
269 srcy += srcpitch*2;
270 srcu += srcpitch/2;
271 srcv += srcpitch/2;
273 while(h -= 2);
275 if(g_cpuid.m_flags & CCpuID::mmx)
276 __asm emms
278 if(g_cpuid.m_flags & CCpuID::ssefpu)
279 __asm sfence
281 return true;
284 static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
286 WORD* dstw = (WORD*)dst;
287 for(; width > 1; width -= 2)
289 *dstw++ = (*srcu++<<8)|*srcy++;
290 *dstw++ = (*srcv++<<8)|*srcy++;
// MMX version of yuvtoyuy2row_c: interleaves 8 luma + 4 U + 4 V samples
// into 16 bytes of YUY2 per iteration. The caller only selects this when
// width is a multiple of 8, and runs emms once afterwards (none here).
static void __declspec(naked) yuvtoyuy2row_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)
{
	__asm {
		push ebp
		push edi
		push esi
		push ebx

		mov edi, [esp+20] // dst
		mov ebp, [esp+24] // srcy
		mov ebx, [esp+28] // srcu
		mov esi, [esp+32] // srcv
		mov ecx, [esp+36] // width

		shr ecx, 3 // 8 pixels per iteration

	yuvtoyuy2row_loop:

		movd mm0, [ebx]      // 4 x U
		punpcklbw mm0, [esi] // interleave with 4 x V -> VUVUVUVU

		movq mm1, [ebp]      // 8 x Y
		movq mm2, mm1
		punpcklbw mm1, mm0   // low half  -> VYUYVYUY
		punpckhbw mm2, mm0   // high half -> VYUYVYUY

		movq [edi], mm1
		movq [edi+8], mm2

		add ebp, 8
		add ebx, 4
		add esi, 4
		add edi, 16

		dec ecx
		jnz yuvtoyuy2row_loop

		pop ebx
		pop esi
		pop edi
		pop ebp
		ret
	};
}
339 static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
341 WORD* dstw = (WORD*)dst;
342 for(; width > 1; width -= 2, srcu++, srcv++)
344 *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++;
345 *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++;
// MMX version of yuvtoyuy2row_avg_c: chroma is averaged with the row
// `pitchuv` bytes below before interleaving. Selected only when width is a
// multiple of 8; emms is run by the caller.
static void __declspec(naked) yuvtoyuy2row_avg_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)
{
	static const __int64 mask = 0x7f7f7f7f7f7f7f7fi64; // clears bits shifted across byte lanes

	__asm {
		push ebp
		push edi
		push esi
		push ebx

		movq mm7, mask

		mov edi, [esp+20] // dst
		mov ebp, [esp+24] // srcy
		mov ebx, [esp+28] // srcu
		mov esi, [esp+32] // srcv
		mov ecx, [esp+36] // width
		mov eax, [esp+40] // pitchuv

		shr ecx, 3 // 8 pixels per iteration

	yuvtoyuy2row_avg_loop:

		movd mm0, [ebx]       // current chroma row -> VUVUVUVU
		punpcklbw mm0, [esi]
		movq mm1, mm0

		movd mm2, [ebx + eax] // chroma row pitchuv bytes below
		punpcklbw mm2, [esi + eax]
		movq mm3, mm2

		// (x+y)>>1 == (x&y)+((x^y)>>1)  — byte-wise average without widening
		pand mm0, mm2
		pxor mm1, mm3
		psrlq mm1, 1
		pand mm1, mm7
		paddb mm0, mm1

		movq mm1, [ebp]       // 8 x Y, interleave as in yuvtoyuy2row_MMX
		movq mm2, mm1
		punpcklbw mm1, mm0
		punpckhbw mm2, mm0

		movq [edi], mm1
		movq [edi+8], mm2

		add ebp, 8
		add ebx, 4
		add esi, 4
		add edi, 16

		dec ecx
		jnz yuvtoyuy2row_avg_loop

		pop ebx
		pop esi
		pop edi
		pop ebp
		ret
	};
}
// One YV12 -> YUY2 output row, SSE2, 32 luma pixels per iteration.
// Register contract (set up by the yv12_yuy2_sse2* drivers):
//   ebx - Y row, edx - U row, esi - V row, edi - dest, ecx - halfwidth
// Chroma pointers are NOT advanced; eax is clobbered. All loads/stores
// require 16-byte alignment (movdqa/movntdq); stores are non-temporal.
static void __declspec(naked) yv12_yuy2_row_sse2() {
	__asm {
		// ebx - Y
		// edx - U
		// esi - V
		// edi - dest
		// ecx - halfwidth
		xor eax, eax

	one:
		movdqa xmm0, [ebx + eax*2]      // YYYYYYYY (first 16)
		movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY (second 16)

		movdqa xmm2, [edx + eax]        // UUUUUUUU
		movdqa xmm3, [esi + eax]        // VVVVVVVV

		movdqa xmm4, xmm2
		movdqa xmm5, xmm0
		movdqa xmm6, xmm1
		punpcklbw xmm2, xmm3            // VUVUVUVU
		punpckhbw xmm4, xmm3            // VUVUVUVU

		punpcklbw xmm0, xmm2            // VYUYVYUY
		punpcklbw xmm1, xmm4
		punpckhbw xmm5, xmm2
		punpckhbw xmm6, xmm4

		movntdq [edi + eax*4], xmm0     // non-temporal: don't pollute the cache
		movntdq [edi + eax*4 + 16], xmm5
		movntdq [edi + eax*4 + 32], xmm1
		movntdq [edi + eax*4 + 48], xmm6

		add eax, 16
		cmp eax, ecx

		jb one

		ret
	};
}
// Like yv12_yuy2_row_sse2, but chroma is averaged (pavgb) with the chroma
// row `ebp` bytes below — vertical interpolation for odd output lines of
// progressive content. Unlike the plain row kernel this DOES advance
// edx/esi (by 16 per iteration); the driver compensates afterwards.
static void __declspec(naked) yv12_yuy2_row_sse2_linear() {
	__asm {
		// ebx - Y
		// edx - U
		// esi - V
		// edi - dest
		// ecx - width
		// ebp - uv_stride
		xor eax, eax

	one:
		movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
		movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

		movdqa xmm2, [edx]
		movdqa xmm3, [esi]
		pavgb xmm2, [edx + ebp] // average with next chroma row -> UUUUUUUU
		pavgb xmm3, [esi + ebp] // -> VVVVVVVV

		movdqa xmm4, xmm2
		movdqa xmm5, xmm0
		movdqa xmm6, xmm1
		punpcklbw xmm2, xmm3            // VUVUVUVU
		punpckhbw xmm4, xmm3            // VUVUVUVU

		punpcklbw xmm0, xmm2            // VYUYVYUY
		punpcklbw xmm1, xmm4
		punpckhbw xmm5, xmm2
		punpckhbw xmm6, xmm4

		movntdq [edi + eax*4], xmm0
		movntdq [edi + eax*4 + 16], xmm5
		movntdq [edi + eax*4 + 32], xmm1
		movntdq [edi + eax*4 + 48], xmm6

		add eax, 16
		add edx, 16
		add esi, 16
		cmp eax, ecx

		jb one

		ret
	};
}
// Interlaced variant of yv12_yuy2_row_sse2_linear: averages with the
// chroma row TWO strides below (ebp*2), i.e. the next row of the same
// field. Advances edx/esi by 16 per iteration like the linear kernel.
static void __declspec(naked) yv12_yuy2_row_sse2_linear_interlaced() {
	__asm {
		// ebx - Y
		// edx - U
		// esi - V
		// edi - dest
		// ecx - width
		// ebp - uv_stride
		xor eax, eax

	one:
		movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
		movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

		movdqa xmm2, [edx]
		movdqa xmm3, [esi]
		pavgb xmm2, [edx + ebp*2] // average with next same-field chroma row
		pavgb xmm3, [esi + ebp*2]

		movdqa xmm4, xmm2
		movdqa xmm5, xmm0
		movdqa xmm6, xmm1
		punpcklbw xmm2, xmm3            // VUVUVUVU
		punpckhbw xmm4, xmm3            // VUVUVUVU

		punpcklbw xmm0, xmm2            // VYUYVYUY
		punpcklbw xmm1, xmm4
		punpckhbw xmm5, xmm2
		punpckhbw xmm6, xmm4

		movntdq [edi + eax*4], xmm0
		movntdq [edi + eax*4 + 16], xmm5
		movntdq [edi + eax*4 + 32], xmm1
		movntdq [edi + eax*4 + 48], xmm6

		add eax, 16
		add edx, 16
		add esi, 16
		cmp eax, ecx

		jb one

		ret
	};
}
// YV12 -> YUY2 whole-frame converter, progressive content. The caller
// (BitBltFromI420ToYUY2) guarantees SSE2, 16-byte aligned planes and
// destination, and suitable strides. Stack offsets below are relative to
// esp after the four pushes (args start at [esp+20]).
void __declspec(naked) yv12_yuy2_sse2(const BYTE *Y, const BYTE *U, const BYTE *V,
									  int halfstride, unsigned halfwidth, unsigned height,
									  BYTE *YUY2, int d_stride)
{
	__asm {
		push ebx
		push esi
		push edi
		push ebp

		mov ebx, [esp + 20] // Y
		mov edx, [esp + 24] // U
		mov esi, [esp + 28] // V
		mov edi, [esp + 44] // D (YUY2 dest)
		mov ebp, [esp + 32] // uv_stride (halfstride)
		mov ecx, [esp + 36] // uv_width (halfwidth)

		// [esp+32] := halfstride - round_up(halfwidth, 16); that is what the
		// linear row kernel (which advances edx/esi by the rounded width)
		// still needs added to reach the next chroma row.
		mov eax, ecx
		add eax, 15
		and eax, 0xfffffff0
		sub [esp + 32], eax

		cmp dword ptr [esp + 40], 2 // height
		jbe last2

	row:
		sub dword ptr [esp + 40], 2

		call yv12_yuy2_row_sse2        // even line: chroma used as-is

		lea ebx, [ebx + ebp*2]         // next luma row (2*halfstride = luma stride)
		add edi, [esp + 48]            // next dest row

		call yv12_yuy2_row_sse2_linear // odd line: chroma interpolated

		add edx, [esp + 32]            // advance U/V to the next chroma row
		add esi, [esp + 32]

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		cmp dword ptr [esp + 40], 2
		ja row

	last2:
		// Bottom one or two lines: no further chroma row to interpolate
		// with, so both reuse the current chroma row.
		call yv12_yuy2_row_sse2

		dec dword ptr [esp + 40]
		jz done

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		call yv12_yuy2_row_sse2
	done:

		pop ebp
		pop edi
		pop esi
		pop ebx

		ret
	};
}
// YV12 -> YUY2 whole-frame converter for interlaced content: processes
// four lines (two per field) per iteration, interpolating chroma only
// within each field. Same register/stack conventions as yv12_yuy2_sse2.
void __declspec(naked) yv12_yuy2_sse2_interlaced(const BYTE *Y, const BYTE *U, const BYTE *V,
												 int halfstride, unsigned halfwidth, unsigned height,
												 BYTE *YUY2, int d_stride)
{
	__asm {
		push ebx
		push esi
		push edi
		push ebp

		mov ebx, [esp + 20] // Y
		mov edx, [esp + 24] // U
		mov esi, [esp + 28] // V
		mov edi, [esp + 44] // D (YUY2 dest)
		mov ebp, [esp + 32] // uv_stride (halfstride)
		mov ecx, [esp + 36] // uv_width (halfwidth)

		// [esp+32] := halfstride - round_up(halfwidth, 16), see yv12_yuy2_sse2
		mov eax, ecx
		add eax, 15
		and eax, 0xfffffff0
		sub [esp + 32], eax

		cmp dword ptr [esp + 40], 4 // height
		jbe last4

	row:
		sub dword ptr [esp + 40], 4

		call yv12_yuy2_row_sse2 // first row, first field

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		add edx, ebp            // switch chroma to the second field's row
		add esi, ebp

		call yv12_yuy2_row_sse2 // first row, second field

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		sub edx, ebp            // back to the first field's chroma row
		sub esi, ebp

		call yv12_yuy2_row_sse2_linear_interlaced // second row, first field

		add edx, [esp + 32]
		add esi, [esp + 32]

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		call yv12_yuy2_row_sse2_linear_interlaced // second row, second field

		add edx, [esp + 32]
		add esi, [esp + 32]

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		cmp dword ptr [esp + 40], 4
		ja row

	last4:
		// Last four lines: replicate chroma, alternating fields.
		call yv12_yuy2_row_sse2

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		add edx, ebp
		add esi, ebp

		call yv12_yuy2_row_sse2

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		sub edx, ebp
		sub esi, ebp

		call yv12_yuy2_row_sse2

		lea ebx, [ebx + ebp*2]
		add edi, [esp + 48]

		add edx, ebp
		add esi, ebp

		call yv12_yuy2_row_sse2

		pop ebp
		pop edi
		pop esi
		pop ebx

		ret
	};
}
706 bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch, bool fInterlaced)
708 if(w<=0 || h<=0 || (w&1) || (h&1))
709 return(false);
711 if(srcpitch == 0) srcpitch = w;
713 if((g_cpuid.m_flags & CCpuID::sse2)
714 && !((DWORD_PTR)srcy&15) && !((DWORD_PTR)srcu&15) && !((DWORD_PTR)srcv&15) && !(srcpitch&31)
715 && !((DWORD_PTR)dst&15) && !(dstpitch&15))
717 if(!fInterlaced) yv12_yuy2_sse2(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch);
718 else yv12_yuy2_sse2_interlaced(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch);
719 return true;
721 else
723 ASSERT(!fInterlaced);
726 void (*yuvtoyuy2row)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) = NULL;
727 void (*yuvtoyuy2row_avg)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) = NULL;
729 if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))
731 yuvtoyuy2row = yuvtoyuy2row_MMX;
732 yuvtoyuy2row_avg = yuvtoyuy2row_avg_MMX;
734 else
736 yuvtoyuy2row = yuvtoyuy2row_c;
737 yuvtoyuy2row_avg = yuvtoyuy2row_avg_c;
740 if(!yuvtoyuy2row)
741 return(false);
745 yuvtoyuy2row(dst, srcy, srcu, srcv, w);
746 yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2);
748 dst += 2*dstpitch;
749 srcy += srcpitch*2;
750 srcu += srcpitch/2;
751 srcv += srcpitch/2;
753 while((h -= 2) > 2);
755 yuvtoyuy2row(dst, srcy, srcu, srcv, w);
756 yuvtoyuy2row(dst + dstpitch, srcy + srcpitch, srcu, srcv, w);
758 if(g_cpuid.m_flags & CCpuID::mmx)
759 __asm emms
761 return(true);
764 bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp)
766 if(dbpp == sbpp)
768 int rowbytes = w*dbpp>>3;
770 if(rowbytes > 0 && rowbytes == srcpitch && rowbytes == dstpitch)
772 memcpy_accel(dst, src, h*rowbytes);
774 else
776 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
777 memcpy_accel(dst, src, rowbytes);
780 return(true);
783 if(sbpp != 16 && sbpp != 24 && sbpp != 32
784 || dbpp != 16 && dbpp != 24 && dbpp != 32)
785 return(false);
787 if(dbpp == 16)
789 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
791 if(sbpp == 24)
793 BYTE* s = (BYTE*)src;
794 WORD* d = (WORD*)dst;
795 for(int x = 0; x < w; x++, s+=3, d++)
796 *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f));
798 else if(sbpp == 32)
800 DWORD* s = (DWORD*)src;
801 WORD* d = (WORD*)dst;
802 for(int x = 0; x < w; x++, s++, d++)
803 *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f));
807 else if(dbpp == 24)
809 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
811 if(sbpp == 16)
813 WORD* s = (WORD*)src;
814 BYTE* d = (BYTE*)dst;
815 for(int x = 0; x < w; x++, s++, d+=3)
816 { // not tested, r-g-b might be in reverse
817 d[0] = (*s&0x001f)<<3;
818 d[1] = (*s&0x07e0)<<5;
819 d[2] = (*s&0xf800)<<8;
822 else if(sbpp == 32)
824 BYTE* s = (BYTE*)src;
825 BYTE* d = (BYTE*)dst;
826 for(int x = 0; x < w; x++, s+=4, d+=3)
827 {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];}
831 else if(dbpp == 32)
833 for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
835 if(sbpp == 16)
837 WORD* s = (WORD*)src;
838 DWORD* d = (DWORD*)dst;
839 for(int x = 0; x < w; x++, s++, d++)
840 *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3);
842 else if(sbpp == 24)
844 BYTE* s = (BYTE*)src;
845 DWORD* d = (DWORD*)dst;
846 for(int x = 0; x < w; x++, s+=3, d++)
847 *d = *((DWORD*)s)&0xffffff;
852 return(true);
855 static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
857 BYTE* src2 = src + srcpitch;
858 do {*dst++ = (*src++ + *src2++ + 1) >> 1;}
859 while(w--);
862 static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
864 BYTE* src2 = src + srcpitch;
865 BYTE* src3 = src2 + srcpitch;
866 do {*dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2;}
867 while(w--);
// MMX edge-row blend: dst[i] = (src[i] + src[i+srcpitch] + 1) >> 1,
// 8 bytes per iteration. Bytes beyond the last multiple of 8 are left
// untouched (no scalar tail). No emms here — the caller
// (DeinterlaceBlend) runs emms once at the end.
static void __declspec(naked) asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
{
	static const __int64 _x0001000100010001 = 0x0001000100010001; // +1 rounding term

	__asm {
		push ebp
		push edi
		push esi
		push ebx

		mov edi,[esp+20] // dst
		mov esi,[esp+24] // src
		sub edi,esi      // edi = dst - src, so [edi+esi] addresses dst
		mov ebp,[esp+28] // w
		mov edx,[esp+32] // srcpitch

		shr ebp, 3 // 8 bytes per iteration

		movq mm6, _x0001000100010001
		pxor mm7, mm7

	xloop:
		movq mm0, [esi]     // row 0
		movq mm3, mm0
		punpcklbw mm0, mm7  // widen bytes to words to avoid overflow
		punpckhbw mm3, mm7

		movq mm1, [esi+edx] // row 1
		movq mm4, mm1
		punpcklbw mm1, mm7
		punpckhbw mm4, mm7

		paddw mm1, mm0      // (r0 + r1 + 1) >> 1, low half
		paddw mm1, mm6
		psrlw mm1, 1

		paddw mm4, mm3      // high half
		paddw mm4, mm6
		psrlw mm4, 1

		add esi, 8
		packuswb mm1, mm4
		movq [edi+esi-8], mm1

		dec ebp
		jne xloop

		pop ebx
		pop esi
		pop edi
		pop ebp
		ret
	};
}
925 static void __declspec(naked) asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
927 static const __int64 mask0 = 0xfcfcfcfcfcfcfcfci64;
928 static const __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64;
929 static const __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64;
930 static const __int64 _x0002000200020002 = 0x0002000200020002;
932 __asm {
933 push ebp
934 push edi
935 push esi
936 push ebx
938 mov edi, [esp+20]
939 mov esi, [esp+24]
940 sub edi, esi
941 mov ebp, [esp+28]
942 mov edx, [esp+32]
944 shr ebp, 3
946 movq mm6, _x0002000200020002
947 pxor mm7, mm7
949 xloop:
950 movq mm0, [esi]
951 movq mm3, mm0
952 punpcklbw mm0, mm7
953 punpckhbw mm3, mm7
955 movq mm1, [esi+edx]
956 movq mm4, mm1
957 punpcklbw mm1, mm7
958 punpckhbw mm4, mm7
960 movq mm2, [esi+edx*2]
961 movq mm5, mm2
962 punpcklbw mm2, mm7
963 punpckhbw mm5, mm7
965 psllw mm1, 1
966 paddw mm1, mm0
967 paddw mm1, mm2
968 paddw mm1, mm6
969 psrlw mm1, 2
971 psllw mm4, 1
972 paddw mm4, mm3
973 paddw mm4, mm5
974 paddw mm4, mm6
975 psrlw mm4, 2
977 add esi, 8
978 packuswb mm1, mm4
979 movq [edi+esi-8], mm1
981 dec ebp
982 jne xloop
984 // sadly the original code makes a lot of visible banding artifacts on yuv
985 // (it seems those shiftings without rounding introduce too much error)
987 mov edi,[esp+20]
988 mov esi,[esp+24]
989 sub edi,esi
990 mov ebp,[esp+28]
991 mov edx,[esp+32]
993 movq mm5,mask0
994 movq mm6,mask1
995 movq mm7,mask2
996 shr ebp,1
997 jz oddpart
999 xloop:
1000 movq mm2,[esi]
1001 movq mm0,mm5
1003 movq mm1,[esi+edx]
1004 pand mm0,mm2
1006 psrlq mm1,1
1007 movq mm2,[esi+edx*2]
1009 psrlq mm2,2
1010 pand mm1,mm6
1012 psrlq mm0,2
1013 pand mm2,mm7
1015 paddb mm0,mm1
1016 add esi,8
1018 paddb mm0,mm2
1019 dec ebp
1021 movq [edi+esi-8],mm0
1022 jne xloop
1024 oddpart:
1025 test byte ptr [esp+28],1
1026 jz nooddpart
1028 mov ecx,[esi]
1029 mov eax,0fcfcfcfch
1030 mov ebx,[esi+edx]
1031 and eax,ecx
1032 shr ebx,1
1033 mov ecx,[esi+edx*2]
1034 shr ecx,2
1035 and ebx,07f7f7f7fh
1036 shr eax,2
1037 and ecx,03f3f3f3fh
1038 add eax,ebx
1039 add eax,ecx
1040 mov [edi+esi],eax
1042 nooddpart:
1044 pop ebx
1045 pop esi
1046 pop edi
1047 pop ebp
// 16 x 0x01, 16-byte aligned: used by the SSE2 blend kernel below to
// compensate for pavgb's round-up bias.
__declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
1054 static void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
1056 __asm
1058 mov edx, srcpitch
1059 mov esi, src
1060 mov edi, dst
1061 sub edi, esi
1062 mov ecx, w
1063 mov ebx, ecx
1064 shr ecx, 4
1065 and ebx, 15
1067 movdqa xmm7, [const_1_16_bytes]
1069 asm_blend_row_SSE2_loop:
1070 movdqa xmm0, [esi]
1071 movdqa xmm1, [esi+edx]
1072 movdqa xmm2, [esi+edx*2]
1073 pavgb xmm0, xmm1
1074 pavgb xmm2, xmm1
1075 psubusb xmm0, xmm7
1076 pavgb xmm0, xmm2
1077 movdqa [esi+edi], xmm0
1078 add esi, 16
1079 dec ecx
1080 jnz asm_blend_row_SSE2_loop
1082 test ebx,15
1083 jz asm_blend_row_SSE2_end
1085 mov ecx, ebx
1086 xor ax, ax
1087 xor bx, bx
1088 xor dx, dx
1089 asm_blend_row_SSE2_loop2:
1090 mov al, [esi]
1091 mov bl, [esi+edx]
1092 mov dl, [esi+edx*2]
1093 add ax, bx
1094 inc ax
1095 shr ax, 1
1096 add dx, bx
1097 inc dx
1098 shr dx, 1
1099 add ax, dx
1100 shr ax, 1
1101 mov [esi+edi], al
1102 inc esi
1103 dec ecx
1104 jnz asm_blend_row_SSE2_loop2
1106 asm_blend_row_SSE2_end:
1110 static void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)
1112 __asm
1114 mov edx, srcpitch
1115 mov esi, src
1116 mov edi, dst
1117 sub edi, esi
1118 mov ecx, w
1119 mov ebx, ecx
1120 shr ecx, 4
1121 and ebx, 15
1123 movdqa xmm7, [const_1_16_bytes]
1125 asm_blend_row_clipped_SSE2_loop:
1126 movdqa xmm0, [esi]
1127 movdqa xmm1, [esi+edx]
1128 pavgb xmm0, xmm1
1129 movdqa [esi+edi], xmm0
1130 add esi, 16
1131 dec ecx
1132 jnz asm_blend_row_clipped_SSE2_loop
1134 test ebx,15
1135 jz asm_blend_row_clipped_SSE2_end
1137 mov ecx, ebx
1138 xor ax, ax
1139 xor bx, bx
1140 asm_blend_row_clipped_SSE2_loop2:
1141 mov al, [esi]
1142 mov bl, [esi+edx]
1143 add ax, bx
1144 inc ax
1145 shr ax, 1
1146 mov [esi+edi], al
1147 inc esi
1148 dec ecx
1149 jnz asm_blend_row_clipped_SSE2_loop2
1151 asm_blend_row_clipped_SSE2_end:
1155 void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch)
1157 void (*asm_blend_row_clipped)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
1158 void (*asm_blend_row)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
1160 if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)src&0xf) && !((DWORD)dst&0xf) && !(srcpitch&0xf))
1162 asm_blend_row_clipped = asm_blend_row_clipped_SSE2;
1163 asm_blend_row = asm_blend_row_SSE2;
1165 else if(g_cpuid.m_flags & CCpuID::mmx)
1167 asm_blend_row_clipped = asm_blend_row_clipped_MMX;
1168 asm_blend_row = asm_blend_row_MMX;
1170 else
1172 asm_blend_row_clipped = asm_blend_row_clipped_c;
1173 asm_blend_row = asm_blend_row_c;
1176 if(!asm_blend_row_clipped)
1177 return;
1179 asm_blend_row_clipped(dst, src, rowbytes, srcpitch);
1181 if((h -= 2) > 0) do
1183 dst += dstpitch;
1184 asm_blend_row(dst, src, rowbytes, srcpitch);
1185 src += srcpitch;
1187 while(--h);
1189 asm_blend_row_clipped(dst + dstpitch, src, rowbytes, srcpitch);
1191 if(g_cpuid.m_flags & CCpuID::mmx)
1192 __asm emms
1195 void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield)
1197 if(topfield)
1199 BitBltFromRGBToRGB(rowbytes, h/2, dst, dstpitch*2, 8, src, srcpitch*2, 8);
1200 AvgLines8(dst, h, dstpitch);
1202 else
1204 BitBltFromRGBToRGB(rowbytes, h/2, dst + dstpitch, dstpitch*2, 8, src + srcpitch, srcpitch*2, 8);
1205 AvgLines8(dst + dstpitch, h-1, dstpitch);
// Rebuilds every second line of a byte-per-sample image by averaging the
// lines directly above and below:
//   line[y+1] = avg(line[y], line[y+2])   for y = 0, 2, 4, ...
// `pitch` is the row size in bytes. If h is even, the last line has no
// line below it and is copied from the one above. Used by DeinterlaceBob.
void AvgLines8(BYTE* dst, DWORD h, DWORD pitch)
{
	if(h <= 1) return;

	BYTE* s = dst;
	BYTE* d = dst + (h-2)*pitch;

	for(; s < d; s += pitch*2)
	{
		BYTE* tmp = s;

		if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf))
		{
			// SSE2: rounded byte average, 16 bytes per iteration. pitch is
			// 16-aligned in this branch, so the scalar tail below never runs.
			__asm
			{
				mov esi, tmp
				mov ebx, pitch

				mov ecx, ebx
				shr ecx, 4

			AvgLines8_sse2_loop:
				movdqa xmm0, [esi]
				pavgb xmm0, [esi+ebx*2]
				movdqa [esi+ebx], xmm0
				add esi, 16

				dec ecx
				jnz AvgLines8_sse2_loop

				mov tmp, esi
			}

			for(int i = pitch&7; i--; tmp++)
			{
				tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
			}
		}
		else if(g_cpuid.m_flags & CCpuID::mmx)
		{
			// MMX: widen to words, add, shift. NOTE(review): no +1 rounding
			// here, unlike the SSE2 and C paths (truncating average).
			__asm
			{
				mov esi, tmp
				mov ebx, pitch

				mov ecx, ebx
				shr ecx, 3 // 8 bytes per iteration

				pxor mm7, mm7
			AvgLines8_mmx_loop:
				movq mm0, [esi]
				movq mm1, mm0

				punpcklbw mm0, mm7
				punpckhbw mm1, mm7

				movq mm2, [esi+ebx*2]
				movq mm3, mm2

				punpcklbw mm2, mm7
				punpckhbw mm3, mm7

				paddw mm0, mm2
				psrlw mm0, 1

				paddw mm1, mm3
				psrlw mm1, 1

				packuswb mm0, mm1

				movq [esi+ebx], mm0

				lea esi, [esi+8]

				dec ecx
				jnz AvgLines8_mmx_loop

				mov tmp, esi
			}

			// Scalar tail for the last pitch % 8 bytes.
			for(int i = pitch&7; i--; tmp++)
			{
				tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
			}
		}
		else
		{
			// Plain C fallback for the whole row.
			for(int i = pitch; i--; tmp++)
			{
				tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;
			}
		}
	}

	if(!(h&1) && h >= 2)
	{
		// Even height: no line below the last one — duplicate it.
		dst += (h-2)*pitch;
		memcpy_accel(dst + pitch, dst, pitch);
	}

	__asm emms;
}
1312 void AvgLines555(BYTE* dst, DWORD h, DWORD pitch)
1314 if(h <= 1) return;
1316 unsigned __int64 __0x7c007c007c007c00 = 0x7c007c007c007c00;
1317 unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0;
1318 unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
1320 BYTE* s = dst;
1321 BYTE* d = dst + (h-2)*pitch;
1323 for(; s < d; s += pitch*2)
1325 BYTE* tmp = s;
1327 __asm
1329 mov esi, tmp
1330 mov ebx, pitch
1332 mov ecx, ebx
1333 shr ecx, 3
1335 movq mm6, __0x03e003e003e003e0
1336 movq mm7, __0x001f001f001f001f
1338 AvgLines555_loop:
1339 movq mm0, [esi]
1340 movq mm1, mm0
1341 movq mm2, mm0
1343 psrlw mm0, 10 // red1 bits: mm0 = 001f001f001f001f
1344 pand mm1, mm6 // green1 bits: mm1 = 03e003e003e003e0
1345 pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f
1347 movq mm3, [esi+ebx*2]
1348 movq mm4, mm3
1349 movq mm5, mm3
1351 psrlw mm3, 10 // red2 bits: mm3 = 001f001f001f001f
1352 pand mm4, mm6 // green2 bits: mm4 = 03e003e003e003e0
1353 pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f
1355 paddw mm0, mm3
1356 psrlw mm0, 1 // (red1+red2)/2
1357 psllw mm0, 10 // red bits at 7c007c007c007c00
1359 paddw mm1, mm4
1360 psrlw mm1, 1 // (green1+green2)/2
1361 pand mm1, mm6 // green bits at 03e003e003e003e0
1363 paddw mm2, mm5
1364 psrlw mm2, 1 // (blue1+blue2)/2
1365 // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded)
1367 por mm0, mm1
1368 por mm0, mm2
1370 movq [esi+ebx], mm0
1372 lea esi, [esi+8]
1374 dec ecx
1375 jnz AvgLines555_loop
1377 mov tmp, esi
1380 for(int i = (pitch&7)>>1; i--; tmp++)
1382 tmp[pitch] =
1383 ((((*tmp&0x7c00) + (tmp[pitch<<1]&0x7c00)) >> 1)&0x7c00)|
1384 ((((*tmp&0x03e0) + (tmp[pitch<<1]&0x03e0)) >> 1)&0x03e0)|
1385 ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
1389 if(!(h&1) && h >= 2)
1391 dst += (h-2)*pitch;
1392 memcpy_accel(dst + pitch, dst, pitch);
1395 __asm emms;
1398 void AvgLines565(BYTE* dst, DWORD h, DWORD pitch)
1400 if(h <= 1) return;
1402 unsigned __int64 __0xf800f800f800f800 = 0xf800f800f800f800;
1403 unsigned __int64 __0x07e007e007e007e0 = 0x07e007e007e007e0;
1404 unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;
1406 BYTE* s = dst;
1407 BYTE* d = dst + (h-2)*pitch;
1409 for(; s < d; s += pitch*2)
1411 WORD* tmp = (WORD*)s;
1413 __asm
1415 mov esi, tmp
1416 mov ebx, pitch
1418 mov ecx, ebx
1419 shr ecx, 3
1421 movq mm6, __0x07e007e007e007e0
1422 movq mm7, __0x001f001f001f001f
1424 AvgLines565_loop:
1425 movq mm0, [esi]
1426 movq mm1, mm0
1427 movq mm2, mm0
1429 psrlw mm0, 11 // red1 bits: mm0 = 001f001f001f001f
1430 pand mm1, mm6 // green1 bits: mm1 = 07e007e007e007e0
1431 pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f
1433 movq mm3, [esi+ebx*2]
1434 movq mm4, mm3
1435 movq mm5, mm3
1437 psrlw mm3, 11 // red2 bits: mm3 = 001f001f001f001f
1438 pand mm4, mm6 // green2 bits: mm4 = 07e007e007e007e0
1439 pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f
1441 paddw mm0, mm3
1442 psrlw mm0, 1 // (red1+red2)/2
1443 psllw mm0, 11 // red bits at f800f800f800f800
1445 paddw mm1, mm4
1446 psrlw mm1, 1 // (green1+green2)/2
1447 pand mm1, mm6 // green bits at 03e003e003e003e0
1449 paddw mm2, mm5
1450 psrlw mm2, 1 // (blue1+blue2)/2
1451 // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded)
1453 por mm0, mm1
1454 por mm0, mm2
1456 movq [esi+ebx], mm0
1458 lea esi, [esi+8]
1460 dec ecx
1461 jnz AvgLines565_loop
1463 mov tmp, esi
1466 for(int i = (pitch&7)>>1; i--; tmp++)
1468 tmp[pitch] =
1469 ((((*tmp&0xf800) + (tmp[pitch<<1]&0xf800)) >> 1)&0xf800)|
1470 ((((*tmp&0x07e0) + (tmp[pitch<<1]&0x07e0)) >> 1)&0x07e0)|
1471 ((((*tmp&0x001f) + (tmp[pitch<<1]&0x001f)) >> 1)&0x001f);
1475 if(!(h&1) && h >= 2)
1477 dst += (h-2)*pitch;
1478 memcpy_accel(dst + pitch, dst, pitch);
1481 __asm emms;
1484 extern "C" void mmx_YUY2toRGB24(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
1485 extern "C" void mmx_YUY2toRGB32(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709);
1487 bool BitBltFromYUY2ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch)
1489 void (* YUY2toRGB)(const BYTE* src, BYTE* dst, const BYTE* src_end, int src_pitch, int row_size, bool rec709) = NULL;
1491 if(g_cpuid.m_flags & CCpuID::mmx)
1493 YUY2toRGB =
1494 dbpp == 32 ? mmx_YUY2toRGB32 :
1495 dbpp == 24 ? mmx_YUY2toRGB24 :
1496 // dbpp == 16 ? mmx_YUY2toRGB16 : // TODO
1497 NULL;
1499 else
1501 // TODO
1504 if(!YUY2toRGB) return(false);
1506 YUY2toRGB(src, dst, src + h*srcpitch, srcpitch, w, false);
1508 return(true);