// VirtualDub - Video processing and capture application
// Copyright (C) 1998-2001 Avery Lee
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
//
// - BitBltFromI420ToRGB is from VirtualDub
// - The core assembly function of CCpuID is from DVD2AVI
// - SSE2 YV12 to YUY2 conversion by Haali
// (- vd.cpp/h should be renamed to something more sensible already :)
#pragma warning(disable : 4799) // C4799: function has no EMMS instruction
        test edx, 0x00800000 // STD MMX
        test edx, 0x02000000 // STD SSE
        test edx, 0x04000000 // SSE2
        test edx, 0x80000000 // 3D NOW
        test edx, 0x00400000 // SSE MMX

    m_flags = (flag_t)flags;
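
// memcpy_accel picks a copy path based on CPU features and buffer alignment:
// 128-byte SSE blocks with non-temporal stores when src/dst are 16-byte aligned
// and len >= 128, 64-byte MMX blocks when both are 8-byte aligned and len >= 64,
// and plain memcpy() otherwise. The leftover tail bytes of the accelerated paths
// are copied one byte at a time in the *_loop2 sections below.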
void memcpy_accel(void* dst, const void* src, size_t len)

    if((g_cpuid.m_flags & CCpuID::flag_t::ssefpu) && len >= 128
        && !((DWORD)src&15) && !((DWORD)dst&15))
        mov esi, dword ptr [src]
        mov edi, dword ptr [dst]

    memcpy_accel_sse_loop:
        prefetchnta [esi+16*8]

        movaps xmm1, [esi+16*1]
        movaps xmm2, [esi+16*2]
        movaps xmm3, [esi+16*3]
        movaps xmm4, [esi+16*4]
        movaps xmm5, [esi+16*5]
        movaps xmm6, [esi+16*6]
        movaps xmm7, [esi+16*7]

        movntps [edi+16*1], xmm1
        movntps [edi+16*2], xmm2
        movntps [edi+16*3], xmm3
        movntps [edi+16*4], xmm4
        movntps [edi+16*5], xmm5
        movntps [edi+16*6], xmm6
        movntps [edi+16*7], xmm7

        jne memcpy_accel_sse_loop

        je memcpy_accel_sse_end
    memcpy_accel_sse_loop2:
        mov dl, byte ptr [esi]
        mov byte ptr [edi], dl

        jne memcpy_accel_sse_loop2
    memcpy_accel_sse_end:
    else if((g_cpuid.m_flags & CCpuID::flag_t::mmx) && len >= 64
        && !((DWORD)src&7) && !((DWORD)dst&7))
        mov esi, dword ptr [src]
        mov edi, dword ptr [dst]

    memcpy_accel_mmx_loop:
        movq mm0, qword ptr [esi]
        movq mm1, qword ptr [esi+8*1]
        movq mm2, qword ptr [esi+8*2]
        movq mm3, qword ptr [esi+8*3]
        movq mm4, qword ptr [esi+8*4]
        movq mm5, qword ptr [esi+8*5]
        movq mm6, qword ptr [esi+8*6]
        movq mm7, qword ptr [esi+8*7]
        movq qword ptr [edi], mm0
        movq qword ptr [edi+8*1], mm1
        movq qword ptr [edi+8*2], mm2
        movq qword ptr [edi+8*3], mm3
        movq qword ptr [edi+8*4], mm4
        movq qword ptr [edi+8*5], mm5
        movq qword ptr [edi+8*6], mm6
        movq qword ptr [edi+8*7], mm7

        loop memcpy_accel_mmx_loop

        je memcpy_accel_mmx_end
    memcpy_accel_mmx_loop2:
        mov dl, byte ptr [esi]
        mov byte ptr [edi], dl

        jne memcpy_accel_mmx_loop2
    memcpy_accel_mmx_end:
        memcpy(dst, src, len);
bool BitBltFromI420ToI420(int w, int h, BYTE* dsty, BYTE* dstu, BYTE* dstv, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)

    if(w&1) return(false);

    if(w > 0 && w == srcpitch && w == dstpitch)

        memcpy_accel(dsty, srcy, h*srcpitch);
        memcpy_accel(dstu, srcu, h/2*srcpitch/2);
        memcpy_accel(dstv, srcv, h/2*srcpitch/2);

    int pitch = min(abs(srcpitch), abs(dstpitch));

    for(int y = 0; y < h; y++, srcy += srcpitch, dsty += dstpitch)
        memcpy_accel(dsty, srcy, pitch);

    pitch = min(abs(srcpitch), abs(dstpitch));

    for(int y = 0; y < h; y += 2, srcu += srcpitch, dstu += dstpitch)
        memcpy_accel(dstu, srcu, pitch);

    for(int y = 0; y < h; y += 2, srcv += srcpitch, dstv += dstpitch)
        memcpy_accel(dstv, srcv, pitch);
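
// In I420 the U and V planes are subsampled 2x2, so each covers w/2 x h/2 samples;
// that is why the fast path copies h/2*srcpitch/2 bytes per chroma plane and the
// per-row loops above step y by 2 for U and V.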
bool BitBltFromYUY2ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* src, int srcpitch)

    if(w > 0 && w == srcpitch && w == dstpitch)

        memcpy_accel(dst, src, h*srcpitch);

    int pitch = min(abs(srcpitch), abs(dstpitch));

    for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        memcpy_accel(dst, src, pitch);
extern "C" void asm_YUVtoRGB32_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB24_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB16_row(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB32_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB24_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB16_row_MMX(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB32_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB24_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
extern "C" void asm_YUVtoRGB16_row_ISSE(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width);
bool BitBltFromI420ToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch)

    if(w <= 0 || h <= 0 || (w&1) || (h&1))

    void (*asm_YUVtoRGB_row)(void* ARGB1, void* ARGB2, BYTE* Y1, BYTE* Y2, BYTE* U, BYTE* V, long width) = NULL;
    if((g_cpuid.m_flags & CCpuID::ssefpu) && !(w&7))

        case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_ISSE*/; break; // TODO: fix _ISSE (555->565)
        case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_ISSE; break;
        case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_ISSE; break;

    else if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))

        case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row/*_MMX*/; break; // TODO: fix _MMX (555->565)
        case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row_MMX; break;
        case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row_MMX; break;

        case 16: asm_YUVtoRGB_row = asm_YUVtoRGB16_row; break;
        case 24: asm_YUVtoRGB_row = asm_YUVtoRGB24_row; break;
        case 32: asm_YUVtoRGB_row = asm_YUVtoRGB32_row; break;

    if(!asm_YUVtoRGB_row)

        asm_YUVtoRGB_row(dst + dstpitch, dst, srcy + srcpitch, srcy, srcu, srcv, w/2);

    if(g_cpuid.m_flags & CCpuID::mmx)

    if(g_cpuid.m_flags & CCpuID::ssefpu)
static void yuvtoyuy2row_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)

    WORD* dstw = (WORD*)dst;
    for(; width > 1; width -= 2)

        *dstw++ = (*srcu++<<8)|*srcy++;
        *dstw++ = (*srcv++<<8)|*srcy++;
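
// On a little-endian target the two 16-bit stores above write the bytes
// Y0 U Y1 V to memory, i.e. one YUY2 macropixel per pair of luma samples
// (the low byte of each WORD is luma, the high byte is chroma).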
static void __declspec(naked) yuvtoyuy2row_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width)

        mov edi, [esp+20] // dst
        mov ebp, [esp+24] // srcy
        mov ebx, [esp+28] // srcu
        mov esi, [esp+32] // srcv
        mov ecx, [esp+36] // width

        jnz yuvtoyuy2row_loop
static void yuvtoyuy2row_avg_c(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)

    WORD* dstw = (WORD*)dst;
    for(; width > 1; width -= 2, srcu++, srcv++)

        *dstw++ = (((srcu[0]+srcu[pitchuv])>>1)<<8)|*srcy++;
        *dstw++ = (((srcv[0]+srcv[pitchuv])>>1)<<8)|*srcy++;
static void __declspec(naked) yuvtoyuy2row_avg_MMX(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv)

    static const __int64 mask = 0x7f7f7f7f7f7f7f7fi64;

        mov edi, [esp+20] // dst
        mov ebp, [esp+24] // srcy
        mov ebx, [esp+28] // srcu
        mov esi, [esp+32] // srcv
        mov ecx, [esp+36] // width
        mov eax, [esp+40] // pitchuv

    yuvtoyuy2row_avg_loop:

        movd mm2, [ebx + eax]
        punpcklbw mm2, [esi + eax]

        // (x+y)>>1 == (x&y)+((x^y)>>1)
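        // The identity above lets packed bytes be averaged without widening:
        // x+y == (x^y) + 2*(x&y), so (x+y)>>1 == (x&y) + ((x^y)>>1). The 0x7f7f...
        // mask declared above is presumably applied after the register-wide shift
        // to clear the bits that would otherwise be dragged across byte boundaries.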
        jnz yuvtoyuy2row_avg_loop
static void __declspec(naked) yv12_yuy2_row_sse2() {

        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

        movdqa xmm2, [edx + eax]        // UUUUUUUU
        movdqa xmm3, [esi + eax]        // VVVVVVVV

        punpcklbw xmm2, xmm3            // VUVUVUVU
        punpckhbw xmm4, xmm3            // VUVUVUVU

        punpcklbw xmm0, xmm2            // VYUYVYUY

        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6
static void __declspec(naked) yv12_yuy2_row_sse2_linear() {

        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

        pavgb xmm2, [edx + ebp]         // UUUUUUUU
        pavgb xmm3, [esi + ebp]         // VVVVVVVV

        punpcklbw xmm2, xmm3            // VUVUVUVU
        punpckhbw xmm4, xmm3            // VUVUVUVU

        punpcklbw xmm0, xmm2            // VYUYVYUY

        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6
static void __declspec(naked) yv12_yuy2_row_sse2_linear_interlaced() {

        movdqa xmm0, [ebx + eax*2]      // YYYYYYYY
        movdqa xmm1, [ebx + eax*2 + 16] // YYYYYYYY

        pavgb xmm2, [edx + ebp*2]       // UUUUUUUU
        pavgb xmm3, [esi + ebp*2]       // VVVVVVVV

        punpcklbw xmm2, xmm3            // VUVUVUVU
        punpckhbw xmm4, xmm3            // VUVUVUVU

        punpcklbw xmm0, xmm2            // VYUYVYUY

        movntdq [edi + eax*4], xmm0
        movntdq [edi + eax*4 + 16], xmm5
        movntdq [edi + eax*4 + 32], xmm1
        movntdq [edi + eax*4 + 48], xmm6
void __declspec(naked) yv12_yuy2_sse2(const BYTE* Y, const BYTE* U, const BYTE* V,
                                      int halfstride, unsigned halfwidth, unsigned height,
                                      BYTE* YUY2, int d_stride)

        mov ebx, [esp + 20] // Y
        mov edx, [esp + 24] // U
        mov esi, [esp + 28] // V
        mov edi, [esp + 44] // D
        mov ebp, [esp + 32] // uv_stride
        mov ecx, [esp + 36] // uv_width

        cmp dword ptr [esp + 40], 2

        sub dword ptr [esp + 40], 2
        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2_linear

        lea ebx, [ebx + ebp*2]

        cmp dword ptr [esp + 40], 2

        call yv12_yuy2_row_sse2

        dec dword ptr [esp + 40]

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2
void __declspec(naked) yv12_yuy2_sse2_interlaced(const BYTE* Y, const BYTE* U, const BYTE* V,
                                                 int halfstride, unsigned halfwidth, unsigned height,
                                                 BYTE* YUY2, int d_stride)

        mov ebx, [esp + 20] // Y
        mov edx, [esp + 24] // U
        mov esi, [esp + 28] // V
        mov edi, [esp + 44] // D
        mov ebp, [esp + 32] // uv_stride
        mov ecx, [esp + 36] // uv_width

        cmp dword ptr [esp + 40], 4

        sub dword ptr [esp + 40], 4
        call yv12_yuy2_row_sse2 // first row, first field

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2 // first row, second field

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2_linear_interlaced // second row, first field

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2_linear_interlaced // second row, second field

        lea ebx, [ebx + ebp*2]

        cmp dword ptr [esp + 40], 4

        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2

        lea ebx, [ebx + ebp*2]

        call yv12_yuy2_row_sse2
bool BitBltFromI420ToYUY2(int w, int h, BYTE* dst, int dstpitch, BYTE* srcy, BYTE* srcu, BYTE* srcv, int srcpitch, bool fInterlaced)

    if(w <= 0 || h <= 0 || (w&1) || (h&1))

    if(srcpitch == 0) srcpitch = w;

    if((g_cpuid.m_flags & CCpuID::sse2)
        && !((DWORD_PTR)srcy&15) && !((DWORD_PTR)srcu&15) && !((DWORD_PTR)srcv&15) && !(srcpitch&31)
        && !((DWORD_PTR)dst&15) && !(dstpitch&15))

        if(!fInterlaced) yv12_yuy2_sse2(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch);
        else yv12_yuy2_sse2_interlaced(srcy, srcu, srcv, srcpitch/2, w/2, h, dst, dstpitch);

    ASSERT(!fInterlaced);

    void (*yuvtoyuy2row)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width) = NULL;
    void (*yuvtoyuy2row_avg)(BYTE* dst, BYTE* srcy, BYTE* srcu, BYTE* srcv, DWORD width, DWORD pitchuv) = NULL;

    if((g_cpuid.m_flags & CCpuID::mmx) && !(w&7))

        yuvtoyuy2row = yuvtoyuy2row_MMX;
        yuvtoyuy2row_avg = yuvtoyuy2row_avg_MMX;

        yuvtoyuy2row = yuvtoyuy2row_c;
        yuvtoyuy2row_avg = yuvtoyuy2row_avg_c;

        yuvtoyuy2row(dst, srcy, srcu, srcv, w);
        yuvtoyuy2row_avg(dst + dstpitch, srcy + srcpitch, srcu, srcv, w, srcpitch/2);

        yuvtoyuy2row(dst, srcy, srcu, srcv, w);
        yuvtoyuy2row(dst + dstpitch, srcy + srcpitch, srcu, srcv, w);

    if(g_cpuid.m_flags & CCpuID::mmx)
bool BitBltFromRGBToRGB(int w, int h, BYTE* dst, int dstpitch, int dbpp, BYTE* src, int srcpitch, int sbpp)

    int rowbytes = w*dbpp>>3;

    if(rowbytes > 0 && rowbytes == srcpitch && rowbytes == dstpitch)

        memcpy_accel(dst, src, h*rowbytes);

    for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)
        memcpy_accel(dst, src, rowbytes);

    if(sbpp != 16 && sbpp != 24 && sbpp != 32
    || dbpp != 16 && dbpp != 24 && dbpp != 32)

    for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)

        BYTE* s = (BYTE*)src;
        WORD* d = (WORD*)dst;
        for(int x = 0; x < w; x++, s += 3, d++)
            *d = (WORD)(((*((DWORD*)s)>>8)&0xf800)|((*((DWORD*)s)>>5)&0x07e0)|((*((DWORD*)s)>>3)&0x1f));
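
        // The statement above reads the 24-bit pixel as a DWORD, so on little-endian
        // x86 the BGR triplet sits at bits 0-23 (B in 0-7, G in 8-15, R in 16-23);
        // the three shift-and-mask terms drop the low 3 (or 2) bits of each channel
        // and pack them into the 5-6-5 fields of the destination WORD.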
        DWORD* s = (DWORD*)src;
        WORD* d = (WORD*)dst;
        for(int x = 0; x < w; x++, s++, d++)
            *d = (WORD)(((*s>>8)&0xf800)|((*s>>5)&0x07e0)|((*s>>3)&0x1f));

    for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)

        WORD* s = (WORD*)src;
        BYTE* d = (BYTE*)dst;
        for(int x = 0; x < w; x++, s++, d += 3)
        { // not tested, r-g-b might be in reverse
            d[0] = (*s&0x001f)<<3;
            d[1] = (*s&0x07e0)>>3;
            d[2] = (*s&0xf800)>>8;
        BYTE* s = (BYTE*)src;
        BYTE* d = (BYTE*)dst;
        for(int x = 0; x < w; x++, s += 4, d += 3)
        {d[0] = s[0]; d[1] = s[1]; d[2] = s[2];}

    for(int y = 0; y < h; y++, src += srcpitch, dst += dstpitch)

        WORD* s = (WORD*)src;
        DWORD* d = (DWORD*)dst;
        for(int x = 0; x < w; x++, s++, d++)
            *d = ((*s&0xf800)<<8)|((*s&0x07e0)<<5)|((*s&0x001f)<<3);
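
        // 565 -> 32 bpp above: each field is shifted up so its most significant bit
        // lands on bit 7 of the corresponding 8-bit channel (R<<8 to bits 23-19,
        // G<<5 to bits 15-10, B<<3 to bits 7-3); the low channel bits are left zero
        // rather than replicated.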
        BYTE* s = (BYTE*)src;
        DWORD* d = (DWORD*)dst;
        for(int x = 0; x < w; x++, s += 3, d++)
            *d = *((DWORD*)s)&0xffffff;
static void asm_blend_row_clipped_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)

    BYTE* src2 = src + srcpitch;
    do {*dst++ = (*src++ + *src2++ + 1) >> 1;}

static void asm_blend_row_c(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)

    BYTE* src2 = src + srcpitch;
    BYTE* src3 = src2 + srcpitch;
    do {*dst++ = (*src++ + (*src2++ << 1) + *src3++ + 2) >> 2;}
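
// Both C rows implement rounded averages: the clipped variant is (a + b + 1) >> 1
// over two source lines, the full variant a [1 2 1]/4 vertical kernel written as
// (a + 2*b + c + 2) >> 2, where the +1 / +2 terms round to nearest instead of
// truncating.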
static void __declspec(naked) asm_blend_row_clipped_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)

    static const __int64 _x0001000100010001 = 0x0001000100010001;

        movq mm6, _x0001000100010001

        movq [edi+esi-8], mm1
static void __declspec(naked) asm_blend_row_MMX(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)

    static const __int64 mask0 = 0xfcfcfcfcfcfcfcfci64;
    static const __int64 mask1 = 0x7f7f7f7f7f7f7f7fi64;
    static const __int64 mask2 = 0x3f3f3f3f3f3f3f3fi64;
    static const __int64 _x0002000200020002 = 0x0002000200020002;

        movq mm6, _x0002000200020002

        movq mm2, [esi+edx*2]

        movq [edi+esi-8], mm1
// Sadly, the original code produces a lot of visible banding artifacts on YUV
// (the shifts without rounding seem to introduce too much error).
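// Truncating each ">> 2" loses up to 0.75 of an LSB per pixel, which is enough to
// show up as banding in smooth gradients; adding a bias before the shift (the
// _x0002000200020002 constant above appears to serve that purpose in the MMX path,
// matching the "+ 2" in asm_blend_row_c) rounds to nearest instead.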
        movq mm2, [esi+edx*2]

        movq [edi+esi-8], mm0

        test byte ptr [esp+28], 1
__declspec(align(16)) static BYTE const_1_16_bytes[] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
static void asm_blend_row_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)

        movdqa xmm7, [const_1_16_bytes]

    asm_blend_row_SSE2_loop:

        movdqa xmm1, [esi+edx]
        movdqa xmm2, [esi+edx*2]

        movdqa [esi+edi], xmm0

        jnz asm_blend_row_SSE2_loop

        jz asm_blend_row_SSE2_end

    asm_blend_row_SSE2_loop2:

        jnz asm_blend_row_SSE2_loop2

    asm_blend_row_SSE2_end:
static void asm_blend_row_clipped_SSE2(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch)

        movdqa xmm7, [const_1_16_bytes]

    asm_blend_row_clipped_SSE2_loop:

        movdqa xmm1, [esi+edx]

        movdqa [esi+edi], xmm0

        jnz asm_blend_row_clipped_SSE2_loop

        jz asm_blend_row_clipped_SSE2_end

    asm_blend_row_clipped_SSE2_loop2:

        jnz asm_blend_row_clipped_SSE2_loop2

    asm_blend_row_clipped_SSE2_end:
void DeinterlaceBlend(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch)

    void (*asm_blend_row_clipped)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;
    void (*asm_blend_row)(BYTE* dst, BYTE* src, DWORD w, DWORD srcpitch) = NULL;

    if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)src&0xf) && !((DWORD)dst&0xf) && !(srcpitch&0xf))

        asm_blend_row_clipped = asm_blend_row_clipped_SSE2;
        asm_blend_row = asm_blend_row_SSE2;

    else if(g_cpuid.m_flags & CCpuID::mmx)

        asm_blend_row_clipped = asm_blend_row_clipped_MMX;
        asm_blend_row = asm_blend_row_MMX;

        asm_blend_row_clipped = asm_blend_row_clipped_c;
        asm_blend_row = asm_blend_row_c;

    if(!asm_blend_row_clipped)

    asm_blend_row_clipped(dst, src, rowbytes, srcpitch);

        asm_blend_row(dst, src, rowbytes, srcpitch);

    asm_blend_row_clipped(dst + dstpitch, src, rowbytes, srcpitch);

    if(g_cpuid.m_flags & CCpuID::mmx)
void DeinterlaceBob(BYTE* dst, BYTE* src, DWORD rowbytes, DWORD h, DWORD dstpitch, DWORD srcpitch, bool topfield)

        BitBltFromRGBToRGB(rowbytes, h/2, dst, dstpitch*2, 8, src, srcpitch*2, 8);
        AvgLines8(dst, h, dstpitch);

        BitBltFromRGBToRGB(rowbytes, h/2, dst + dstpitch, dstpitch*2, 8, src + srcpitch, srcpitch*2, 8);
        AvgLines8(dst + dstpitch, h-1, dstpitch);
void AvgLines8(BYTE* dst, DWORD h, DWORD pitch)

    BYTE* d = dst + (h-2)*pitch;

    for(; s < d; s += pitch*2)

        if((g_cpuid.m_flags & CCpuID::sse2) && !((DWORD)tmp&0xf) && !((DWORD)pitch&0xf))

        AvgLines8_sse2_loop:
            pavgb xmm0, [esi+ebx*2]
            movdqa [esi+ebx], xmm0

            jnz AvgLines8_sse2_loop

            for(int i = pitch&7; i--; tmp++)
                tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;

        else if(g_cpuid.m_flags & CCpuID::mmx)

            movq mm2, [esi+ebx*2]

            jnz AvgLines8_mmx_loop

            for(int i = pitch&7; i--; tmp++)
                tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;

            for(int i = pitch; i--; tmp++)
                tmp[pitch] = (tmp[0] + tmp[pitch<<1] + 1) >> 1;

    if(!(h&1) && h >= 2)

        memcpy_accel(dst + pitch, dst, pitch);
void AvgLines555(BYTE* dst, DWORD h, DWORD pitch)

    unsigned __int64 __0x7c007c007c007c00 = 0x7c007c007c007c00;
    unsigned __int64 __0x03e003e003e003e0 = 0x03e003e003e003e0;
    unsigned __int64 __0x001f001f001f001f = 0x001f001f001f001f;

    BYTE* d = dst + (h-2)*pitch;

    for(; s < d; s += pitch*2)

        movq mm6, __0x03e003e003e003e0
        movq mm7, __0x001f001f001f001f

        psrlw mm0, 10 // red1 bits: mm0 = 001f001f001f001f
        pand mm1, mm6 // green1 bits: mm1 = 03e003e003e003e0
        pand mm2, mm7 // blue1 bits: mm2 = 001f001f001f001f

        movq mm3, [esi+ebx*2]

        psrlw mm3, 10 // red2 bits: mm3 = 001f001f001f001f
        pand mm4, mm6 // green2 bits: mm4 = 03e003e003e003e0
        pand mm5, mm7 // blue2 bits: mm5 = 001f001f001f001f

        psrlw mm0, 1  // (red1+red2)/2
        psllw mm0, 10 // red bits at 7c007c007c007c00

        psrlw mm1, 1  // (green1+green2)/2
        pand mm1, mm6 // green bits at 03e003e003e003e0

        psrlw mm2, 1  // (blue1+blue2)/2
        // blue bits at 001f001f001f001f (no need to pand, lower bits were discarded)

        jnz AvgLines555_loop
= (pitch
&7)>>1; i
--; tmp
++)
1383 ((((*tmp
&0x7c00) + (tmp
[pitch
<<1]&0x7c00)) >> 1)&0x7c00)|
1384 ((((*tmp
&0x03e0) + (tmp
[pitch
<<1]&0x03e0)) >> 1)&0x03e0)|
1385 ((((*tmp
&0x001f) + (tmp
[pitch
<<1]&0x001f)) >> 1)&0x001f);
1389 if(!(h
&1) && h
>= 2)
1392 memcpy_accel(dst
+ pitch
, dst
, pitch
);
1398 void AvgLines565(BYTE
* dst
, DWORD h
, DWORD pitch
)
1402 unsigned __int64 __0xf800f800f800f800
= 0xf800f800f800f800;
1403 unsigned __int64 __0x07e007e007e007e0
= 0x07e007e007e007e0;
1404 unsigned __int64 __0x001f001f001f001f
= 0x001f001f001f001f;
1407 BYTE
* d
= dst
+ (h
-2)*pitch
;
1409 for(; s
< d
; s
+= pitch
*2)
1411 WORD
* tmp
= (WORD
*)s
;
1421 movq mm6
, __0x07e007e007e007e0
1422 movq mm7
, __0x001f001f001f001f
1429 psrlw mm0
, 11 // red1 bits: mm0 = 001f001f001f001f
1430 pand mm1
, mm6
// green1 bits: mm1 = 07e007e007e007e0
1431 pand mm2
, mm7
// blue1 bits: mm2 = 001f001f001f001f
1433 movq mm3
, [esi
+ebx
*2]
1437 psrlw mm3
, 11 // red2 bits: mm3 = 001f001f001f001f
1438 pand mm4
, mm6
// green2 bits: mm4 = 07e007e007e007e0
1439 pand mm5
, mm7
// blue2 bits: mm5 = 001f001f001f001f
1442 psrlw mm0
, 1 // (red1+red2)/2
1443 psllw mm0
, 11 // red bits at f800f800f800f800
1446 psrlw mm1
, 1 // (green1+green2)/2
1447 pand mm1
, mm6
// green bits at 03e003e003e003e0
1450 psrlw mm2
, 1 // (blue1+blue2)/2
1451 // blue bits at 001f001f001f001f (no need to pand, lower bits were discareded)
1461 jnz AvgLines565_loop
1466 for(int i
= (pitch
&7)>>1; i
--; tmp
++)
1469 ((((*tmp
&0xf800) + (tmp
[pitch
<<1]&0xf800)) >> 1)&0xf800)|
1470 ((((*tmp
&0x07e0) + (tmp
[pitch
<<1]&0x07e0)) >> 1)&0x07e0)|
1471 ((((*tmp
&0x001f) + (tmp
[pitch
<<1]&0x001f)) >> 1)&0x001f);
1475 if(!(h
&1) && h
>= 2)
1478 memcpy_accel(dst
+ pitch
, dst
, pitch
);
1484 extern "C" void mmx_YUY2toRGB24(const BYTE
* src
, BYTE
* dst
, const BYTE
* src_end
, int src_pitch
, int row_size
, bool rec709
);
1485 extern "C" void mmx_YUY2toRGB32(const BYTE
* src
, BYTE
* dst
, const BYTE
* src_end
, int src_pitch
, int row_size
, bool rec709
);
1487 bool BitBltFromYUY2ToRGB(int w
, int h
, BYTE
* dst
, int dstpitch
, int dbpp
, BYTE
* src
, int srcpitch
)
1489 void (* YUY2toRGB
)(const BYTE
* src
, BYTE
* dst
, const BYTE
* src_end
, int src_pitch
, int row_size
, bool rec709
) = NULL
;
1491 if(g_cpuid
.m_flags
& CCpuID::mmx
)
1494 dbpp
== 32 ? mmx_YUY2toRGB32
:
1495 dbpp
== 24 ? mmx_YUY2toRGB24
:
1496 // dbpp == 16 ? mmx_YUY2toRGB16 : // TODO
1504 if(!YUY2toRGB
) return(false);
1506 YUY2toRGB(src
, dst
, src
+ h
*srcpitch
, srcpitch
, w
, false);