// xy_vsfilter: src/subpic/xy_intrinsics.h

#ifndef __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__
#define __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__

#ifdef LINUX
#include <pmmintrin.h>
#else
#include <intrin.h>
#endif

#include <WTypes.h>

//out: m128_1 = avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
//              0
//              avg(...)
//              0
//              ...
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    m128_1 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_srli_epi16(m128_2, 8); \
    m128_1 = _mm_avg_epu8(m128_1, m128_2);
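
//Test-only scalar sketch of AVERAGE_4_PIX_INTRINSICS (an addition in the
//style of the *_c helpers below, not part of the original header; like those
//helpers it relies on MSVC's m128i_u8 accessor).
static void average_4_pix_intrinsics_c(__m128i& m128i_1, const __m128i& m128i_2)
{
    for (int i=0;i<8;i++)
    {
        //average the two rows with rounding, then the two columns
        int lo = (m128i_1.m128i_u8[2*i] + m128i_2.m128i_u8[2*i] + 1)/2;
        int hi = (m128i_1.m128i_u8[2*i+1] + m128i_2.m128i_u8[2*i+1] + 1)/2;
        m128i_1.m128i_u8[2*i] = (lo + hi + 1)/2;
        m128i_1.m128i_u8[2*i+1] = 0;
    }
}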

//out: m128_1 = avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
//              avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
//              avg(...)
//              avg(...)
//              ...
#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
{\
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_or_si128(m128_2, m128_3);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
}
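
//Test-only scalar sketch of AVERAGE_4_PIX_INTRINSICS_2 (an addition, not part
//of the original header): the same 4-pixel average as above, but the result is
//duplicated into both bytes of each 16-bit lane.
static void average_4_pix_intrinsics_2_c(__m128i& m128i_1, const __m128i& m128i_2)
{
    for (int i=0;i<8;i++)
    {
        int lo = (m128i_1.m128i_u8[2*i] + m128i_2.m128i_u8[2*i] + 1)/2;
        int hi = (m128i_1.m128i_u8[2*i+1] + m128i_2.m128i_u8[2*i+1] + 1)/2;
        m128i_1.m128i_u8[2*i] = (lo + hi + 1)/2;
        m128i_1.m128i_u8[2*i+1] = m128i_1.m128i_u8[2*i];
    }
}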

//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = avg(U_last, u8[0], u8[1])
//              0
//              avg(u8[1], u8[2], u8[3])
//              0
//              ...
//     m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_3(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_srli_epi16(m128_1, 8);\
}

static void average_4_pix_intrinsics_3_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;
        last = u1;
        m128i_1.m128i_u8[2*i] = u0;
        m128i_1.m128i_u8[2*i+1] = 0;
    }
}

//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = 0
//              avg(U_last, u8[0], u8[1])
//              0
//              avg(u8[1], u8[2], u8[3])
//              ...
//     m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_4(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_2 = _mm_srli_epi16(m128_2, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
}

static void average_4_pix_intrinsics_4_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;
        last = u1;
        m128i_1.m128i_u8[2*i+1] = u0;
        m128i_1.m128i_u8[2*i] = 0;
    }
}

//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = avg(U_last, u8[0], u8[1])
//              avg(U_last, u8[0], u8[1])
//              avg(u8[1], u8[2], u8[3])
//              avg(u8[1], u8[2], u8[3])
//              ...
//     m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_5(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_2 = _mm_srli_epi16(m128_2, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
    m128_2 = _mm_srli_epi16(m128_1, 8);\
    m128_1 = _mm_or_si128(m128_1, m128_2);\
}

static void average_4_pix_intrinsics_5_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;
        last = u1;
        m128i_1.m128i_u8[2*i+1] = u0;
        m128i_1.m128i_u8[2*i] = u0;
    }
}

static void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        dst[0] = (dst[0] + tmp1 + 1)/2;
        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        dst[1] = (dst[1] + tmp1 + 1)/2;
    }
}

static __forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
        u_1 = _mm_packus_epi16(u_1, u_1);
        v_1 = _mm_packus_epi16(v_1, v_1);
        u_1 = _mm_unpacklo_epi8(u_1, v_1);

        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
}
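
//Hypothetical frame-level usage (an addition for illustration, not from the
//original header; assumes w is a multiple of 16, h is even, and output rows
//are dst_pitch bytes apart): each pair of source chroma rows is averaged
//vertically and emitted as one interleaved UV row.
static void subsample_and_interlace_frame_sse2_sketch(BYTE* dst, const BYTE* u, const BYTE* v,
    int w, int h, int pitch, int dst_pitch)
{
    for (int y=0; y<h; y+=2, u+=2*pitch, v+=2*pitch, dst+=dst_pitch)
    {
        subsample_and_interlace_2_line_sse2(dst, u, v, w, pitch);
    }
}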

static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
    __m128i zero = _mm_setzero_si128();

    __m128i ones;
#ifdef _DEBUG
    ones = _mm_setzero_si128();//disable warning C4700
#endif
    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128 = _mm_unpackhi_epi8(alpha128, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}
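
//Test-only scalar sketch of pix_alpha_blend_yv12_luma_sse2 (an addition in
//the style of the *_c helpers in this file, not part of the original header).
//alpha is an inverse alpha: the cmpeq/unpack "ones" trick above adds 0xff to
//dst*alpha before the >>8 exactly where alpha==0xff, so that
//(dst*0xff+0xff)>>8 == dst and a fully transparent pixel passes through.
static void pix_alpha_blend_yv12_luma_c_sketch(byte* dst, const byte* alpha, const byte* sub)
{
    for (int i=0;i<16;i++)
    {
        int tmp = alpha[i]==0xff ? dst[i] : ((dst[i]*alpha[i])>>8);
        tmp += sub[i];
        dst[i] = tmp>0xff ? 0xff : tmp;//_mm_adds_epu8 saturates the same way
    }
}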

/***
 * NOTE: output is not exactly identical to pix_alpha_blend_yv12_chroma
 ***/
static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
    __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

    __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );

    AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);

    __m128i ones;
#ifdef _DEBUG
    ones = _mm_setzero_si128();//disable warning C4700
#endif
    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones, alpha128_1);

    dst128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i dst128_2 = _mm_and_si128(dst128, ones);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, dst128_2);

    dst128 = _mm_srli_epi16(dst128, 8);

    AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);

    dst128 = _mm_adds_epi16(dst128, sub128_1);
    dst128 = _mm_packus_epi16(dst128, dst128);

    _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
}

static __forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    //important!
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    __m128i alpha_ff;
#ifdef _DEBUG
    alpha_ff = _mm_setzero_si128();//disable warning C4700
#endif
    alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2;
#ifdef _DEBUG
    ones2 = _mm_setzero_si128();//disable warning C4700
#endif
    ones2 = _mm_cmpeq_epi32(ones2,ones2);

    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}

//for test only
static void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<16;i++)
    {
        if (src_alpha[i]!=0xff)
        {
            dst_word[i] = ((dst_word[i]*src_alpha[i])>>8) + (src[i]<<8);
        }
    }
}
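
//Scalar model of the mulhi trick in mix_16_y_p010_sse2 (an illustrative
//addition, not from the original header): (alpha<<8)+0x100 does not fit in
//16 bits when alpha==0xff, so the sse2 code builds the multiplier
//(alpha<<8)|0xff via the alpha_ff interleave, and the "ones" step adds 1 back
//wherever dst!=0 and the multiplier is odd (i.e. alpha==0xff), which makes
//_mm_mulhi_epu16 return dst unchanged for fully transparent pixels.
static WORD mulhi_alpha_sketch(WORD dst, BYTE alpha)
{
    unsigned m = (alpha<<8) | (alpha==0xff ? 0xff : 0);
    unsigned r = ((unsigned)dst*m)>>16;//what _mm_mulhi_epu16 computes
    if (dst!=0 && (m&1)) r += 1;       //the cmpeq/xor/srli/and correction
    return (WORD)r;                    //== (dst*alpha)>>8, and == dst when alpha==0xff
}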

static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    unsigned int ia = (alpha[0]+alpha[1]+
                       alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
    if(ia!=0xff)
    {
        *dst = (((*dst)*ia)>>8) + ((src[0]+src[1]+
                                    src[src_pitch]+src[1+src_pitch])>>2);
    }
}

static __forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    //important!
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );

    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);

    __m128i alpha_ff;
#ifdef _DEBUG
    alpha_ff = _mm_setzero_si128();//disable warning C4700
#endif
    alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);

    __m128i ones2;
#ifdef _DEBUG
    ones2 = _mm_setzero_si128();//disable warning C4700
#endif
    ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst += 16;
    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );
}

static void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xFF )
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[1] = tmp;
        }
    }
}

static __forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
    __m128i zero = _mm_setzero_si128();

    __m128i ones;
#ifdef _DEBUG
    ones = _mm_setzero_si128();//disable warning C4700
#endif
    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128_1);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}

//for test only
static void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
    {
        unsigned int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if( ia!=0xFF )
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}

/******
 * hleft_vmid:
 * chroma placement (x=Y, o=U,V):
 *   x x x x ...
 *   o   o   ...
 *   x x x x ...
 *   o   o   ...
 *   x x x x ...
 ******/
static __forceinline void hleft_vmid_subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch, int last_src_id=0)
{
    const BYTE* end = u + w;
    BYTE last_u = (u[last_src_id]+u[last_src_id+pitch]+1)/2;
    BYTE last_v = (v[last_src_id]+v[last_src_id+pitch]+1)/2;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        last_u = (tmp1+last_u+1)/2;
        dst[0] = (dst[0] + last_u + 1)/2;
        last_u = tmp1;

        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        last_v = (tmp1+last_v+1)/2;
        dst[1] = (last_v + dst[1] + 1)/2;
        last_v = tmp1;
    }
}

// @w : w % 16 must be 0!
static __forceinline void hleft_vmid_subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch, int last_src_id=0)
{
    const BYTE* end_mod16 = u + (w&~15);

    __m128i u_last = _mm_cvtsi32_si128( (u[last_src_id]+u[pitch+last_src_id]+1)<<7 );
    __m128i v_last = _mm_cvtsi32_si128( (v[last_src_id]+v[pitch+last_src_id]+1)<<7 );
    for (;u<end_mod16;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        u_1 = _mm_avg_epu8(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS_3(u_1, u_last);
        v_1 = _mm_avg_epu8(v_1, v_2);
        AVERAGE_4_PIX_INTRINSICS_4(v_1, v_last);
        u_1 = _mm_or_si128(u_1, v_1);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
    //The following fails if dst==u:
    //hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w&15, pitch, w>15?-1:0);
}

static __forceinline void hleft_vmid_mix_uv_yv12_c(byte* dst, int w, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[last_src_id+src_pitch]+1)/2;
    int last_sub = (src[last_src_id]+src[last_src_id+src_pitch]+1)/2;
    const BYTE* end = src + w;
    for(; src < end; src += 2, am += 2, dst++)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp1 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp1 + 1)/2;
        ia = (ia + last_alpha + 1)/2;
        last_alpha = tmp1;

        if(ia!=0xff)
        {
            tmp1 = (src[0]+src[0+src_pitch]+1)/2;
            int tmp2 = (src[1]+src[1+src_pitch]+1)/2;
            last_sub = (last_sub+tmp2+1)/2;
            tmp1 = (tmp1+last_sub+1)/2;
            last_sub = tmp2;

            *dst = (((*dst)*ia)>>8) + tmp1;
        }
        else
        {
            last_sub = (src[1]+src[1+src_pitch]+1)/2;
        }
    }
}

//0<=w15<=15 && w15%2==0
static __forceinline void hleft_vmid_mix_uv_yv12_c2(byte* dst, int w15, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    ASSERT(w15>=0 && w15<=15 && (w15&1)==0 );

    int last_alpha = (am[last_src_id]+am[last_src_id+src_pitch]+1)/2;
    int last_sub = (src[last_src_id]+src[last_src_id+src_pitch]+1)/2;
    const BYTE* end = src + w15;

    switch(w15)//cases intentionally fall through: one 2-pixel mix per remaining pair
    {
    case 14:
#define _hleft_vmid_mix_uv_yv12_c2_mix_2 \
        int ia = (am[0]+am[0+src_pitch]+1)/2;\
        int tmp1 = (am[1]+am[1+src_pitch]+1)/2;\
        last_alpha = (last_alpha + tmp1 + 1)/2;\
        ia = (ia + last_alpha + 1)/2;\
        last_alpha = tmp1;\
        if(ia!=0xff)\
        {\
            tmp1 = (src[0]+src[0+src_pitch]+1)/2;\
            int tmp2 = (src[1]+src[1+src_pitch]+1)/2;\
            last_sub = (last_sub+tmp2+1)/2;\
            tmp1 = (tmp1+last_sub+1)/2;\
            last_sub = tmp2;\
            *dst = (((*dst)*ia)>>8) + tmp1;\
        }\
        else\
        {\
            last_sub = (src[1]+src[1+src_pitch]+1)/2;\
        }src += 2, am += 2, dst++

        { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    case 12:
        { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    case 10:
        { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    case 8:
        { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    case 6:
        { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    case 4:
        { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    case 2:
        { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    }
}

// am[last_src_id] valid && w&15==0
static __forceinline void hleft_vmid_mix_uv_yv12_sse2(byte* dst, int w00, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    ASSERT( (( (2*(int)dst) | w00 | (int)src | (int)am | src_pitch)&15)==0 );

    __m128i last_src = _mm_cvtsi32_si128( (src[last_src_id]+src[src_pitch+last_src_id]+1)<<7 );
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + (w00&~15);
    for(; src < end_mod16; src += 16, am += 16, dst+=8)
    {
        __m128i zero = _mm_setzero_si128();

        __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i tmp = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );
        alpha128_1 = _mm_avg_epu8(alpha128_1, tmp);
        AVERAGE_4_PIX_INTRINSICS_3(alpha128_1, last_alpha);

        __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

        __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
        tmp = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );
        sub128_1 = _mm_avg_epu8(sub128_1, tmp);
        AVERAGE_4_PIX_INTRINSICS_3(sub128_1, last_src);

        __m128i ones;
#ifdef _DEBUG
        ones = _mm_setzero_si128();//disable warning C4700
#endif
        ones = _mm_cmpeq_epi32(ones,ones);
        ones = _mm_cmpeq_epi8(ones, alpha128_1);

        dst128 = _mm_unpacklo_epi8(dst128, zero);
        __m128i dst128_2 = _mm_and_si128(dst128, ones);

        dst128 = _mm_mullo_epi16(dst128, alpha128_1);
        dst128 = _mm_adds_epu16(dst128, dst128_2);

        dst128 = _mm_srli_epi16(dst128, 8);

        dst128 = _mm_adds_epi16(dst128, sub128_1);
        dst128 = _mm_packus_epi16(dst128, dst128);

        _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
    }
}
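
//Hypothetical caller sketch (an addition for illustration, not from the
//original header; assumes w even): the sse2 routine above covers the width
//rounded down to a multiple of 16, and hleft_vmid_mix_uv_yv12_c2 finishes the
//0..14 remaining bytes. Passing last_src_id = -1 lets the tail average
//against the column the sse2 pass ended on, mirroring the commented-out tail
//call in hleft_vmid_subsample_and_interlace_2_line_sse2.
static void hleft_vmid_mix_uv_yv12_sketch(byte* dst, int w, const byte* src, const byte* am, int src_pitch)
{
    int w00 = w&~15;
    hleft_vmid_mix_uv_yv12_sse2(dst, w00, src, am, src_pitch);
    hleft_vmid_mix_uv_yv12_c2(dst + w00/2, w&15, src + w00, am + w00, src_pitch, w00>0 ? -1 : 0);
}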

static __forceinline void hleft_vmid_mix_uv_p010_c(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    const BYTE* end = src + w;
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for(; src < end; src+=2, am+=2, dst_word+=2)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp2 + 1)/2;
        ia = (ia + last_alpha + 1)/2;
        last_alpha = tmp2;

        if( ia!=0xFF )
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
#ifdef XY_UNIT_TEST
            tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);//if(tmp>0xffff) tmp = 0xffff;
#endif
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
#ifdef XY_UNIT_TEST
            tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);//if(tmp>0xffff) tmp = 0xffff;
#endif
            dst_word[1] = tmp;
        }
    }
}

//0<=w15<=15 && w15%2==0
static __forceinline void hleft_vmid_mix_uv_p010_c2(BYTE* dst, int w15, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT(w15>=0 && w15<=15 && (w15&1)==0 );
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    WORD* dst_word = reinterpret_cast<WORD*>(dst);

#ifdef XY_UNIT_TEST
#  define _hleft_vmid_mix_uv_p010_c2_CLIP(tmp) tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);/*if(tmp>0xffff) tmp = 0xffff;*/
#else
#  define _hleft_vmid_mix_uv_p010_c2_CLIP(tmp)
#endif

    switch(w15)//cases intentionally fall through: one 2-pixel mix per remaining pair
    {
    case 14:
#define _hleft_vmid_mix_uv_p010_c2_mix_2 \
        int ia = (am[0]+am[0+src_pitch]+1)/2;\
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;\
        last_alpha = (last_alpha + tmp2 + 1)/2;\
        ia = (ia + last_alpha + 1)/2;\
        last_alpha = tmp2;\
        if( ia!=0xFF )\
        {\
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);\
            _hleft_vmid_mix_uv_p010_c2_CLIP(tmp);\
            dst_word[0] = tmp;\
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);\
            _hleft_vmid_mix_uv_p010_c2_CLIP(tmp);\
            dst_word[1] = tmp;\
        } src+=2, am+=2, dst_word+=2

        { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    case 12:
        { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    case 10:
        { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    case 8:
        { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    case 6:
        { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    case 4:
        { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    case 2:
        { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    }
}

// am[last_src_id] valid && w&15==0
static __forceinline void hleft_vmid_mix_uv_p010_sse2(BYTE* dst, int w00, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT( (((int)dst | w00 | (int)src | (int)am | src_pitch)&15)==0 );
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + w00;
    for(; src < end_mod16; src+=16, am+=16, dst+=32)
    {
        //important!
        __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );

        __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
        __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

        alpha = _mm_avg_epu8(alpha, alpha2);
        AVERAGE_4_PIX_INTRINSICS_5(alpha, last_alpha);

        __m128i alpha_ff;
#ifdef _DEBUG
        alpha_ff = _mm_setzero_si128();//disable warning C4700
#endif
        alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

        alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

        __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
        //so we do it another way
        //first, (alpha<<8)+0xff
        __m128i ones = _mm_setzero_si128();
        ones = _mm_cmpeq_epi16(dst_y, ones);

        __m128i ones2;
#ifdef _DEBUG
        ones2 = _mm_setzero_si128();//disable warning C4700
#endif
        ones2 = _mm_cmpeq_epi32(ones2,ones2);
        ones = _mm_xor_si128(ones, ones2);
        ones = _mm_srli_epi16(ones, 15);
        ones = _mm_and_si128(ones, lo);

        dst_y = _mm_mulhi_epu16(dst_y, lo);
        dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

        lo = _mm_setzero_si128();
        lo = _mm_unpacklo_epi8(lo, src_y);
        dst_y = _mm_adds_epu16(dst_y, lo);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

        dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst+16) );

        lo = _mm_unpackhi_epi8(alpha_ff, alpha);

        ones = _mm_setzero_si128();
        ones = _mm_cmpeq_epi16(dst_y, ones);
        ones = _mm_xor_si128(ones, ones2);
        ones = _mm_srli_epi16(ones, 15);
        ones = _mm_and_si128(ones, lo);

        dst_y = _mm_mulhi_epu16(dst_y, lo);
        dst_y = _mm_adds_epu16(dst_y, ones);

        lo = _mm_setzero_si128();
        lo = _mm_unpackhi_epi8(lo, src_y);
        dst_y = _mm_adds_epu16(dst_y, lo);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst+16), dst_y );
    }
}
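
//Hypothetical caller sketch for the P010 path (an addition for illustration,
//not from the original header; assumes w even): the destination holds 16-bit
//samples, so the sse2 body above writes 32 dst bytes per 16 src/alpha bytes,
//and the scalar tail therefore starts at dst + 2*w00.
static void hleft_vmid_mix_uv_p010_sketch(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch)
{
    int w00 = w&~15;
    hleft_vmid_mix_uv_p010_sse2(dst, w00, src, am, src_pitch);
    hleft_vmid_mix_uv_p010_c2(dst + 2*w00, w&15, src + w00, am + w00, src_pitch, w00>0 ? -1 : 0);
}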

static __forceinline void hleft_vmid_mix_uv_nv12_c(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    const BYTE* end = src + w;
    for(; src < end; src+=2, am+=2, dst+=2)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp2 + 1)/2;
        ia = (ia + last_alpha + 1)/2;
        last_alpha = tmp2;
        if ( ia!=0xFF )
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}

//0<=w15<=15 && w15%2==0
static __forceinline void hleft_vmid_mix_uv_nv12_c2(BYTE* dst, int w15, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT(w15>=0 && w15<=15 && (w15&1)==0 );
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;

    switch(w15)//cases intentionally fall through: one 2-pixel mix per remaining pair
    {
    case 14:
#define _hleft_vmid_mix_uv_nv12_c2_mix_2 \
        int ia = (am[0]+am[0+src_pitch]+1)/2;\
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;\
        last_alpha = (last_alpha + tmp2 + 1)/2;\
        ia = (ia + last_alpha + 1)/2;\
        last_alpha = tmp2;\
        if ( ia!=0xFF )\
        {\
            dst[0] = (((dst[0])*ia)>>8) + src[0];\
            dst[1] = (((dst[1])*ia)>>8) + src[1];\
        }\
        src+=2, am+=2, dst+=2

        { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    case 12:
        { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    case 10:
        { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    case 8:
        { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    case 6:
        { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    case 4:
        { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    case 2:
        { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    }
}

// am[last_src_id] valid && w&15==0
static __forceinline void hleft_vmid_mix_uv_nv12_sse2(BYTE* dst, int w00, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT( (((int)dst | w00 | (int)src | (int)am | src_pitch)&15)==0 );
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + w00;
    for(; src < end_mod16; src+=16, am+=16, dst+=16)
    {
        __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
        __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );
        __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

        alpha128_1 = _mm_avg_epu8(alpha128_1, alpha128_2);
        AVERAGE_4_PIX_INTRINSICS_5(alpha128_1, last_alpha);

        __m128i zero = _mm_setzero_si128();

        __m128i ones;
#ifdef _DEBUG
        ones = _mm_setzero_si128();//disable warning C4700
#endif
        ones = _mm_cmpeq_epi32(ones,ones);
        ones = _mm_cmpeq_epi8(ones,alpha128_1);

        __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
        alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

        __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

        dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
        dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
        dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

        dst128 = _mm_unpackhi_epi8(dst128, zero);
        alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

        ones2 = _mm_unpackhi_epi8(ones, zero);

        dst128 = _mm_mullo_epi16(dst128, alpha128_1);
        dst128 = _mm_adds_epu16(dst128, ones2);
        dst128 = _mm_srli_epi16(dst128, 8);
        dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

        dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
    }
}

#endif // __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__