#ifndef __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__
#define __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__

#include <emmintrin.h>
//out: m128_1 = avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
#define AVERAGE_4_PIX_INTRINSICS(m128_1, m128_2) \
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    m128_1 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_srli_epi16(m128_2, 8); \
    m128_1 = _mm_avg_epu8(m128_1, m128_2);
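// Illustration only (not part of the original header): a scalar sketch of what
// AVERAGE_4_PIX_INTRINSICS computes, assuming the MSVC __m128i union members
// (m128i_u8/m128i_u16) are available. Each output 16-bit lane holds the rounded
// average of a horizontal pixel pair from each of the two input rows.
static void average_4_pix_intrinsics_c_sketch(__m128i& m128i_1, const __m128i& m128i_2)
{
    for (int i = 0; i < 8; i++)
    {
        // vertical average of the two rows, then horizontal average of the pair
        int a0 = (m128i_1.m128i_u8[2*i]   + m128i_2.m128i_u8[2*i]   + 1) / 2;
        int a1 = (m128i_1.m128i_u8[2*i+1] + m128i_2.m128i_u8[2*i+1] + 1) / 2;
        m128i_1.m128i_u16[i] = (WORD)((a0 + a1 + 1) / 2);
    }
}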
//out: m128_1 = avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
//              avg(m128_1.u8[0],m128_1.u8[1],m128_2.u8[0],m128_2.u8[1])
#define AVERAGE_4_PIX_INTRINSICS_2(m128_1, m128_2) \
{\
    m128_1 = _mm_avg_epu8(m128_1, m128_2); \
    m128_2 = _mm_slli_epi16(m128_1, 8); \
    __m128i m128_3 = _mm_srli_epi16(m128_1, 8); \
    m128_2 = _mm_or_si128(m128_2, m128_3);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
}
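// Illustration only (not part of the original header): scalar sketch of
// AVERAGE_4_PIX_INTRINSICS_2. Unlike the first macro, the rounded average is
// replicated into both bytes of every 16-bit lane of m128_1.
static void average_4_pix_intrinsics_2_c_sketch(__m128i& m128i_1, const __m128i& m128i_2)
{
    for (int i = 0; i < 8; i++)
    {
        int a0 = (m128i_1.m128i_u8[2*i]   + m128i_2.m128i_u8[2*i]   + 1) / 2;
        int a1 = (m128i_1.m128i_u8[2*i+1] + m128i_2.m128i_u8[2*i+1] + 1) / 2;
        int avg = (a0 + a1 + 1) / 2;
        m128i_1.m128i_u8[2*i]   = (BYTE)avg;
        m128i_1.m128i_u8[2*i+1] = (BYTE)avg;
    }
}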
//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = avg(U_last, u8[0], u8[1])
//              avg(u8[1], u8[2], u8[3])
//      ...
//      m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_3(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_srli_epi16(m128_1, 8);\
}
static void average_4_pix_intrinsics_3_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;

        m128i_1.m128i_u8[2*i] = u0;
        m128i_1.m128i_u8[2*i+1] = 0;
    }
}
//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = avg(U_last, u8[0], u8[1])
//              avg(u8[1], u8[2], u8[3])
//      ...    (stored in the high byte of each 16-bit lane; low bytes = 0)
//      m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_4(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_2 = _mm_srli_epi16(m128_2, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
}
static void average_4_pix_intrinsics_4_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;

        m128i_1.m128i_u8[2*i+1] = u0;
        m128i_1.m128i_u8[2*i] = 0;
    }
}
//in : m128_1 = whatever, m128_last = u8 U_last 0 0 0 ... 0
//out: m128_1 = avg(U_last, u8[0], u8[1])
//              avg(U_last, u8[0], u8[1])
//              avg(u8[1], u8[2], u8[3])
//              avg(u8[1], u8[2], u8[3])
//      ...
//      m128_last = m128_1.u8[14] m128_1.u8[15] 0 0 0 ... 0
#define AVERAGE_4_PIX_INTRINSICS_5(m128_1, m128_last) \
{\
    __m128i m128_2 = _mm_slli_si128(m128_1,2);\
    m128_2 = _mm_or_si128(m128_2, m128_last);\
    m128_2 = _mm_avg_epu8(m128_2, m128_1);\
    m128_last = _mm_srli_si128(m128_1,14);\
    m128_2 = _mm_srli_epi16(m128_2, 8);\
    m128_1 = _mm_avg_epu8(m128_1, m128_2);\
    m128_1 = _mm_slli_epi16(m128_1, 8);\
    m128_2 = _mm_srli_epi16(m128_1, 8);\
    m128_1 = _mm_or_si128(m128_1, m128_2);\
}
static void average_4_pix_intrinsics_5_c(__m128i& m128i_1, __m128i& m128i_last)
{
    int last = m128i_last.m128i_u8[1];
    m128i_last.m128i_u8[0] = m128i_1.m128i_u8[14];
    m128i_last.m128i_u8[1] = m128i_1.m128i_u8[15];
    for (int i=2;i<16;i++)
    {
        m128i_last.m128i_u8[i] = 0;
    }
    for (int i=0;i<8;i++)
    {
        int u0 = m128i_1.m128i_u8[2*i];
        int u1 = m128i_1.m128i_u8[2*i+1];
        last = (last + u1 + 1)/2;
        u0 = (last + u0 + 1)/2;

        m128i_1.m128i_u8[2*i+1] = u0;
        m128i_1.m128i_u8[2*i] = u0;
    }
}
static void subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        dst[0] = (dst[0] + tmp1 + 1)/2;
        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        dst[1] = (dst[1] + tmp1 + 1)/2;
    }
}
static __forceinline void subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch)
{
    const BYTE* end = u + w;
    for (;u<end;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        AVERAGE_4_PIX_INTRINSICS(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS(v_1, v_2);
        u_1 = _mm_packus_epi16(u_1, u_1);
        v_1 = _mm_packus_epi16(v_1, v_1);
        u_1 = _mm_unpacklo_epi8(u_1, v_1);

        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
}
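// Illustration only (not part of the original header): one plausible way to drive
// the SSE2 subsample/interlace routine over a whole chroma plane. The names
// uv_dst, u_plane, v_plane, width, height, pitch are hypothetical; the caller
// must guarantee 16-byte alignment and width%16==0.
static void subsample_and_interlace_plane_sketch(BYTE* uv_dst, const BYTE* u_plane, const BYTE* v_plane,
                                                 int width, int height, int pitch)
{
    // every two source rows produce one interleaved UVUV... output row
    for (int y = 0; y < height; y += 2)
    {
        subsample_and_interlace_2_line_sse2(uv_dst, u_plane, v_plane, width, pitch);
        u_plane += 2*pitch;
        v_plane += 2*pitch;
        uv_dst  += width;   // one output UV row: width/2 U + width/2 V samples
    }
}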
static __forceinline void pix_alpha_blend_yv12_luma_sse2(byte* dst, const byte* alpha, const byte* sub)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(sub) );
    __m128i zero = _mm_setzero_si128();

    __m128i ones;
    ones = _mm_setzero_si128();//disable warning C4700

    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i alpha_lo128 = _mm_unpacklo_epi8(alpha128, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha_lo128);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128 = _mm_unpackhi_epi8(alpha128, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}
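// Illustration only (not part of the original header): scalar sketch of the
// per-pixel formula implemented by pix_alpha_blend_yv12_luma_sse2. Where alpha
// is 0xFF an extra +0xFF before the >>8 makes the destination pass through
// unchanged; sub is then added with saturation, mirroring _mm_adds_epu8.
static void pix_alpha_blend_yv12_luma_c_sketch(byte* dst, const byte* alpha, const byte* sub)
{
    for (int i = 0; i < 16; i++)
    {
        int a = alpha[i];
        int tmp = dst[i]*a + (a==0xff ? 0xff : 0);  // mullo + adds of the 0xFF mask
        tmp >>= 8;
        tmp += sub[i];
        dst[i] = (byte)(tmp > 0xff ? 0xff : tmp);   // saturating add
    }
}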
/***
 * output not exactly identical to pix_alpha_blend_yv12_chroma
 **/
static __forceinline void pix_alpha_blend_yv12_chroma_sse2(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(alpha+src_pitch) );
    __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

    __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i sub128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );

    AVERAGE_4_PIX_INTRINSICS(alpha128_1, alpha128_2);

    __m128i ones;
    ones = _mm_setzero_si128();//disable warning C4700

    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones, alpha128_1);

    dst128 = _mm_unpacklo_epi8(dst128, zero);
    __m128i dst128_2 = _mm_and_si128(dst128, ones);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, dst128_2);

    dst128 = _mm_srli_epi16(dst128, 8);

    AVERAGE_4_PIX_INTRINSICS(sub128_1, sub128_2);

    dst128 = _mm_adds_epi16(dst128, sub128_1);
    dst128 = _mm_packus_epi16(dst128, dst128);

    _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
}
static __forceinline void mix_16_y_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    __m128i alpha_ff;
    alpha_ff = _mm_setzero_si128();//disable warning C4700

    alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    __m128i ones2;
    ones2 = _mm_setzero_si128();//disable warning C4700

    ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst+16) );//second 8 samples

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst+16), dst_y );
}
static void mix_16_y_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<16;i++)
    {
        if (src_alpha[i]!=0xff)
        {
            dst_word[i] = ((dst_word[i]*src_alpha[i])>>8) + (src[i]<<8);
        }
    }
}
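// Worked example (added for illustration): with dst_word[i]=0x8000, src_alpha[i]=0x40
// and src[i]=0xA0, the blend gives (0x8000*0x40)>>8 + (0xA0<<8)
// = 0x2000 + 0xA000 = 0xC000; the SSE2 path additionally saturates such sums to 0xFFFF.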
static __forceinline void pix_alpha_blend_yv12_chroma(byte* dst, const byte* src, const byte* alpha, int src_pitch)
{
    unsigned int ia = (alpha[0]+alpha[1]+
                       alpha[0+src_pitch]+alpha[1+src_pitch])>>2;
    if (ia!=0xff)
    {
        *dst = (((*dst)*ia)>>8) + ((src[0] +src[1]+
                                    src[src_pitch]+src[1+src_pitch] )>>2);
    }
}
static __forceinline void mix_16_uv_p010_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );

    __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
    __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha, alpha2);

    __m128i alpha_ff;
    alpha_ff = _mm_setzero_si128();//disable warning C4700

    alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

    alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

    __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
    //so we do it another way
    //first, (alpha<<8)+0xff
    __m128i ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    __m128i ones2;
    ones2 = _mm_setzero_si128();//disable warning C4700

    ones2 = _mm_cmpeq_epi32(ones2,ones2);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

    lo = _mm_setzero_si128();
    lo = _mm_unpacklo_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

    dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst+16) );//second 8 samples

    lo = _mm_unpackhi_epi8(alpha_ff, alpha);

    ones = _mm_setzero_si128();
    ones = _mm_cmpeq_epi16(dst_y, ones);
    ones = _mm_xor_si128(ones, ones2);
    ones = _mm_srli_epi16(ones, 15);
    ones = _mm_and_si128(ones, lo);

    dst_y = _mm_mulhi_epu16(dst_y, lo);
    dst_y = _mm_adds_epu16(dst_y, ones);

    lo = _mm_setzero_si128();
    lo = _mm_unpackhi_epi8(lo, src_y);
    dst_y = _mm_adds_epu16(dst_y, lo);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst+16), dst_y );
}
static void mix_16_uv_p010_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst_word+=2)
    {
        int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if (ia!=0xff)
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
            if(tmp>0xffff) tmp = 0xffff;
            dst_word[1] = tmp;
        }
    }
}
static __forceinline void mix_16_uv_nvxx_sse2(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
    __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha) );
    __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(src_alpha+pitch) );
    __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

    AVERAGE_4_PIX_INTRINSICS_2(alpha128_1, alpha128_2);
    __m128i zero = _mm_setzero_si128();

    __m128i ones;
    ones = _mm_setzero_si128();//disable warning C4700

    ones = _mm_cmpeq_epi32(ones,ones);
    ones = _mm_cmpeq_epi8(ones,alpha128_1);

    __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
    alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

    __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

    dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
    dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
    dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

    dst128 = _mm_unpackhi_epi8(dst128, zero);
    alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

    ones2 = _mm_unpackhi_epi8(ones, zero);

    dst128 = _mm_mullo_epi16(dst128, alpha128_1);
    dst128 = _mm_adds_epu16(dst128, ones2);
    dst128 = _mm_srli_epi16(dst128, 8);
    dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

    dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
    _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
}
static void mix_16_uv_nvxx_c(BYTE* dst, const BYTE* src, const BYTE* src_alpha, int pitch)
{
    for (int i=0;i<8;i++, src_alpha+=2, src+=2, dst+=2)
    {
        int ia = (
            (src_alpha[0]+src_alpha[0+pitch]+1)/2+
            (src_alpha[1]+src_alpha[1+pitch]+1)/2+1)/2;
        if (ia!=0xff)
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}
/***
 * chroma placement(x=Y, o=U,V):
 **/
static __forceinline void hleft_vmid_subsample_and_interlace_2_line_c(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch, int last_src_id=0)
{
    const BYTE* end = u + w;
    BYTE last_u = (u[last_src_id]+u[last_src_id+pitch]+1)/2;
    BYTE last_v = (v[last_src_id]+v[last_src_id+pitch]+1)/2;
    for (;u<end;dst+=2,u+=2,v+=2)
    {
        dst[0] = (u[0] + u[0+pitch] + 1)/2;
        int tmp1 = (u[1] + u[1+pitch] + 1)/2;
        last_u = (tmp1+last_u+1)/2;
        dst[0] = (dst[0] + last_u + 1)/2;

        dst[1] = (v[0] + v[0+pitch] + 1)/2;
        tmp1 = (v[1] + v[1+pitch] + 1)/2;
        last_v = (tmp1+last_v+1)/2;
        dst[1] = (last_v + dst[1] + 1)/2;
    }
}
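// Worked example (added for illustration): with U samples u[0]=100, u[0+pitch]=102,
// u[1]=110, u[1+pitch]=112 and last_u=90, the loop above computes
// dst[0] = (100+102+1)/2 = 101, tmp1 = (110+112+1)/2 = 111,
// last_u = (111+90+1)/2 = 101, dst[0] = (101+101+1)/2 = 101,
// i.e. a rounded average weighted toward the left sample of the pair
// (left-sited horizontally, mid-sited vertically).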
// @w : w % 16 must == 0!
static __forceinline void hleft_vmid_subsample_and_interlace_2_line_sse2(BYTE* dst, const BYTE* u, const BYTE* v, int w, int pitch, int last_src_id=0)
{
    const BYTE* end_mod16 = u + (w&~15);

    __m128i u_last = _mm_cvtsi32_si128( (u[last_src_id]+u[pitch+last_src_id]+1)<<7 );
    __m128i v_last = _mm_cvtsi32_si128( (v[last_src_id]+v[pitch+last_src_id]+1)<<7 );
    for (;u<end_mod16;dst+=16,u+=16,v+=16)
    {
        __m128i u_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(u) );
        __m128i u_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(u+pitch) );
        __m128i v_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(v) );
        __m128i v_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(v+pitch) );
        u_1 = _mm_avg_epu8(u_1, u_2);
        AVERAGE_4_PIX_INTRINSICS_3(u_1, u_last);
        v_1 = _mm_avg_epu8(v_1, v_2);
        AVERAGE_4_PIX_INTRINSICS_4(v_1, v_last);
        u_1 = _mm_or_si128(u_1, v_1);//U in the even bytes, V in the odd bytes
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), u_1 );
    }
    //The following fails if dst==u
    //hleft_vmid_subsample_and_interlace_2_line_c(dst, u, v, w&15, pitch, w>15?-1:0);
}
static __forceinline void hleft_vmid_mix_uv_yv12_c(byte* dst, int w, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[last_src_id+src_pitch]+1)/2;
    int last_sub = (src[last_src_id]+src[last_src_id+src_pitch]+1)/2;
    const BYTE* end = src + w;
    for(; src < end; src += 2, am += 2, dst++)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp1 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp1 + 1)/2;
        ia = (ia + last_alpha + 1)/2;

        if (ia!=0xff)
        {
            tmp1 = (src[0]+src[0+src_pitch]+1)/2;
            int tmp2 = (src[1]+src[1+src_pitch]+1)/2;
            last_sub = (last_sub+tmp2+1)/2;
            tmp1 = (tmp1+last_sub+1)/2;

            *dst = (((*dst)*ia)>>8) + tmp1;
        }
        else
        {
            last_sub = (src[1]+src[1+src_pitch]+1)/2;
        }
    }
}
//0<=w15<=15 && w15%2==0
static __forceinline void hleft_vmid_mix_uv_yv12_c2(byte* dst, int w15, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    ASSERT(w15>=0 && w15<=15 && (w15&1)==0 );

    int last_alpha = (am[last_src_id]+am[last_src_id+src_pitch]+1)/2;
    int last_sub = (src[last_src_id]+src[last_src_id+src_pitch]+1)/2;
    const BYTE* end = src + w15;

#define _hleft_vmid_mix_uv_yv12_c2_mix_2 \
    int ia = (am[0]+am[0+src_pitch]+1)/2;\
    int tmp1 = (am[1]+am[1+src_pitch]+1)/2;\
    last_alpha = (last_alpha + tmp1 + 1)/2;\
    ia = (ia + last_alpha + 1)/2;\
    if (ia!=0xff)\
    {\
        tmp1 = (src[0]+src[0+src_pitch]+1)/2;\
        int tmp2 = (src[1]+src[1+src_pitch]+1)/2;\
        last_sub = (last_sub+tmp2+1)/2;\
        tmp1 = (tmp1+last_sub+1)/2;\
        *dst= (((*dst)*ia)>>8) + tmp1;\
    }\
    else\
    {\
        last_sub = (src[1]+src[1+src_pitch]+1)/2;\
    }src += 2, am += 2, dst++

    if (src < end)
    { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_yv12_c2_mix_2; }
}
// am[last_src_id] valid && w&15==0
static __forceinline void hleft_vmid_mix_uv_yv12_sse2(byte* dst, int w00, const byte* src, const byte* am, int src_pitch, int last_src_id=0)
{
    ASSERT( (( (2*(int)dst) | w00 | (int)src | (int)am | src_pitch )&15)==0 );

    __m128i last_src = _mm_cvtsi32_si128( (src[last_src_id]+src[src_pitch+last_src_id]+1)<<7 );
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + (w00&~15);
    for(; src < end_mod16; src += 16, am += 16, dst+=8)
    {
        __m128i zero = _mm_setzero_si128();

        __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i tmp = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );
        alpha128_1 = _mm_avg_epu8(alpha128_1, tmp);
        AVERAGE_4_PIX_INTRINSICS_3(alpha128_1, last_alpha);

        __m128i dst128 = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(dst) );

        __m128i sub128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
        tmp = _mm_load_si128( reinterpret_cast<const __m128i*>(src+src_pitch) );
        sub128_1 = _mm_avg_epu8(sub128_1, tmp);
        AVERAGE_4_PIX_INTRINSICS_3(sub128_1, last_src);

        __m128i ones;
        ones = _mm_setzero_si128();//disable warning C4700

        ones = _mm_cmpeq_epi32(ones,ones);
        ones = _mm_cmpeq_epi8(ones, alpha128_1);

        dst128 = _mm_unpacklo_epi8(dst128, zero);
        __m128i dst128_2 = _mm_and_si128(dst128, ones);

        dst128 = _mm_mullo_epi16(dst128, alpha128_1);
        dst128 = _mm_adds_epu16(dst128, dst128_2);

        dst128 = _mm_srli_epi16(dst128, 8);

        dst128 = _mm_adds_epi16(dst128, sub128_1);
        dst128 = _mm_packus_epi16(dst128, dst128);

        _mm_storel_epi64( reinterpret_cast<__m128i*>(dst), dst128 );
    }
}
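// Illustration only (not part of the original header): one plausible way the
// mod-16 SSE2 kernel and the small-tail C2 helper combine to cover an arbitrary
// even width. The name hleft_vmid_mix_uv_yv12_row_sketch is hypothetical;
// alignment and the last_src_id convention follow the comments above.
static void hleft_vmid_mix_uv_yv12_row_sketch(byte* dst, int w, const byte* src, const byte* am, int src_pitch)
{
    int w00 = w & ~15;                       // bulk part, multiple of 16 source samples
    hleft_vmid_mix_uv_yv12_sse2(dst, w00, src, am, src_pitch);
    // remaining 0..14 samples; reuse the last bulk pair as the "left" neighbour
    hleft_vmid_mix_uv_yv12_c2(dst + w00/2, w & 15, src + w00, am + w00, src_pitch, w00 > 0 ? -1 : 0);
}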
static __forceinline void hleft_vmid_mix_uv_p010_c(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    const BYTE* end = src + w;
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    for(; src < end; src+=2, am+=2, dst_word+=2)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp2 + 1)/2;
        ia = (ia + last_alpha + 1)/2;

        if (ia!=0xff)
        {
            int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);
            tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);//if(tmp>0xffff) tmp = 0xffff;
            dst_word[0] = tmp;
            tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);
            tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);//if(tmp>0xffff) tmp = 0xffff;
            dst_word[1] = tmp;
        }
    }
}
static __forceinline void hleft_vmid_mix_uv_p010_c2(BYTE* dst, int w15, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT(w15>=0 && w15<=15 && (w15&1)==0 );
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    WORD* dst_word = reinterpret_cast<WORD*>(dst);
    const BYTE* end = src + w15;

#if 1 //clip to 0xffff, matching hleft_vmid_mix_uv_p010_c; switch to the empty variant to skip clipping
# define _hleft_vmid_mix_uv_p010_c2_CLIP(tmp) tmp ^= (tmp^0xffff)&((0xffff-tmp)>>31);/*if(tmp>0xffff) tmp = 0xffff;*/
#else
# define _hleft_vmid_mix_uv_p010_c2_CLIP(tmp)
#endif

#define _hleft_vmid_mix_uv_p010_c2_mix_2 \
    int ia = (am[0]+am[0+src_pitch]+1)/2;\
    int tmp2 = (am[1]+am[1+src_pitch]+1)/2;\
    last_alpha = (last_alpha + tmp2 + 1)/2;\
    ia = (ia + last_alpha + 1)/2;\
    if (ia!=0xff)\
    {\
        int tmp = (((dst_word[0])*ia)>>8) + (src[0]<<8);\
        _hleft_vmid_mix_uv_p010_c2_CLIP(tmp);\
        dst_word[0] = tmp;\
        tmp = (((dst_word[1])*ia)>>8) + (src[1]<<8);\
        _hleft_vmid_mix_uv_p010_c2_CLIP(tmp);\
        dst_word[1] = tmp;\
    } src+=2, am+=2, dst_word+=2

    if (src < end)
    { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_p010_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_p010_c2_mix_2; }
}
// am[last_src_id] valid && w&15==0
static __forceinline void hleft_vmid_mix_uv_p010_sse2(BYTE* dst, int w00, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT( (((int)dst | w00 | (int)src | (int)am | src_pitch)&15)==0 );
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + w00;
    for(; src < end_mod16; src+=16, am+=16, dst+=32)
    {
        __m128i alpha = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i alpha2 = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );

        __m128i src_y = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );
        __m128i dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );

        alpha = _mm_avg_epu8(alpha, alpha2);
        AVERAGE_4_PIX_INTRINSICS_5(alpha, last_alpha);

        __m128i alpha_ff;
        alpha_ff = _mm_setzero_si128();//disable warning C4700

        alpha_ff = _mm_cmpeq_epi32(alpha_ff,alpha_ff);

        alpha_ff = _mm_cmpeq_epi8(alpha_ff, alpha);

        __m128i lo = _mm_unpacklo_epi8(alpha_ff, alpha);//(alpha<<8)+0x100 will overflow
        //so we do it another way
        //first, (alpha<<8)+0xff
        __m128i ones = _mm_setzero_si128();
        ones = _mm_cmpeq_epi16(dst_y, ones);
        __m128i ones2;
        ones2 = _mm_setzero_si128();//disable warning C4700

        ones2 = _mm_cmpeq_epi32(ones2,ones2);
        ones = _mm_xor_si128(ones, ones2);
        ones = _mm_srli_epi16(ones, 15);
        ones = _mm_and_si128(ones, lo);

        dst_y = _mm_mulhi_epu16(dst_y, lo);
        dst_y = _mm_adds_epu16(dst_y, ones);//then add one if necessary

        lo = _mm_setzero_si128();
        lo = _mm_unpacklo_epi8(lo, src_y);
        dst_y = _mm_adds_epu16(dst_y, lo);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_y );

        dst_y = _mm_load_si128( reinterpret_cast<const __m128i*>(dst+16) );

        lo = _mm_unpackhi_epi8(alpha_ff, alpha);

        ones = _mm_setzero_si128();
        ones = _mm_cmpeq_epi16(dst_y, ones);
        ones = _mm_xor_si128(ones, ones2);
        ones = _mm_srli_epi16(ones, 15);
        ones = _mm_and_si128(ones, lo);

        dst_y = _mm_mulhi_epu16(dst_y, lo);
        dst_y = _mm_adds_epu16(dst_y, ones);

        lo = _mm_setzero_si128();
        lo = _mm_unpackhi_epi8(lo, src_y);
        dst_y = _mm_adds_epu16(dst_y, lo);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst+16), dst_y );
    }
}
static __forceinline void hleft_vmid_mix_uv_nv12_c(BYTE* dst, int w, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    const BYTE* end = src + w;
    for(; src < end; src+=2, am+=2, dst+=2)
    {
        int ia = (am[0]+am[0+src_pitch]+1)/2;
        int tmp2 = (am[1]+am[1+src_pitch]+1)/2;
        last_alpha = (last_alpha + tmp2 + 1)/2;
        ia = (ia + last_alpha + 1)/2;

        if (ia!=0xff)
        {
            dst[0] = (((dst[0])*ia)>>8) + src[0];
            dst[1] = (((dst[1])*ia)>>8) + src[1];
        }
    }
}
static __forceinline void hleft_vmid_mix_uv_nv12_c2(BYTE* dst, int w15, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT(w15>=0 && w15<=15 && (w15&1)==0 );
    int last_alpha = (am[last_src_id]+am[src_pitch+last_src_id]+1)/2;
    const BYTE* end = src + w15;

#define _hleft_vmid_mix_uv_nv12_c2_mix_2 \
    int ia = (am[0]+am[0+src_pitch]+1)/2;\
    int tmp2 = (am[1]+am[1+src_pitch]+1)/2;\
    last_alpha = (last_alpha + tmp2 + 1)/2;\
    ia = (ia + last_alpha + 1)/2;\
    if (ia!=0xff)\
    {\
        dst[0] = (((dst[0])*ia)>>8) + src[0];\
        dst[1] = (((dst[1])*ia)>>8) + src[1];\
    }\
    src+=2, am+=2, dst+=2

    if (src < end)
    { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
    if (src < end)
    { _hleft_vmid_mix_uv_nv12_c2_mix_2; }
}
// am[last_src_id] valid && w&15==0
static __forceinline void hleft_vmid_mix_uv_nv12_sse2(BYTE* dst, int w00, const BYTE* src, const BYTE* am, int src_pitch, int last_src_id=0)
{
    ASSERT( (((int)dst | w00 | (int)src | (int)am | src_pitch)&15)==0 );
    __m128i last_alpha = _mm_cvtsi32_si128( (am[last_src_id]+am[src_pitch+last_src_id]+1)<<7 );
    const BYTE* end_mod16 = src + w00;
    for(; src < end_mod16; src+=16, am+=16, dst+=16)
    {
        __m128i dst128 = _mm_load_si128( reinterpret_cast<const __m128i*>(dst) );
        __m128i alpha128_1 = _mm_load_si128( reinterpret_cast<const __m128i*>(am) );
        __m128i alpha128_2 = _mm_load_si128( reinterpret_cast<const __m128i*>(am+src_pitch) );
        __m128i sub128 = _mm_load_si128( reinterpret_cast<const __m128i*>(src) );

        alpha128_1 = _mm_avg_epu8(alpha128_1, alpha128_2);
        AVERAGE_4_PIX_INTRINSICS_5(alpha128_1, last_alpha);

        __m128i zero = _mm_setzero_si128();

        __m128i ones;
        ones = _mm_setzero_si128();//disable warning C4700

        ones = _mm_cmpeq_epi32(ones,ones);
        ones = _mm_cmpeq_epi8(ones,alpha128_1);

        __m128i dst_lo128 = _mm_unpacklo_epi8(dst128, zero);
        alpha128_2 = _mm_unpacklo_epi8(alpha128_1, zero);

        __m128i ones2 = _mm_unpacklo_epi8(ones, zero);

        dst_lo128 = _mm_mullo_epi16(dst_lo128, alpha128_2);
        dst_lo128 = _mm_adds_epu16(dst_lo128, ones2);
        dst_lo128 = _mm_srli_epi16(dst_lo128, 8);

        dst128 = _mm_unpackhi_epi8(dst128, zero);
        alpha128_1 = _mm_unpackhi_epi8(alpha128_1, zero);

        ones2 = _mm_unpackhi_epi8(ones, zero);

        dst128 = _mm_mullo_epi16(dst128, alpha128_1);
        dst128 = _mm_adds_epu16(dst128, ones2);
        dst128 = _mm_srli_epi16(dst128, 8);
        dst_lo128 = _mm_packus_epi16(dst_lo128, dst128);

        dst_lo128 = _mm_adds_epu8(dst_lo128, sub128);
        _mm_store_si128( reinterpret_cast<__m128i*>(dst), dst_lo128 );
    }
}
#endif // __XY_INTRINSICS_D66EF42F_67BC_47F4_A70D_40F1AB80F376_H__