libavcodec/i386/dsputilenc_mmx.c
1 /*
2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23 */
25 #include "libavutil/x86_cpu.h"
26 #include "libavcodec/dsputil.h"
27 #include "libavcodec/mpegvideo.h"
28 #include "dsputil_mmx.h"
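/* get_pixels_mmx(): read an 8x8 block of unsigned bytes and widen it to 64
 * 16-bit DCTELEMs in block[], two source rows per loop iteration.  An
 * illustrative C reference (hypothetical name, not part of the build):
 *
 *     static void get_pixels_ref(DCTELEM *block, const uint8_t *pixels, int line_size)
 *     {
 *         for (int i = 0; i < 8; i++, pixels += line_size)
 *             for (int j = 0; j < 8; j++)
 *                 block[i*8 + j] = pixels[j];
 *     }
 */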
31 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
33 asm volatile(
34 "mov $-128, %%"REG_a" \n\t"
35 "pxor %%mm7, %%mm7 \n\t"
36 ASMALIGN(4)
37 "1: \n\t"
38 "movq (%0), %%mm0 \n\t"
39 "movq (%0, %2), %%mm2 \n\t"
40 "movq %%mm0, %%mm1 \n\t"
41 "movq %%mm2, %%mm3 \n\t"
42 "punpcklbw %%mm7, %%mm0 \n\t"
43 "punpckhbw %%mm7, %%mm1 \n\t"
44 "punpcklbw %%mm7, %%mm2 \n\t"
45 "punpckhbw %%mm7, %%mm3 \n\t"
46 "movq %%mm0, (%1, %%"REG_a") \n\t"
47 "movq %%mm1, 8(%1, %%"REG_a") \n\t"
48 "movq %%mm2, 16(%1, %%"REG_a") \n\t"
49 "movq %%mm3, 24(%1, %%"REG_a") \n\t"
50 "add %3, %0 \n\t"
51 "add $32, %%"REG_a" \n\t"
52 "js 1b \n\t"
53 : "+r" (pixels)
54 : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
55 : "%"REG_a
59 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
61 asm volatile(
62 "pxor %%mm7, %%mm7 \n\t"
63 "mov $-128, %%"REG_a" \n\t"
64 ASMALIGN(4)
65 "1: \n\t"
66 "movq (%0), %%mm0 \n\t"
67 "movq (%1), %%mm2 \n\t"
68 "movq %%mm0, %%mm1 \n\t"
69 "movq %%mm2, %%mm3 \n\t"
70 "punpcklbw %%mm7, %%mm0 \n\t"
71 "punpckhbw %%mm7, %%mm1 \n\t"
72 "punpcklbw %%mm7, %%mm2 \n\t"
73 "punpckhbw %%mm7, %%mm3 \n\t"
74 "psubw %%mm2, %%mm0 \n\t"
75 "psubw %%mm3, %%mm1 \n\t"
76 "movq %%mm0, (%2, %%"REG_a") \n\t"
77 "movq %%mm1, 8(%2, %%"REG_a") \n\t"
78 "add %3, %0 \n\t"
79 "add %3, %1 \n\t"
80 "add $16, %%"REG_a" \n\t"
81 "jnz 1b \n\t"
82 : "+r" (s1), "+r" (s2)
83 : "r" (block+64), "r" ((x86_reg)stride)
84 : "%"REG_a
88 static int pix_sum16_mmx(uint8_t * pix, int line_size){
89 const int h=16;
90 int sum;
91 x86_reg index= -line_size*h;
93 asm volatile(
94 "pxor %%mm7, %%mm7 \n\t"
95 "pxor %%mm6, %%mm6 \n\t"
96 "1: \n\t"
97 "movq (%2, %1), %%mm0 \n\t"
98 "movq (%2, %1), %%mm1 \n\t"
99 "movq 8(%2, %1), %%mm2 \n\t"
100 "movq 8(%2, %1), %%mm3 \n\t"
101 "punpcklbw %%mm7, %%mm0 \n\t"
102 "punpckhbw %%mm7, %%mm1 \n\t"
103 "punpcklbw %%mm7, %%mm2 \n\t"
104 "punpckhbw %%mm7, %%mm3 \n\t"
105 "paddw %%mm0, %%mm1 \n\t"
106 "paddw %%mm2, %%mm3 \n\t"
107 "paddw %%mm1, %%mm3 \n\t"
108 "paddw %%mm3, %%mm6 \n\t"
109 "add %3, %1 \n\t"
110 " js 1b \n\t"
111 "movq %%mm6, %%mm5 \n\t"
112 "psrlq $32, %%mm6 \n\t"
113 "paddw %%mm5, %%mm6 \n\t"
114 "movq %%mm6, %%mm5 \n\t"
115 "psrlq $16, %%mm6 \n\t"
116 "paddw %%mm5, %%mm6 \n\t"
117 "movd %%mm6, %0 \n\t"
118 "andl $0xFFFF, %0 \n\t"
119 : "=&r" (sum), "+r" (index)
120 : "r" (pix - index), "r" ((x86_reg)line_size)
123 return sum;
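/* pix_norm1_mmx(): sum of squared pixel values (the "energy") of a 16x16
 * block, accumulated in 32 bits via pmaddwd.  In scalar terms (sketch only):
 *
 *     sum += pix[x] * pix[x];   // over all 16x16 positions
 */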
126 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
127 int tmp;
128 asm volatile (
129 "movl $16,%%ecx\n"
130 "pxor %%mm0,%%mm0\n"
131 "pxor %%mm7,%%mm7\n"
132 "1:\n"
133 "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
134 "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
136 "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
138 "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
139 "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
141 "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
142 "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
143 "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
145 "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
146 "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
148 "pmaddwd %%mm3,%%mm3\n"
149 "pmaddwd %%mm4,%%mm4\n"
151 "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
152 pix2^2+pix3^2+pix6^2+pix7^2) */
153 "paddd %%mm3,%%mm4\n"
154 "paddd %%mm2,%%mm7\n"
156 "add %2, %0\n"
157 "paddd %%mm4,%%mm7\n"
158 "dec %%ecx\n"
159 "jnz 1b\n"
161 "movq %%mm7,%%mm1\n"
162 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
163 "paddd %%mm7,%%mm1\n"
164 "movd %%mm1,%1\n"
165 : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
166 return tmp;
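/* sse8_mmx() / sse16_mmx() / sse16_sse2(): sum of squared errors between two
 * blocks 8 or 16 pixels wide and h rows high.  The byte absolute difference
 * is built from two saturated subtractions OR'ed together, then squared and
 * accumulated with pmaddwd.  Illustrative C reference (hypothetical name):
 *
 *     static int sse_ref(const uint8_t *pix1, const uint8_t *pix2, int line_size, int w, int h)
 *     {
 *         int sum = 0;
 *         for (int y = 0; y < h; y++, pix1 += line_size, pix2 += line_size)
 *             for (int x = 0; x < w; x++) {
 *                 int d = pix1[x] - pix2[x];
 *                 sum += d * d;
 *             }
 *         return sum;
 *     }
 */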
169 static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
170 int tmp;
171 asm volatile (
172 "movl %4,%%ecx\n"
173 "shr $1,%%ecx\n"
174 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
175 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
176 "1:\n"
177 "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
178 "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
179 "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
180 "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
182 /* todo: mm1-mm2, mm3-mm4 */
183 /* algo: subtract mm1 from mm2 with saturation and vice versa */
184 /* OR the results to get absolute difference */
185 "movq %%mm1,%%mm5\n"
186 "movq %%mm3,%%mm6\n"
187 "psubusb %%mm2,%%mm1\n"
188 "psubusb %%mm4,%%mm3\n"
189 "psubusb %%mm5,%%mm2\n"
190 "psubusb %%mm6,%%mm4\n"
192 "por %%mm1,%%mm2\n"
193 "por %%mm3,%%mm4\n"
195 /* now convert to 16-bit vectors so we can square them */
196 "movq %%mm2,%%mm1\n"
197 "movq %%mm4,%%mm3\n"
199 "punpckhbw %%mm0,%%mm2\n"
200 "punpckhbw %%mm0,%%mm4\n"
201 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
202 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
204 "pmaddwd %%mm2,%%mm2\n"
205 "pmaddwd %%mm4,%%mm4\n"
206 "pmaddwd %%mm1,%%mm1\n"
207 "pmaddwd %%mm3,%%mm3\n"
209 "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
210 "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
212 "paddd %%mm2,%%mm1\n"
213 "paddd %%mm4,%%mm3\n"
214 "paddd %%mm1,%%mm7\n"
215 "paddd %%mm3,%%mm7\n"
217 "decl %%ecx\n"
218 "jnz 1b\n"
220 "movq %%mm7,%%mm1\n"
221 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
222 "paddd %%mm7,%%mm1\n"
223 "movd %%mm1,%2\n"
224 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
225 : "r" ((x86_reg)line_size) , "m" (h)
226 : "%ecx");
227 return tmp;
230 static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
231 int tmp;
232 asm volatile (
233 "movl %4,%%ecx\n"
234 "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
235 "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
236 "1:\n"
237 "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
238 "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
239 "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
240 "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
242 /* todo: mm1-mm2, mm3-mm4 */
243 /* algo: subtract mm1 from mm2 with saturation and vice versa */
244 /* OR the results to get absolute difference */
245 "movq %%mm1,%%mm5\n"
246 "movq %%mm3,%%mm6\n"
247 "psubusb %%mm2,%%mm1\n"
248 "psubusb %%mm4,%%mm3\n"
249 "psubusb %%mm5,%%mm2\n"
250 "psubusb %%mm6,%%mm4\n"
252 "por %%mm1,%%mm2\n"
253 "por %%mm3,%%mm4\n"
255 /* now convert to 16-bit vectors so we can square them */
256 "movq %%mm2,%%mm1\n"
257 "movq %%mm4,%%mm3\n"
259 "punpckhbw %%mm0,%%mm2\n"
260 "punpckhbw %%mm0,%%mm4\n"
261 "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
262 "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
264 "pmaddwd %%mm2,%%mm2\n"
265 "pmaddwd %%mm4,%%mm4\n"
266 "pmaddwd %%mm1,%%mm1\n"
267 "pmaddwd %%mm3,%%mm3\n"
269 "add %3,%0\n"
270 "add %3,%1\n"
272 "paddd %%mm2,%%mm1\n"
273 "paddd %%mm4,%%mm3\n"
274 "paddd %%mm1,%%mm7\n"
275 "paddd %%mm3,%%mm7\n"
277 "decl %%ecx\n"
278 "jnz 1b\n"
280 "movq %%mm7,%%mm1\n"
281 "psrlq $32, %%mm7\n" /* shift hi dword to lo */
282 "paddd %%mm7,%%mm1\n"
283 "movd %%mm1,%2\n"
284 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
285 : "r" ((x86_reg)line_size) , "m" (h)
286 : "%ecx");
287 return tmp;
290 static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
291 int tmp;
292 asm volatile (
293 "shr $1,%2\n"
294 "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
295 "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
296 "1:\n"
297 "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
298 "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
299 "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
300 "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
302 /* todo: mm1-mm2, mm3-mm4 */
303 /* algo: subtract mm1 from mm2 with saturation and vice versa */
304 /* OR the results to get absolute difference */
305 "movdqa %%xmm1,%%xmm5\n"
306 "movdqa %%xmm3,%%xmm6\n"
307 "psubusb %%xmm2,%%xmm1\n"
308 "psubusb %%xmm4,%%xmm3\n"
309 "psubusb %%xmm5,%%xmm2\n"
310 "psubusb %%xmm6,%%xmm4\n"
312 "por %%xmm1,%%xmm2\n"
313 "por %%xmm3,%%xmm4\n"
315 /* now convert to 16-bit vectors so we can square them */
316 "movdqa %%xmm2,%%xmm1\n"
317 "movdqa %%xmm4,%%xmm3\n"
319 "punpckhbw %%xmm0,%%xmm2\n"
320 "punpckhbw %%xmm0,%%xmm4\n"
321 "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
322 "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
324 "pmaddwd %%xmm2,%%xmm2\n"
325 "pmaddwd %%xmm4,%%xmm4\n"
326 "pmaddwd %%xmm1,%%xmm1\n"
327 "pmaddwd %%xmm3,%%xmm3\n"
329 "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
330 "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
332 "paddd %%xmm2,%%xmm1\n"
333 "paddd %%xmm4,%%xmm3\n"
334 "paddd %%xmm1,%%xmm7\n"
335 "paddd %%xmm3,%%xmm7\n"
337 "decl %2\n"
338 "jnz 1b\n"
340 "movdqa %%xmm7,%%xmm1\n"
341 "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
342 "paddd %%xmm1,%%xmm7\n"
343 "movdqa %%xmm7,%%xmm1\n"
344 "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
345 "paddd %%xmm1,%%xmm7\n"
346 "movd %%xmm7,%3\n"
347 : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
348 : "r" ((x86_reg)line_size));
349 return tmp;
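/* hf_noise8_mmx() / hf_noise16_mmx(): high-frequency "noise" measure used by
 * the NSSE comparison below.  Each row's horizontal differences
 * pix[x] - pix[x+1] are compared against the row above, and the absolute
 * change is accumulated, roughly (sketch only):
 *
 *     sum += FFABS((pix[y][x] - pix[y][x+1]) - (pix[y-1][x] - pix[y-1][x+1]));
 *
 * The absolute value is formed with the pcmpgtw/pxor/psubw trick.
 * hf_noise16_mmx() covers the left half with unaligned loads and adds
 * hf_noise8_mmx() of the right half.
 */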
352 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
353 int tmp;
354 asm volatile (
355 "movl %3,%%ecx\n"
356 "pxor %%mm7,%%mm7\n"
357 "pxor %%mm6,%%mm6\n"
359 "movq (%0),%%mm0\n"
360 "movq %%mm0, %%mm1\n"
361 "psllq $8, %%mm0\n"
362 "psrlq $8, %%mm1\n"
363 "psrlq $8, %%mm0\n"
364 "movq %%mm0, %%mm2\n"
365 "movq %%mm1, %%mm3\n"
366 "punpcklbw %%mm7,%%mm0\n"
367 "punpcklbw %%mm7,%%mm1\n"
368 "punpckhbw %%mm7,%%mm2\n"
369 "punpckhbw %%mm7,%%mm3\n"
370 "psubw %%mm1, %%mm0\n"
371 "psubw %%mm3, %%mm2\n"
373 "add %2,%0\n"
375 "movq (%0),%%mm4\n"
376 "movq %%mm4, %%mm1\n"
377 "psllq $8, %%mm4\n"
378 "psrlq $8, %%mm1\n"
379 "psrlq $8, %%mm4\n"
380 "movq %%mm4, %%mm5\n"
381 "movq %%mm1, %%mm3\n"
382 "punpcklbw %%mm7,%%mm4\n"
383 "punpcklbw %%mm7,%%mm1\n"
384 "punpckhbw %%mm7,%%mm5\n"
385 "punpckhbw %%mm7,%%mm3\n"
386 "psubw %%mm1, %%mm4\n"
387 "psubw %%mm3, %%mm5\n"
388 "psubw %%mm4, %%mm0\n"
389 "psubw %%mm5, %%mm2\n"
390 "pxor %%mm3, %%mm3\n"
391 "pxor %%mm1, %%mm1\n"
392 "pcmpgtw %%mm0, %%mm3\n\t"
393 "pcmpgtw %%mm2, %%mm1\n\t"
394 "pxor %%mm3, %%mm0\n"
395 "pxor %%mm1, %%mm2\n"
396 "psubw %%mm3, %%mm0\n"
397 "psubw %%mm1, %%mm2\n"
398 "paddw %%mm0, %%mm2\n"
399 "paddw %%mm2, %%mm6\n"
401 "add %2,%0\n"
402 "1:\n"
404 "movq (%0),%%mm0\n"
405 "movq %%mm0, %%mm1\n"
406 "psllq $8, %%mm0\n"
407 "psrlq $8, %%mm1\n"
408 "psrlq $8, %%mm0\n"
409 "movq %%mm0, %%mm2\n"
410 "movq %%mm1, %%mm3\n"
411 "punpcklbw %%mm7,%%mm0\n"
412 "punpcklbw %%mm7,%%mm1\n"
413 "punpckhbw %%mm7,%%mm2\n"
414 "punpckhbw %%mm7,%%mm3\n"
415 "psubw %%mm1, %%mm0\n"
416 "psubw %%mm3, %%mm2\n"
417 "psubw %%mm0, %%mm4\n"
418 "psubw %%mm2, %%mm5\n"
419 "pxor %%mm3, %%mm3\n"
420 "pxor %%mm1, %%mm1\n"
421 "pcmpgtw %%mm4, %%mm3\n\t"
422 "pcmpgtw %%mm5, %%mm1\n\t"
423 "pxor %%mm3, %%mm4\n"
424 "pxor %%mm1, %%mm5\n"
425 "psubw %%mm3, %%mm4\n"
426 "psubw %%mm1, %%mm5\n"
427 "paddw %%mm4, %%mm5\n"
428 "paddw %%mm5, %%mm6\n"
430 "add %2,%0\n"
432 "movq (%0),%%mm4\n"
433 "movq %%mm4, %%mm1\n"
434 "psllq $8, %%mm4\n"
435 "psrlq $8, %%mm1\n"
436 "psrlq $8, %%mm4\n"
437 "movq %%mm4, %%mm5\n"
438 "movq %%mm1, %%mm3\n"
439 "punpcklbw %%mm7,%%mm4\n"
440 "punpcklbw %%mm7,%%mm1\n"
441 "punpckhbw %%mm7,%%mm5\n"
442 "punpckhbw %%mm7,%%mm3\n"
443 "psubw %%mm1, %%mm4\n"
444 "psubw %%mm3, %%mm5\n"
445 "psubw %%mm4, %%mm0\n"
446 "psubw %%mm5, %%mm2\n"
447 "pxor %%mm3, %%mm3\n"
448 "pxor %%mm1, %%mm1\n"
449 "pcmpgtw %%mm0, %%mm3\n\t"
450 "pcmpgtw %%mm2, %%mm1\n\t"
451 "pxor %%mm3, %%mm0\n"
452 "pxor %%mm1, %%mm2\n"
453 "psubw %%mm3, %%mm0\n"
454 "psubw %%mm1, %%mm2\n"
455 "paddw %%mm0, %%mm2\n"
456 "paddw %%mm2, %%mm6\n"
458 "add %2,%0\n"
459 "subl $2, %%ecx\n"
460 " jnz 1b\n"
462 "movq %%mm6, %%mm0\n"
463 "punpcklwd %%mm7,%%mm0\n"
464 "punpckhwd %%mm7,%%mm6\n"
465 "paddd %%mm0, %%mm6\n"
467 "movq %%mm6,%%mm0\n"
468 "psrlq $32, %%mm6\n"
469 "paddd %%mm6,%%mm0\n"
470 "movd %%mm0,%1\n"
471 : "+r" (pix1), "=r"(tmp)
472 : "r" ((x86_reg)line_size) , "g" (h-2)
473 : "%ecx");
474 return tmp;
477 static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
478 int tmp;
479 uint8_t * pix= pix1;
480 asm volatile (
481 "movl %3,%%ecx\n"
482 "pxor %%mm7,%%mm7\n"
483 "pxor %%mm6,%%mm6\n"
485 "movq (%0),%%mm0\n"
486 "movq 1(%0),%%mm1\n"
487 "movq %%mm0, %%mm2\n"
488 "movq %%mm1, %%mm3\n"
489 "punpcklbw %%mm7,%%mm0\n"
490 "punpcklbw %%mm7,%%mm1\n"
491 "punpckhbw %%mm7,%%mm2\n"
492 "punpckhbw %%mm7,%%mm3\n"
493 "psubw %%mm1, %%mm0\n"
494 "psubw %%mm3, %%mm2\n"
496 "add %2,%0\n"
498 "movq (%0),%%mm4\n"
499 "movq 1(%0),%%mm1\n"
500 "movq %%mm4, %%mm5\n"
501 "movq %%mm1, %%mm3\n"
502 "punpcklbw %%mm7,%%mm4\n"
503 "punpcklbw %%mm7,%%mm1\n"
504 "punpckhbw %%mm7,%%mm5\n"
505 "punpckhbw %%mm7,%%mm3\n"
506 "psubw %%mm1, %%mm4\n"
507 "psubw %%mm3, %%mm5\n"
508 "psubw %%mm4, %%mm0\n"
509 "psubw %%mm5, %%mm2\n"
510 "pxor %%mm3, %%mm3\n"
511 "pxor %%mm1, %%mm1\n"
512 "pcmpgtw %%mm0, %%mm3\n\t"
513 "pcmpgtw %%mm2, %%mm1\n\t"
514 "pxor %%mm3, %%mm0\n"
515 "pxor %%mm1, %%mm2\n"
516 "psubw %%mm3, %%mm0\n"
517 "psubw %%mm1, %%mm2\n"
518 "paddw %%mm0, %%mm2\n"
519 "paddw %%mm2, %%mm6\n"
521 "add %2,%0\n"
522 "1:\n"
524 "movq (%0),%%mm0\n"
525 "movq 1(%0),%%mm1\n"
526 "movq %%mm0, %%mm2\n"
527 "movq %%mm1, %%mm3\n"
528 "punpcklbw %%mm7,%%mm0\n"
529 "punpcklbw %%mm7,%%mm1\n"
530 "punpckhbw %%mm7,%%mm2\n"
531 "punpckhbw %%mm7,%%mm3\n"
532 "psubw %%mm1, %%mm0\n"
533 "psubw %%mm3, %%mm2\n"
534 "psubw %%mm0, %%mm4\n"
535 "psubw %%mm2, %%mm5\n"
536 "pxor %%mm3, %%mm3\n"
537 "pxor %%mm1, %%mm1\n"
538 "pcmpgtw %%mm4, %%mm3\n\t"
539 "pcmpgtw %%mm5, %%mm1\n\t"
540 "pxor %%mm3, %%mm4\n"
541 "pxor %%mm1, %%mm5\n"
542 "psubw %%mm3, %%mm4\n"
543 "psubw %%mm1, %%mm5\n"
544 "paddw %%mm4, %%mm5\n"
545 "paddw %%mm5, %%mm6\n"
547 "add %2,%0\n"
549 "movq (%0),%%mm4\n"
550 "movq 1(%0),%%mm1\n"
551 "movq %%mm4, %%mm5\n"
552 "movq %%mm1, %%mm3\n"
553 "punpcklbw %%mm7,%%mm4\n"
554 "punpcklbw %%mm7,%%mm1\n"
555 "punpckhbw %%mm7,%%mm5\n"
556 "punpckhbw %%mm7,%%mm3\n"
557 "psubw %%mm1, %%mm4\n"
558 "psubw %%mm3, %%mm5\n"
559 "psubw %%mm4, %%mm0\n"
560 "psubw %%mm5, %%mm2\n"
561 "pxor %%mm3, %%mm3\n"
562 "pxor %%mm1, %%mm1\n"
563 "pcmpgtw %%mm0, %%mm3\n\t"
564 "pcmpgtw %%mm2, %%mm1\n\t"
565 "pxor %%mm3, %%mm0\n"
566 "pxor %%mm1, %%mm2\n"
567 "psubw %%mm3, %%mm0\n"
568 "psubw %%mm1, %%mm2\n"
569 "paddw %%mm0, %%mm2\n"
570 "paddw %%mm2, %%mm6\n"
572 "add %2,%0\n"
573 "subl $2, %%ecx\n"
574 " jnz 1b\n"
576 "movq %%mm6, %%mm0\n"
577 "punpcklwd %%mm7,%%mm0\n"
578 "punpckhwd %%mm7,%%mm6\n"
579 "paddd %%mm0, %%mm6\n"
581 "movq %%mm6,%%mm0\n"
582 "psrlq $32, %%mm6\n"
583 "paddd %%mm6,%%mm0\n"
584 "movd %%mm0,%1\n"
585 : "+r" (pix1), "=r"(tmp)
586 : "r" ((x86_reg)line_size) , "g" (h-2)
587 : "%ecx");
588 return tmp + hf_noise8_mmx(pix+8, line_size, h);
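/* nsse16_mmx() / nsse8_mmx(): "noise preserving" SSE comparison.  The score is
 * the plain SSE plus a penalty for changing the amount of high-frequency
 * detail:
 *
 *     score = sse(pix1, pix2) + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 *
 * so candidates that smooth away (or invent) texture cost extra even when
 * their squared error is comparable.
 */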
591 static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
592 MpegEncContext *c = p;
593 int score1, score2;
595 if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
596 else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
597 score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
599 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
600 else return score1 + FFABS(score2)*8;
603 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
604 MpegEncContext *c = p;
605 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
606 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
608 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
609 else return score1 + FFABS(score2)*8;
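/* vsad_intra16_mmx(): sum of absolute differences between vertically adjacent
 * rows of one 16-pixel-wide block, roughly the sum of |pix[y][x] - pix[y-1][x]|.
 * vsad16_mmx() applies the same measure to the difference pix1 - pix2 (biased
 * by 0x80 to stay in byte range), and the *_mmx2 versions use psadbw to do
 * the absolute-difference accumulation in a single instruction. */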
612 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
613 int tmp;
615 assert( (((int)pix) & 7) == 0);
616 assert((line_size &7) ==0);
618 #define SUM(in0, in1, out0, out1) \
619 "movq (%0), %%mm2\n"\
620 "movq 8(%0), %%mm3\n"\
621 "add %2,%0\n"\
622 "movq %%mm2, " #out0 "\n"\
623 "movq %%mm3, " #out1 "\n"\
624 "psubusb " #in0 ", %%mm2\n"\
625 "psubusb " #in1 ", %%mm3\n"\
626 "psubusb " #out0 ", " #in0 "\n"\
627 "psubusb " #out1 ", " #in1 "\n"\
628 "por %%mm2, " #in0 "\n"\
629 "por %%mm3, " #in1 "\n"\
630 "movq " #in0 ", %%mm2\n"\
631 "movq " #in1 ", %%mm3\n"\
632 "punpcklbw %%mm7, " #in0 "\n"\
633 "punpcklbw %%mm7, " #in1 "\n"\
634 "punpckhbw %%mm7, %%mm2\n"\
635 "punpckhbw %%mm7, %%mm3\n"\
636 "paddw " #in1 ", " #in0 "\n"\
637 "paddw %%mm3, %%mm2\n"\
638 "paddw %%mm2, " #in0 "\n"\
639 "paddw " #in0 ", %%mm6\n"
642 asm volatile (
643 "movl %3,%%ecx\n"
644 "pxor %%mm6,%%mm6\n"
645 "pxor %%mm7,%%mm7\n"
646 "movq (%0),%%mm0\n"
647 "movq 8(%0),%%mm1\n"
648 "add %2,%0\n"
649 "jmp 2f\n"
650 "1:\n"
652 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
653 "2:\n"
654 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
656 "subl $2, %%ecx\n"
657 "jnz 1b\n"
659 "movq %%mm6,%%mm0\n"
660 "psrlq $32, %%mm6\n"
661 "paddw %%mm6,%%mm0\n"
662 "movq %%mm0,%%mm6\n"
663 "psrlq $16, %%mm0\n"
664 "paddw %%mm6,%%mm0\n"
665 "movd %%mm0,%1\n"
666 : "+r" (pix), "=r"(tmp)
667 : "r" ((x86_reg)line_size) , "m" (h)
668 : "%ecx");
669 return tmp & 0xFFFF;
671 #undef SUM
673 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
674 int tmp;
676 assert( (((int)pix) & 7) == 0);
677 assert((line_size &7) ==0);
679 #define SUM(in0, in1, out0, out1) \
680 "movq (%0), " #out0 "\n"\
681 "movq 8(%0), " #out1 "\n"\
682 "add %2,%0\n"\
683 "psadbw " #out0 ", " #in0 "\n"\
684 "psadbw " #out1 ", " #in1 "\n"\
685 "paddw " #in1 ", " #in0 "\n"\
686 "paddw " #in0 ", %%mm6\n"
688 asm volatile (
689 "movl %3,%%ecx\n"
690 "pxor %%mm6,%%mm6\n"
691 "pxor %%mm7,%%mm7\n"
692 "movq (%0),%%mm0\n"
693 "movq 8(%0),%%mm1\n"
694 "add %2,%0\n"
695 "jmp 2f\n"
696 "1:\n"
698 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
699 "2:\n"
700 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
702 "subl $2, %%ecx\n"
703 "jnz 1b\n"
705 "movd %%mm6,%1\n"
706 : "+r" (pix), "=r"(tmp)
707 : "r" ((x86_reg)line_size) , "m" (h)
708 : "%ecx");
709 return tmp;
711 #undef SUM
713 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
714 int tmp;
716 assert( (((int)pix1) & 7) == 0);
717 assert( (((int)pix2) & 7) == 0);
718 assert((line_size &7) ==0);
720 #define SUM(in0, in1, out0, out1) \
721 "movq (%0),%%mm2\n"\
722 "movq (%1)," #out0 "\n"\
723 "movq 8(%0),%%mm3\n"\
724 "movq 8(%1)," #out1 "\n"\
725 "add %3,%0\n"\
726 "add %3,%1\n"\
727 "psubb " #out0 ", %%mm2\n"\
728 "psubb " #out1 ", %%mm3\n"\
729 "pxor %%mm7, %%mm2\n"\
730 "pxor %%mm7, %%mm3\n"\
731 "movq %%mm2, " #out0 "\n"\
732 "movq %%mm3, " #out1 "\n"\
733 "psubusb " #in0 ", %%mm2\n"\
734 "psubusb " #in1 ", %%mm3\n"\
735 "psubusb " #out0 ", " #in0 "\n"\
736 "psubusb " #out1 ", " #in1 "\n"\
737 "por %%mm2, " #in0 "\n"\
738 "por %%mm3, " #in1 "\n"\
739 "movq " #in0 ", %%mm2\n"\
740 "movq " #in1 ", %%mm3\n"\
741 "punpcklbw %%mm7, " #in0 "\n"\
742 "punpcklbw %%mm7, " #in1 "\n"\
743 "punpckhbw %%mm7, %%mm2\n"\
744 "punpckhbw %%mm7, %%mm3\n"\
745 "paddw " #in1 ", " #in0 "\n"\
746 "paddw %%mm3, %%mm2\n"\
747 "paddw %%mm2, " #in0 "\n"\
748 "paddw " #in0 ", %%mm6\n"
751 asm volatile (
752 "movl %4,%%ecx\n"
753 "pxor %%mm6,%%mm6\n"
754 "pcmpeqw %%mm7,%%mm7\n"
755 "psllw $15, %%mm7\n"
756 "packsswb %%mm7, %%mm7\n"
757 "movq (%0),%%mm0\n"
758 "movq (%1),%%mm2\n"
759 "movq 8(%0),%%mm1\n"
760 "movq 8(%1),%%mm3\n"
761 "add %3,%0\n"
762 "add %3,%1\n"
763 "psubb %%mm2, %%mm0\n"
764 "psubb %%mm3, %%mm1\n"
765 "pxor %%mm7, %%mm0\n"
766 "pxor %%mm7, %%mm1\n"
767 "jmp 2f\n"
768 "1:\n"
770 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
771 "2:\n"
772 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
774 "subl $2, %%ecx\n"
775 "jnz 1b\n"
777 "movq %%mm6,%%mm0\n"
778 "psrlq $32, %%mm6\n"
779 "paddw %%mm6,%%mm0\n"
780 "movq %%mm0,%%mm6\n"
781 "psrlq $16, %%mm0\n"
782 "paddw %%mm6,%%mm0\n"
783 "movd %%mm0,%2\n"
784 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
785 : "r" ((x86_reg)line_size) , "m" (h)
786 : "%ecx");
787 return tmp & 0x7FFF;
789 #undef SUM
791 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
792 int tmp;
794 assert( (((int)pix1) & 7) == 0);
795 assert( (((int)pix2) & 7) == 0);
796 assert((line_size &7) ==0);
798 #define SUM(in0, in1, out0, out1) \
799 "movq (%0)," #out0 "\n"\
800 "movq (%1),%%mm2\n"\
801 "movq 8(%0)," #out1 "\n"\
802 "movq 8(%1),%%mm3\n"\
803 "add %3,%0\n"\
804 "add %3,%1\n"\
805 "psubb %%mm2, " #out0 "\n"\
806 "psubb %%mm3, " #out1 "\n"\
807 "pxor %%mm7, " #out0 "\n"\
808 "pxor %%mm7, " #out1 "\n"\
809 "psadbw " #out0 ", " #in0 "\n"\
810 "psadbw " #out1 ", " #in1 "\n"\
811 "paddw " #in1 ", " #in0 "\n"\
812 "paddw " #in0 ", %%mm6\n"
814 asm volatile (
815 "movl %4,%%ecx\n"
816 "pxor %%mm6,%%mm6\n"
817 "pcmpeqw %%mm7,%%mm7\n"
818 "psllw $15, %%mm7\n"
819 "packsswb %%mm7, %%mm7\n"
820 "movq (%0),%%mm0\n"
821 "movq (%1),%%mm2\n"
822 "movq 8(%0),%%mm1\n"
823 "movq 8(%1),%%mm3\n"
824 "add %3,%0\n"
825 "add %3,%1\n"
826 "psubb %%mm2, %%mm0\n"
827 "psubb %%mm3, %%mm1\n"
828 "pxor %%mm7, %%mm0\n"
829 "pxor %%mm7, %%mm1\n"
830 "jmp 2f\n"
831 "1:\n"
833 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
834 "2:\n"
835 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
837 "subl $2, %%ecx\n"
838 "jnz 1b\n"
840 "movd %%mm6,%2\n"
841 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
842 : "r" ((x86_reg)line_size) , "m" (h)
843 : "%ecx");
844 return tmp;
846 #undef SUM
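/* diff_bytes_mmx(): dst[i] = src1[i] - src2[i] (byte-wise, modulo 256) for w
 * bytes; the MMX loop handles 16 bytes per iteration and the scalar tail loop
 * below finishes any remaining bytes. */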
848 static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
849 x86_reg i=0;
850 asm volatile(
851 "1: \n\t"
852 "movq (%2, %0), %%mm0 \n\t"
853 "movq (%1, %0), %%mm1 \n\t"
854 "psubb %%mm0, %%mm1 \n\t"
855 "movq %%mm1, (%3, %0) \n\t"
856 "movq 8(%2, %0), %%mm0 \n\t"
857 "movq 8(%1, %0), %%mm1 \n\t"
858 "psubb %%mm0, %%mm1 \n\t"
859 "movq %%mm1, 8(%3, %0) \n\t"
860 "add $16, %0 \n\t"
861 "cmp %4, %0 \n\t"
862 " jb 1b \n\t"
863 : "+r" (i)
864 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
866 for(; i<w; i++)
867 dst[i+0] = src1[i+0]-src2[i+0];
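/* sub_hfyu_median_prediction_mmx2(): HuffYUV-style median prediction.  For
 * each byte the predictor is mid_pred(L, T, L + T - LT) from the left, top
 * and top-left neighbours, and dst receives src2 - pred (modulo 256); the
 * first element and the left/left_top state are handled in C below the asm.
 * Per-byte sketch (illustrative only):
 *
 *     pred   = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
 *     dst[i] = src2[i] - pred;
 *     lt     = src1[i];
 *     l      = src2[i];
 */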
870 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
871 x86_reg i=0;
872 uint8_t l, lt;
874 asm volatile(
875 "1: \n\t"
876 "movq -1(%1, %0), %%mm0 \n\t" // LT
877 "movq (%1, %0), %%mm1 \n\t" // T
878 "movq -1(%2, %0), %%mm2 \n\t" // L
879 "movq (%2, %0), %%mm3 \n\t" // X
880 "movq %%mm2, %%mm4 \n\t" // L
881 "psubb %%mm0, %%mm2 \n\t"
882 "paddb %%mm1, %%mm2 \n\t" // L + T - LT
883 "movq %%mm4, %%mm5 \n\t" // L
884 "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
885 "pminub %%mm5, %%mm1 \n\t" // min(T, L)
886 "pminub %%mm2, %%mm4 \n\t"
887 "pmaxub %%mm1, %%mm4 \n\t"
888 "psubb %%mm4, %%mm3 \n\t" // dst - pred
889 "movq %%mm3, (%3, %0) \n\t"
890 "add $8, %0 \n\t"
891 "cmp %4, %0 \n\t"
892 " jb 1b \n\t"
893 : "+r" (i)
894 : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
897 l= *left;
898 lt= *left_top;
900 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
902 *left_top= src1[w-1];
903 *left = src2[w-1];
906 #define DIFF_PIXELS_1(m,a,t,p1,p2)\
907 "mov"#m" "#p1", "#a" \n\t"\
908 "mov"#m" "#p2", "#t" \n\t"\
909 "punpcklbw "#a", "#t" \n\t"\
910 "punpcklbw "#a", "#a" \n\t"\
911 "psubw "#t", "#a" \n\t"\
913 #define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
914 uint8_t *p1b=p1, *p2b=p2;\
915 asm volatile(\
916 DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
917 DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
918 DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
919 "add %4, %1 \n\t"\
920 "add %4, %2 \n\t"\
921 DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
922 DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
923 DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
924 DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
925 "mov"#m1" "#mm"0, %0 \n\t"\
926 DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
927 "mov"#m1" %0, "#mm"0 \n\t"\
928 : "+m"(temp), "+r"(p1b), "+r"(p2b)\
929 : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
932 //the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
934 #define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
935 #define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
937 #define LBUTTERFLY2(a1,b1,a2,b2)\
938 "paddw " #b1 ", " #a1 " \n\t"\
939 "paddw " #b2 ", " #a2 " \n\t"\
940 "paddw " #b1 ", " #b1 " \n\t"\
941 "paddw " #b2 ", " #b2 " \n\t"\
942 "psubw " #a1 ", " #b1 " \n\t"\
943 "psubw " #a2 ", " #b2 " \n\t"
945 #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
946 LBUTTERFLY2(m0, m1, m2, m3)\
947 LBUTTERFLY2(m4, m5, m6, m7)\
948 LBUTTERFLY2(m0, m2, m1, m3)\
949 LBUTTERFLY2(m4, m6, m5, m7)\
950 LBUTTERFLY2(m0, m4, m1, m5)\
951 LBUTTERFLY2(m2, m6, m3, m7)\
953 #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
955 #define MMABS_MMX(a,z)\
956 "pxor " #z ", " #z " \n\t"\
957 "pcmpgtw " #a ", " #z " \n\t"\
958 "pxor " #z ", " #a " \n\t"\
959 "psubw " #z ", " #a " \n\t"
961 #define MMABS_MMX2(a,z)\
962 "pxor " #z ", " #z " \n\t"\
963 "psubw " #a ", " #z " \n\t"\
964 "pmaxsw " #z ", " #a " \n\t"
966 #define MMABS_SSSE3(a,z)\
967 "pabsw " #a ", " #a " \n\t"
969 #define MMABS_SUM(a,z, sum)\
970 MMABS(a,z)\
971 "paddusw " #a ", " #sum " \n\t"
973 #define MMABS_SUM_8x8_NOSPILL\
974 MMABS(%%xmm0, %%xmm8)\
975 MMABS(%%xmm1, %%xmm9)\
976 MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
977 MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
978 MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
979 MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
980 MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
981 MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
982 "paddusw %%xmm1, %%xmm0 \n\t"
984 #ifdef ARCH_X86_64
985 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
986 #else
987 #define MMABS_SUM_8x8_SSE2\
988 "movdqa %%xmm7, (%1) \n\t"\
989 MMABS(%%xmm0, %%xmm7)\
990 MMABS(%%xmm1, %%xmm7)\
991 MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
992 MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
993 MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
994 MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
995 MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
996 "movdqa (%1), %%xmm2 \n\t"\
997 MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
998 "paddusw %%xmm1, %%xmm0 \n\t"
999 #endif
1001 #define LOAD4(o, a, b, c, d)\
1002 "movq "#o"(%1), "#a" \n\t"\
1003 "movq "#o"+8(%1), "#b" \n\t"\
1004 "movq "#o"+16(%1), "#c" \n\t"\
1005 "movq "#o"+24(%1), "#d" \n\t"\
1007 #define STORE4(o, a, b, c, d)\
1008 "movq "#a", "#o"(%1) \n\t"\
1009 "movq "#b", "#o"+8(%1) \n\t"\
1010 "movq "#c", "#o"+16(%1) \n\t"\
1011 "movq "#d", "#o"+24(%1) \n\t"\
1013 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
1014 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
1015 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
1016 #define HSUM_MMX(a, t, dst)\
1017 "movq "#a", "#t" \n\t"\
1018 "psrlq $32, "#a" \n\t"\
1019 "paddusw "#t", "#a" \n\t"\
1020 "movq "#a", "#t" \n\t"\
1021 "psrlq $16, "#a" \n\t"\
1022 "paddusw "#t", "#a" \n\t"\
1023 "movd "#a", "#dst" \n\t"\
1025 #define HSUM_MMX2(a, t, dst)\
1026 "pshufw $0x0E, "#a", "#t" \n\t"\
1027 "paddusw "#t", "#a" \n\t"\
1028 "pshufw $0x01, "#a", "#t" \n\t"\
1029 "paddusw "#t", "#a" \n\t"\
1030 "movd "#a", "#dst" \n\t"\
1032 #define HSUM_SSE2(a, t, dst)\
1033 "movhlps "#a", "#t" \n\t"\
1034 "paddusw "#t", "#a" \n\t"\
1035 "pshuflw $0x0E, "#a", "#t" \n\t"\
1036 "paddusw "#t", "#a" \n\t"\
1037 "pshuflw $0x01, "#a", "#t" \n\t"\
1038 "paddusw "#t", "#a" \n\t"\
1039 "movd "#a", "#dst" \n\t"\
1041 #define HADAMARD8_DIFF_MMX(cpu) \
1042 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1043 DECLARE_ALIGNED_8(uint64_t, temp[13]);\
1044 int sum;\
1046 assert(h==8);\
1048 DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
1050 asm volatile(\
1051 HADAMARD48\
1053 "movq %%mm7, 96(%1) \n\t"\
1055 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1056 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
1058 "movq 96(%1), %%mm7 \n\t"\
1059 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1060 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
1062 : "=r" (sum)\
1063 : "r"(temp)\
1066 DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
1068 asm volatile(\
1069 HADAMARD48\
1071 "movq %%mm7, 96(%1) \n\t"\
1073 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
1074 STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
1076 "movq 96(%1), %%mm7 \n\t"\
1077 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
1078 "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
1079 "movq %%mm6, %%mm7 \n\t"\
1080 "movq %%mm0, %%mm6 \n\t"\
1082 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
1084 HADAMARD48\
1085 "movq %%mm7, 64(%1) \n\t"\
1086 MMABS(%%mm0, %%mm7)\
1087 MMABS(%%mm1, %%mm7)\
1088 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1089 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1090 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1091 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1092 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1093 "movq 64(%1), %%mm2 \n\t"\
1094 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1095 "paddusw %%mm1, %%mm0 \n\t"\
1096 "movq %%mm0, 64(%1) \n\t"\
1098 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
1099 LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
1101 HADAMARD48\
1102 "movq %%mm7, (%1) \n\t"\
1103 MMABS(%%mm0, %%mm7)\
1104 MMABS(%%mm1, %%mm7)\
1105 MMABS_SUM(%%mm2, %%mm7, %%mm0)\
1106 MMABS_SUM(%%mm3, %%mm7, %%mm1)\
1107 MMABS_SUM(%%mm4, %%mm7, %%mm0)\
1108 MMABS_SUM(%%mm5, %%mm7, %%mm1)\
1109 MMABS_SUM(%%mm6, %%mm7, %%mm0)\
1110 "movq (%1), %%mm2 \n\t"\
1111 MMABS_SUM(%%mm2, %%mm7, %%mm1)\
1112 "paddusw 64(%1), %%mm0 \n\t"\
1113 "paddusw %%mm1, %%mm0 \n\t"\
1115 HSUM(%%mm0, %%mm1, %0)\
1117 : "=r" (sum)\
1118 : "r"(temp)\
1120 return sum&0xFFFF;\
1122 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1124 #define HADAMARD8_DIFF_SSE2(cpu) \
1125 static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
1126 DECLARE_ALIGNED_16(uint64_t, temp[4]);\
1127 int sum;\
1129 assert(h==8);\
1131 DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
1133 asm volatile(\
1134 HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
1135 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
1136 HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
1137 MMABS_SUM_8x8\
1138 HSUM_SSE2(%%xmm0, %%xmm1, %0)\
1139 : "=r" (sum)\
1140 : "r"(temp)\
1142 return sum&0xFFFF;\
1144 WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1146 #define MMABS(a,z) MMABS_MMX(a,z)
1147 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1148 HADAMARD8_DIFF_MMX(mmx)
1149 #undef MMABS
1150 #undef HSUM
1152 #define MMABS(a,z) MMABS_MMX2(a,z)
1153 #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
1154 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1155 HADAMARD8_DIFF_MMX(mmx2)
1156 HADAMARD8_DIFF_SSE2(sse2)
1157 #undef MMABS
1158 #undef MMABS_SUM_8x8
1159 #undef HSUM
1161 #ifdef HAVE_SSSE3
1162 #define MMABS(a,z) MMABS_SSSE3(a,z)
1163 #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
1164 HADAMARD8_DIFF_SSE2(ssse3)
1165 #undef MMABS
1166 #undef MMABS_SUM_8x8
1167 #endif
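/* sum_abs_dctelem (DCT_SAD_FUNC below): sum of the absolute values of the 64
 * coefficients of a transformed block, roughly (sketch only):
 *
 *     for (i = 0; i < 64; i++) sum += FFABS(block[i]);
 *
 * used as a cheap measure of how much signal the transformed block contains.
 * DCT_SAD4 folds in four quadwords per invocation with saturating paddusw.
 */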
1169 #define DCT_SAD4(m,mm,o)\
1170 "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
1171 "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
1172 "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
1173 "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
1174 MMABS_SUM(mm##2, mm##6, mm##0)\
1175 MMABS_SUM(mm##3, mm##7, mm##1)\
1176 MMABS_SUM(mm##4, mm##6, mm##0)\
1177 MMABS_SUM(mm##5, mm##7, mm##1)\
1179 #define DCT_SAD_MMX\
1180 "pxor %%mm0, %%mm0 \n\t"\
1181 "pxor %%mm1, %%mm1 \n\t"\
1182 DCT_SAD4(q, %%mm, 0)\
1183 DCT_SAD4(q, %%mm, 8)\
1184 DCT_SAD4(q, %%mm, 64)\
1185 DCT_SAD4(q, %%mm, 72)\
1186 "paddusw %%mm1, %%mm0 \n\t"\
1187 HSUM(%%mm0, %%mm1, %0)
1189 #define DCT_SAD_SSE2\
1190 "pxor %%xmm0, %%xmm0 \n\t"\
1191 "pxor %%xmm1, %%xmm1 \n\t"\
1192 DCT_SAD4(dqa, %%xmm, 0)\
1193 DCT_SAD4(dqa, %%xmm, 64)\
1194 "paddusw %%xmm1, %%xmm0 \n\t"\
1195 HSUM(%%xmm0, %%xmm1, %0)
1197 #define DCT_SAD_FUNC(cpu) \
1198 static int sum_abs_dctelem_##cpu(DCTELEM *block){\
1199 int sum;\
1200 asm volatile(\
1201 DCT_SAD\
1202 :"=r"(sum)\
1203 :"r"(block)\
1205 return sum&0xFFFF;\
1208 #define DCT_SAD DCT_SAD_MMX
1209 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
1210 #define MMABS(a,z) MMABS_MMX(a,z)
1211 DCT_SAD_FUNC(mmx)
1212 #undef MMABS
1213 #undef HSUM
1215 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
1216 #define MMABS(a,z) MMABS_MMX2(a,z)
1217 DCT_SAD_FUNC(mmx2)
1218 #undef HSUM
1219 #undef DCT_SAD
1221 #define DCT_SAD DCT_SAD_SSE2
1222 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
1223 DCT_SAD_FUNC(sse2)
1224 #undef MMABS
1226 #ifdef HAVE_SSSE3
1227 #define MMABS(a,z) MMABS_SSSE3(a,z)
1228 DCT_SAD_FUNC(ssse3)
1229 #undef MMABS
1230 #endif
1231 #undef HSUM
1232 #undef DCT_SAD
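/* ssd_int8_vs_int16_mmx(): sum of squared differences between an int8_t array
 * and an int16_t array of the same length (size must be a multiple of 8):
 * the sum of (pix1[i] - pix2[i])^2 with pix1 sign-extended to 16 bits.  The
 * apparently uninitialized mm3 is harmless: punpckhbw places the pix1 bytes
 * in the high byte of each word and psraw $8 discards the junk low bytes. */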
1234 static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
1235 int sum;
1236 x86_reg i=size;
1237 asm volatile(
1238 "pxor %%mm4, %%mm4 \n"
1239 "1: \n"
1240 "sub $8, %0 \n"
1241 "movq (%2,%0), %%mm2 \n"
1242 "movq (%3,%0,2), %%mm0 \n"
1243 "movq 8(%3,%0,2), %%mm1 \n"
1244 "punpckhbw %%mm2, %%mm3 \n"
1245 "punpcklbw %%mm2, %%mm2 \n"
1246 "psraw $8, %%mm3 \n"
1247 "psraw $8, %%mm2 \n"
1248 "psubw %%mm3, %%mm1 \n"
1249 "psubw %%mm2, %%mm0 \n"
1250 "pmaddwd %%mm1, %%mm1 \n"
1251 "pmaddwd %%mm0, %%mm0 \n"
1252 "paddd %%mm1, %%mm4 \n"
1253 "paddd %%mm0, %%mm4 \n"
1254 "jg 1b \n"
1255 "movq %%mm4, %%mm3 \n"
1256 "psrlq $32, %%mm3 \n"
1257 "paddd %%mm3, %%mm4 \n"
1258 "movd %%mm4, %1 \n"
1259 :"+r"(i), "=r"(sum)
1260 :"r"(pix1), "r"(pix2)
1262 return sum;
1265 #define PHADDD(a, t)\
1266 "movq "#a", "#t" \n\t"\
1267 "psrlq $32, "#a" \n\t"\
1268 "paddd "#t", "#a" \n\t"
1269 /*
1270 pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
1271 pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
1272 pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
1273 */
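/* The PMULHRW/PHADDD macros below parameterize dsputil_mmx_qns.h, which is
 * included three times to generate the _mmx, _3dnow and _ssse3 versions of
 * try_8x8basis/add_8x8basis.  Plain MMX has no rounding high multiply, so
 * PMULHRW is emulated as pmulhw + add one + arithmetic shift right by one;
 * 3DNow! and SSSE3 use their native pmulhrw/pmulhrsw, and SET_RND together
 * with SCALE_OFFSET absorbs the scaling differences between the variants. */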
1274 #define PMULHRW(x, y, s, o)\
1275 "pmulhw " #s ", "#x " \n\t"\
1276 "pmulhw " #s ", "#y " \n\t"\
1277 "paddw " #o ", "#x " \n\t"\
1278 "paddw " #o ", "#y " \n\t"\
1279 "psraw $1, "#x " \n\t"\
1280 "psraw $1, "#y " \n\t"
1281 #define DEF(x) x ## _mmx
1282 #define SET_RND MOVQ_WONE
1283 #define SCALE_OFFSET 1
1285 #include "dsputil_mmx_qns.h"
1287 #undef DEF
1288 #undef SET_RND
1289 #undef SCALE_OFFSET
1290 #undef PMULHRW
1292 #define DEF(x) x ## _3dnow
1293 #define SET_RND(x)
1294 #define SCALE_OFFSET 0
1295 #define PMULHRW(x, y, s, o)\
1296 "pmulhrw " #s ", "#x " \n\t"\
1297 "pmulhrw " #s ", "#y " \n\t"
1299 #include "dsputil_mmx_qns.h"
1301 #undef DEF
1302 #undef SET_RND
1303 #undef SCALE_OFFSET
1304 #undef PMULHRW
1306 #ifdef HAVE_SSSE3
1307 #undef PHADDD
1308 #define DEF(x) x ## _ssse3
1309 #define SET_RND(x)
1310 #define SCALE_OFFSET -1
1311 #define PHADDD(a, t)\
1312 "pshufw $0x0E, "#a", "#t" \n\t"\
1313 "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
1314 #define PMULHRW(x, y, s, o)\
1315 "pmulhrsw " #s ", "#x " \n\t"\
1316 "pmulhrsw " #s ", "#y " \n\t"
1318 #include "dsputil_mmx_qns.h"
1320 #undef DEF
1321 #undef SET_RND
1322 #undef SCALE_OFFSET
1323 #undef PMULHRW
1324 #undef PHADDD
1325 #endif //HAVE_SSSE3
1328 /* FLAC specific */
1329 void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
1330 double *autoc);
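/* dsputilenc_init_mmx(): install the MMX/MMX2/SSE2/SSSE3/3DNow! routines
 * above into the DSPContext according to the CPU features in mm_flags;
 * routines whose results are not bit-identical to the C reference are skipped
 * when CODEC_FLAG_BITEXACT is set. */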
1333 void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
1335 if (mm_flags & MM_MMX) {
1336 const int dct_algo = avctx->dct_algo;
1337 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
1338 if(mm_flags & MM_SSE2){
1339 c->fdct = ff_fdct_sse2;
1340 }else if(mm_flags & MM_MMXEXT){
1341 c->fdct = ff_fdct_mmx2;
1342 }else{
1343 c->fdct = ff_fdct_mmx;
1347 c->get_pixels = get_pixels_mmx;
1348 c->diff_pixels = diff_pixels_mmx;
1349 c->pix_sum = pix_sum16_mmx;
1351 c->diff_bytes= diff_bytes_mmx;
1352 c->sum_abs_dctelem= sum_abs_dctelem_mmx;
1354 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
1355 c->hadamard8_diff[1]= hadamard8_diff_mmx;
1357 c->pix_norm1 = pix_norm1_mmx;
1358 c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
1359 c->sse[1] = sse8_mmx;
1360 c->vsad[4]= vsad_intra16_mmx;
1362 c->nsse[0] = nsse16_mmx;
1363 c->nsse[1] = nsse8_mmx;
1364 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1365 c->vsad[0] = vsad16_mmx;
1368 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1369 c->try_8x8basis= try_8x8basis_mmx;
1371 c->add_8x8basis= add_8x8basis_mmx;
1373 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
1376 if (mm_flags & MM_MMXEXT) {
1377 c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
1378 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
1379 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
1380 c->vsad[4]= vsad_intra16_mmx2;
1382 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1383 c->vsad[0] = vsad16_mmx2;
1386 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
1389 if(mm_flags & MM_SSE2){
1390 c->sum_abs_dctelem= sum_abs_dctelem_sse2;
1391 c->hadamard8_diff[0]= hadamard8_diff16_sse2;
1392 c->hadamard8_diff[1]= hadamard8_diff_sse2;
1393 if (ENABLE_FLAC_ENCODER)
1394 c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
1397 #ifdef HAVE_SSSE3
1398 if(mm_flags & MM_SSSE3){
1399 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1400 c->try_8x8basis= try_8x8basis_ssse3;
1402 c->add_8x8basis= add_8x8basis_ssse3;
1403 c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
1404 c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
1405 c->hadamard8_diff[1]= hadamard8_diff_ssse3;
1407 #endif
1409 if(mm_flags & MM_3DNOW){
1410 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
1411 c->try_8x8basis= try_8x8basis_3dnow;
1413 c->add_8x8basis= add_8x8basis_3dnow;
1417 dsputil_init_pix_mmx(c, avctx);