/*
 * Packed constants that the MMX code in this file names directly as
 * memory operands.
 *
 * MMX_AVGDIFF_1 is the operand of the two "paddw MMX_AVGDIFF_1" instructions
 * in mmx_avgdiff(); each 0x00010001 element holds the value 1 in two
 * adjacent 16-bit fields.
 *
 * NOTE(review): the 64-bit operand covers four 16-bit lanes only if
 * unsigned long is 32 bits wide. With a 64-bit unsigned long the in-memory
 * layout changes - confirm the intended target.
 */
7 static unsigned long MMX_AVGDIFF_1
[] = {0x00010001, 0x00010001};
/*
 * MMX_ACCUM_AND is the operand of the "pand" in mmx_accum_avgdiff(): first
 * element all ones, second element zero. The same element-width assumption
 * as above applies.
 */
8 static unsigned long MMX_ACCUM_AND
[] = {0xffffffff, 0x00000000};
/*
 * Clears register mm7 (pxor of the register with itself).
 *
 * mmx_avgdiff() adds its per-pixel results into mm7 and
 * mmx_accum_avgdiff() reads mm7, so this is presumably meant to be called
 * before the first mmx_avgdiff() of a block - confirm against callers.
 *
 * NOTE(review): only the signature and this one instruction are visible;
 * the asm statement around it and any clobber list are not shown here.
 */
10 void inline mmx_start_block()
14 pxor %%mm7, %%mm7; // Zero totals \
/*
 * Accumulates, for 8 consecutive pixels, |c - ((a + b + 1) >> 1)| into mm7.
 *
 * p1: address of 8 bytes "a", passed in %ebx (constraint "b").
 * p2: address of 8 bytes "b", passed in %ecx (constraint "c").
 * p3: address of 8 bytes "c", passed in %edx (constraint "d").
 *
 * Steps carried out by the instructions below:
 *   1. a and b are loaded and their bytes widened to 16-bit words by
 *      unpacking against a zeroed register (low four and high four pixels
 *      are handled in separate registers).
 *   2. The widened a and b are added, MMX_AVGDIFF_1 is added, and the sum
 *      is shifted right by one.
 *   3. c is loaded and widened the same way.
 *   4. The absolute difference is formed from two unsigned saturating
 *      subtractions (c - avg and avg - c) combined with por; one of the two
 *      is always zero.
 *   5. The low-half and high-half results are added into mm7 with paddw,
 *      i.e. as four 16-bit running totals.
 *
 * Nothing is returned and none of the visible instructions store to memory;
 * the result is left in mm7 for mmx_accum_avgdiff().
 *
 * NOTE(review): mm7 is only added to here, so it presumably has to be zeroed
 * first (see mmx_start_block) - confirm against callers.
 * NOTE(review): the code uses 32-bit register names and modifies mm0-mm5
 * and mm7; no clobber list is visible in this fragment - confirm against the
 * full asm statement.
 */
18 void inline mmx_avgdiff(unsigned char *p1
, unsigned char *p2
, unsigned char *p3
)
22 movq (%%ebx), %%mm0; // Load 8 pixels from a \
23 pxor %%mm4, %%mm4; // Zero out temp for unpacking a \
24 movq %%mm0, %%mm2; // Make a copy of a for unpacking \
25 movq (%%ecx), %%mm1; // Load 8 pixels from b \
26 pxor %%mm3, %%mm3; // Zero out b's upper unpacked destination \
27 punpcklbw %%mm4, %%mm2; // Unpack lower 4 pixels from a for addition \
28 movq %%mm1, %%mm5; // Copy b for unpacking \
29 punpckhbw %%mm4, %%mm0; // Unpack upper 4 pixels from a for addition \
30 punpcklbw %%mm3, %%mm5; // Unpack lower 4 pixels from b for addition \
31 paddw %%mm2, %%mm5; // Add lower a and lower b unpacked \
32 punpckhbw %%mm3, %%mm1; // Unpack upper 4 pixels from b for addition \
33 paddw %%mm0, %%mm1; // Add upper a and upper b unpacked \
34 movq (%%edx), %%mm2; // Load c for difference \
35 paddw MMX_AVGDIFF_1, %%mm5; // Add 1 to the result of lower a + b \
36 pxor %%mm4, %%mm4; // Zero out temp for c unpacking \
37 movq %%mm2, %%mm3; // Make a copy of c for unpacking \
38 paddw MMX_AVGDIFF_1, %%mm1; // Add 1 to the result of upper a + b \
39 punpcklbw %%mm4, %%mm3; // Unpack lower 4 pixels from c for subtraction \
40 punpckhbw %%mm4, %%mm2; // Unpack upper 4 pixels from c \
41 movq %%mm3, %%mm0; // Make a copy of lower c for absdiff \
42 psraw $1, %%mm5; // Divide result of lower a + b by 2 \
43 movq %%mm2, %%mm4; // Make a copy of upper c for absdiff \
44 psraw $1, %%mm1; // Divide result of upper a + b by 2 \
45 psubusw %%mm5, %%mm3; // Subtract lower pixels one way \
46 psubusw %%mm1, %%mm2; // Subtract upper pixels one way \
47 psubusw %%mm0, %%mm5; // Subtract lower pixels the other way \
48 por %%mm5, %%mm3; // Or the result of the lower pixels \
49 psubusw %%mm4, %%mm1; // Subtract upper pixels the other way \
50 por %%mm1, %%mm2; // Or the result of the upper pixels \
51 paddw %%mm3, %%mm7; // Accumulate lower pixels \
52 paddw %%mm2, %%mm7; // Accumulate upper pixels \
55 : "b" (p1
), "c" (p2
), "d" (p3
));
/*
 * Reduces the four 16-bit running totals held in mm7 to a single number and
 * returns it.
 *
 * Steps carried out by the instructions below:
 *   1. mm7 is copied to mm6; the low two words (mm6) and the high two words
 *      (mm7) are each widened to doublewords by unpacking against zero.
 *   2. The two registers are added, leaving two partial sums in mm7.
 *   3. A copy is shifted right by 32 bits, mm7 is masked with
 *      MMX_ACCUM_AND, and the two are added with paddd.
 *   4. The 64-bit register is stored through %ebx.
 *
 * Returns r truncated to unsigned int.
 *
 * NOTE(review): the operand list of the asm statement is not visible here;
 * %ebx presumably holds the address of r - confirm.
 * NOTE(review): step 2 uses paddw although its comment speaks of
 * doublewords. paddw adds 16-bit lanes without carrying into the upper half
 * of each doubleword, so it matches a doubleword add only while each
 * partial sum fits in 16 bits - confirm whether paddd was intended.
 * NOTE(review): mm5, mm6 and mm7 are modified; no clobber list is visible.
 */
58 unsigned int mmx_accum_avgdiff()
60 unsigned long long r
= 0;
63 pxor %%mm5, %%mm5; // Clear temp for unpacking \
64 movq %%mm7, %%mm6; // Make a copy for unpacking \
65 punpcklwd %%mm5, %%mm6; // Unpack lower 2 pixels for accumulation \
66 punpckhwd %%mm5, %%mm7; // Unpack high 2 pixels for accumulation \
67 paddw %%mm6, %%mm7; // Add 2 doublewords in each register \
68 movq %%mm7, %%mm6; // Copy the result for a final add \
69 pand MMX_ACCUM_AND, %%mm7; // And the result for accumulation \
70 psrlq $32, %%mm6; // Shift the copy right for accumulation \
71 paddd %%mm6, %%mm7; // Add the results \
72 movq %%mm7, (%%ebx); // Store result \
77 return (unsigned int)r
;
/*
 * Scratch routine that moves a 64-bit value through MMX registers.
 *
 * result: destination address, passed in %ebx (constraint "b"); the visible
 *         movq stores 8 bytes there, so the buffer must be at least 8 bytes.
 *
 * The local r (initialised to 255) is addressed through %ecx (constraint
 * "c"). Its 8 bytes are loaded into both mm0 and mm1; mm1 is stored to
 * result and mm0 is stored back over r.
 *
 * NOTE(review): the function is declared to return unsigned int but no
 * return statement is visible in this fragment, and at least one line
 * between the loads and the stores is not shown - confirm against the full
 * source.
 */
81 unsigned int mmx_test(unsigned char *result
)
83 unsigned long long r
= 255;
86 movq (%%ecx), %%mm0; \
87 movq (%%ecx), %%mm1; \
89 movq %%mm1, (%%ebx); \
90 movq %%mm0, (%%ecx); \
93 : "b" (result
), "c" (&r
));
/*
 * Test driver: prints the input pixels, runs one mmx_avgdiff() /
 * mmx_accum_avgdiff() pass over them, then prints the pixels again followed
 * by the accumulated result.
 *
 * NOTE(review): the statements that assign p1, p2 and p3, the declaration of
 * result, and any call to mmx_start_block() are not visible in this
 * fragment - confirm against the full source.
 */
97 int main(int argc
, char *argv
[])
/* Sample data: nine values for the a/b side, eight for the c side. */
99 unsigned char pixels1
[9] = { 13, 13, 12, 11, 11, 10, 9, 9, 10 };
100 unsigned char pixels3
[8] = { 15, 10, 7, 8, 14, 19, 21, 20 };
101 unsigned char *p1
, *p2
, *p3
;
/* Dump the inputs before the MMX pass (nine bytes via p1, eight via p3). */
108 printf("%d %d %d %d %d %d %d %d %d\n", p1
[0], p1
[1], p1
[2], p1
[3], p1
[4], p1
[5], p1
[6], p1
[7], p1
[8]);
109 printf("%d %d %d %d %d %d %d %d\n", p3
[0], p3
[1], p3
[2], p3
[3], p3
[4], p3
[5], p3
[6], p3
[7]);
110 printf("-----------------------\n");
/* Accumulate the differences for one 8-pixel group, then reduce to a scalar. */
112 mmx_avgdiff(p1
, p2
, p3
);
113 result
= mmx_accum_avgdiff();
114 // p3[0] = p3[1] = p3[2] = p3[4] = p3[5] = p3[6] = p3[7] = 0;
115 // result = mmx_test(p3);
/* Dump the same buffers again so any unintended memory writes would show up. */
116 printf("%d %d %d %d %d %d %d %d %d\n", p1
[0], p1
[1], p1
[2], p1
[3], p1
[4], p1
[5], p1
[6], p1
[7], p1
[8]);
117 printf("%d %d %d %d %d %d %d %d\n", p3
[0], p3
[1], p3
[2], p3
[3], p3
[4], p3
[5], p3
[6], p3
[7]);
/* NOTE(review): mmx_accum_avgdiff() returns unsigned int; "%d" matches only if result is declared int - declaration not visible here. */
118 printf("%d\n", result
);