2 /* Due to strange behaviour of as.exe we use these macros */
3 /* For all OS/2 coders - please use PGCC to compile this code */
/*\ Symbol decoration helpers.
|*| PR_(foo) pastes a fixed run of underscores in front of foo.
|*| PT_(foo,func) expands to the same decorated name followed by ",func".
|*| Two variants are visible here (three underscores, then two); the
|*| preprocessor conditional that picks one is not part of this excerpt.
|*| The .size lines belong to a multi-line macro whose first line is
|*| likewise not shown.
\*/
4 #define PR_(foo) ___##foo
5 #define PT_(foo,func) ___##foo,func
8 .size ___##sym,.___end_##sym-___##sym; \
11 #define PR_(foo) __##foo
12 #define PT_(foo,func) __##foo,func
15 .size __##sym,.__end_##sym-__##sym; \
20 |*| MMX assembly scaling routine for Imlib2
21 |*| Written by Willem Monsuwe <willem@stack.nl>
26 .globl PR_(mimageScale_mmx_AARGBA)
27 /* .type PT_(mimageScale_mmx_AARGBA,@function) */
30 /*\ Prototype: __mimageScale_mmx_AARGBA(ImlibScaleInfo *isi, DATA32 *dest,
31 |*| int dxx, int dyy, int dx, int dy, int dw, int dh, int dow, int sow)
45 /*\ Local variables that didn't fit in registers \*/
/*\ sow_4 names a stack slot addressed relative to %ebp; the routine's
|*| inline comments describe it as holding sow * 4.
\*/
55 #define sow_4 -40(%ebp)
57 /*\ When %edx points to ImlibScaleInfo, these are the members \*/
/*\ Each macro below is a memory operand at a fixed byte offset from %edx;
|*| consecutive members sit 4 bytes apart (offsets 0, 4, 8, 12 and 16).
\*/
58 #define xpoints (%edx)
59 #define ypoints 4(%edx)
60 #define xapoints 8(%edx)
61 #define yapoints 12(%edx)
62 #define xup_yup 16(%edx)
/*\ mimageScale_mmx_AARGBA: MMX scaling routine (argument list is given in
|*| the prototype comment earlier in this file).
|*| According to the inline comments it checks dw > 0 and dh > 0, biases the
|*| X-based pointers (dest, xp, xap) to the end of the row so the x index can
|*| count up to 0, and then contains four scaling paths: up both ways, down
|*| vertically, down horizontally, and down both ways.
|*| Register roles visible in the operands and comments:
|*|   %edi = destination row pointer, stores go to (%edi, %ecx, 4)
|*|   %ecx = x index into dest and into the xp / xap tables
|*|   %esi = source pixel pointer (*yp + xp[x])
|*| NOTE(review): only part of the instruction stream appears in this
|*| excerpt; confirm any detail against the complete file.
\*/
64 PR_(mimageScale_mmx_AARGBA):
75 /*\ Check (dw > 0) && (dh > 0) \*/
81 /*\ X-based array pointers point to the end; we're looping up to 0 \*/
82 /*\ %edi = dest + dow * dy + dx + dw \*/
88 leal (%edi, %eax, 4), %edi
89 /*\ xp = xpoints + dxx + dw \*/
93 leal (%eax, %ebx, 4), %eax
95 /*\ xap = xapoints + dxx + dw \*/
97 leal (%eax, %ebx, 4), %eax
102 /*\ yp = ypoints + dyy \*/
105 leal (%eax, %ebx, 4), %eax
107 /*\ yap = yapoints + dyy \*/
109 leal (%eax, %ebx, 4), %eax
122 jnc .scale_x_up_y_down
125 /*\ Scaling up both ways \*/
136 /*\ %eax = *yap << 4 \*/
146 /*\ %esi = *yp + xp[x] \*/
150 movl (%eax, %ecx, 4), %eax
151 leal (%esi, %eax, 4), %esi
153 /*\ %eax = xap[x] << 4 \*/
155 movl (%eax, %ecx, 4), %eax
159 /*\ %mm0 = xap[x] << 4 \*/
164 /*\ Load and unpack four pixels in parallel
165 |*| %mm2 = ptr[0], %mm3 = ptr[1]
166 |*| %mm4 = ptr[sow], %mm5 = ptr[sow + 1]
169 movq (%esi, %ebx, 4), %mm4
177 /*\ X interpolation: r = l + (r - l) * xap \*/
186 /*\ Now %mm3 = I(ptr[0], ptr[1]), %mm5 = I(ptr[sow], ptr[sow + 1]) \*/
189 /*\ Load and unpack two pixels
190 |*| %mm3 = ptr[0], %mm5 = ptr[sow]
193 movd (%esi, %ebx, 4), %mm5
197 /*\ Y interpolation: d = u + (d - u) * yap \*/
203 movd %mm5, (%edi, %ecx, 4)
212 /*\ %esi = *yp + xp[x] \*/
216 movl (%eax, %ecx, 4), %eax
217 leal (%esi, %eax, 4), %esi
219 /*\ %eax = xap[x] << 4 \*/
221 movl (%eax, %ecx, 4), %eax
225 /*\ %mm0 = xap[x] << 4 \*/
230 /*\ Load and unpack two pixels in parallel
231 |*| %mm2 = ptr[0], %mm3 = ptr[1]
238 /*\ X interpolation: r = l + (r - l) * xap \*/
244 movd %mm3, (%edi, %ecx, 4)
247 /*\ dptr[x] = *sptr \*/
249 movl %eax, (%edi, %ecx, 4)
257 leal (%edi, %eax, 4), %edi
268 /*\ Scaling down vertically \*/
271 /*\ sow_4 = sow * 4 \*/
278 /*\ Setup My and Cy \*/
298 /*\ %esi = *yp + xp[x] \*/
302 movl (%eax, %ecx, 4), %eax
303 leal (%esi, %eax, 4), %esi
306 /*\ v = (*p * My) >> 10 \*/
312 /*\ i = 0x4000 - My \*/
318 /*\ p += sow; v += (*p * Cy) >> 10 \*/
326 /*\ i -= Cy; while (i > Cy) \*/
337 /*\ p += sow; v += (*p * i) >> 10 \*/
345 /*\ %eax = xap[x] << 5 \*/
347 movl (%eax, %ecx, 4), %eax
350 /*\ mm3 = xap[x] << 5 \*/
358 /*\ vv = (*p * My) >> 10 \*/
364 /*\ i = 0x4000 - My \*/
370 /*\ p += sow; vv += (*p * Cy) >> 10 \*/
378 /*\ i -= Cy; while (i > Cy) \*/
384 /*\ p += sow; vv += (*p * i) >> 10 \*/
392 /*\ v = v + (vv - v) * xap \*/
398 /*\ dest[x] = v >> 4 \*/
401 movd %mm0, (%edi, %ecx, 4)
409 leal (%edi, %eax, 4), %edi
422 jnc .scale_x_down_y_down
425 /*\ Scaling down horizontally \*/
428 /*\ sow_4 = sow * 4 \*/
435 /*\ %eax = *yap << 5 \*/
439 /*\ mm3 = *yap << 5 \*/
448 /*\ %esi = *yp + xp[x] \*/
452 movl (%eax, %ecx, 4), %eax
453 leal (%esi, %eax, 4), %esi
455 /*\ Setup Mx and Cx \*/
/*\ The two movzwl loads read the low and the high 16-bit half of the same
|*| 4-byte table entry at index %ecx (byte offsets 0 and 2).
\*/
457 movzwl (%eax, %ecx, 4), %ebx
459 movzwl 2(%eax, %ecx, 4), %eax
/*\ NOTE(review): this is the horizontal path, yet the comments below step
|*| p by sow, while the down/down path walks a row with *++p. The stepping
|*| instructions are not in this excerpt; confirm which one is meant.
\*/
472 /*\ v = (*p * Mx) >> 10 \*/
478 /*\ i = 0x4000 - Mx \*/
484 /*\ p += sow; v += (*p * Cx) >> 10 \*/
492 /*\ i -= Cx; while (i > Cx) \*/
503 /*\ p += sow; v += (*p * i) >> 10 \*/
517 /*\ vv = (*p * Mx) >> 10 \*/
523 /*\ i = 0x4000 - Mx \*/
529 /*\ p += sow; vv += (*p * Cx) >> 10 \*/
537 /*\ i -= Cx; while (i > Cx) \*/
543 /*\ p += sow; vv += (*p * i) >> 10 \*/
551 /*\ v = v + (vv - v) * yap \*/
557 /*\ dest[x] = v >> 4 \*/
560 movd %mm0, (%edi, %ecx, 4)
568 leal (%edi, %eax, 4), %edi
579 /*\ Scaling down both ways \*/
581 .scale_x_down_y_down:
582 /*\ sow_4 = sow * 4 \*/
589 /*\ Setup My and Cy \*/
600 /*\ %esi = *yp + xp[x] \*/
604 movl (%eax, %ecx, 4), %eax
605 leal (%esi, %eax, 4), %esi
607 /*\ Setup Mx and Cx \*/
/*\ As in the horizontal path: low and high 16-bit halves of one 4-byte
|*| table entry at index %ecx.
\*/
609 movzwl (%eax, %ecx, 4), %ebx
611 movzwl 2(%eax, %ecx, 4), %eax
623 /*\ p = sptr; v = (*p * Mx) >> 9 \*/
630 /*\ i = 0x4000 - Mx \*/
636 /*\ v += (*++p * Cx) >> 9 \*/
644 /*\ i -= Cx; while (i > Cx) \*/
655 /*\ v += (*++p * i) >> 9 \*/
670 /*\ j = 0x4000 - My \*/
676 /*\ sptr += sow; p = sptr \*/
679 /*\ vx = (*p * Mx) >> 9 \*/
685 /*\ i = 0x4000 - Mx \*/
691 /*\ vx += (*++p * Cx) >> 9 \*/
699 /*\ i -= Cx; while (i > Cx) \*/
705 /*\ vx += (*++p * i) >> 9 \*/
713 /*\ v += (vx * Cy) >> 14 \*/
721 /*\ j -= Cy; while (j > Cy) \*/
727 /*\ sptr += sow; p = sptr \*/
730 /*\ vx = (*p * Mx) >> 9 \*/
736 /*\ i = 0x4000 - Mx \*/
742 /*\ vx += (*++p * Cx) >> 9 \*/
750 /*\ i -= Cx; while (i > Cx) \*/
756 /*\ vx += (*++p * i) >> 9 \*/
764 /*\ v += (vx * j) >> 14 \*/
772 /*\ dptr[x] = mm0 >> 5 \*/
775 movd %mm0, (%edi, %ecx, 4)
779 jnz .down_down_loop_x
783 leal (%edi, %eax, 4), %edi
789 jnz .down_down_loop_y
804 SIZE(mimageScale_mmx_AARGBA)
806 .section .note.GNU-stack,"",@progbits