; vim: set filetype=fasm foldmethod=marker commentstring=;\ %s colorcolumn=101 :
include 'format/elf32.inc'

section '.text' executable writeable align 64
; ***************************************************************************************************
public glTexXImage2D_avx2

glTexXImage2D_avx2: namespace glTexXImage2D_avx2
; ---------------------------------------------------------------------------------------------------
        dst.width.pixs_n := 12 ; the real width
        src.line.pixs_n := 24 ; the src width is the dst.width
; ---------------------------------------------------------------------------------------------------
        define dst.adjust esp + 4 * -6
        define src.adjust esp + 4 * -5
        define ebx_save esp + 4 * -4
        define edi_save esp + 4 * -3
        define esi_save esp + 4 * -2
        define ebp_save esp + 4 * -1

        define ret_addr esp + 4 * 0
        define input esp + 4 * 1
; ---------------------------------------------------------------------------------------------------
        ; input (right after dst adjust, src adjust, ebx, edi, esi, ebp, "return addr")
        mov ebx, dword [input]
        mov edi, dword [ebx + dst]
        mov esi, dword [ebx + src]
        ; dst --------------------------------------------------------------------------------------
        mov eax, dword [ebx + dst.line.pixs_n]
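        shl eax, 2 ; assumed step: converts pixels to bytes, giving the dst.line.bytes_n used just below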
        mov ecx, eax ; = dst.line.bytes_n
        mul dword [ebx + dst.y_offset] ; edx:eax = offset of the line of the first dst rect width
        add edi, eax ; edi = points on the first pix of the line of the first dst rect width
        mov eax, dword [ebx + dst.x_offset]
        shl eax, 2 ; = dst.x_offset.bytes_n
        add edi, eax ; edi = points on the first pix of the dst rect
        ; adjust value from the end pix (past the last pix) of a dst rect width to the first pix of the dst rect width on the next line
        mov ebp, [ebx + dst.width.pixs_n]
        shl ebp, 2 ; = dst.width.bytes_n
        sub ecx, ebp ; = dst.line.bytes_n - dst.width.bytes_n = adjust value
        mov dword [dst.adjust], ecx
        ; src --------------------------------------------------------------------------------------
        mov edx, dword [ebx + src.line.pixs_n]
        shl edx, 2 ; = src.line.bytes_n
        ; adjust value from the end pix (past the last pix) of a src rect width to the first pix of the src rect width on the next line
        sub edx, ebp ; = src.line.bytes_n - dst.width.bytes_n = adjust value
        mov dword [src.adjust], edx
        ; ------------------------------------------------------------------------------------------
        mov ebx, [ebx + height.lines_n]
        mov ecx, ebp ; = dst.width.bytes_n

        ; aggressive unrolled line cpy (useless from AMD Zen 3 on, which has optimized REP instructions)
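        ; (a minimal non-unrolled alternative, assuming a CPU with fast REP MOVSB (ERMSB):
        ; with ecx = dst.width.bytes_n as set above, a single "rep movsb" copies the line)
        ; the blocks below copy 256, 224, 192, 160, 128, 96, 64, 32, 16 and 8 bytes per pass
        ; 256-byte cpy (8 ymm loads/stores)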
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu ymm1, yword [esi + 32 * 1]
        vmovdqu ymm2, yword [esi + 32 * 2]
        vmovdqu ymm3, yword [esi + 32 * 3]
        vmovdqu ymm4, yword [esi + 32 * 4]
        vmovdqu ymm5, yword [esi + 32 * 5]
        vmovdqu ymm6, yword [esi + 32 * 6]
        vmovdqu ymm7, yword [esi + 32 * 7]

        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
        vmovdqu yword [edi + 32 * 5], ymm5
        vmovdqu yword [edi + 32 * 6], ymm6
        vmovdqu yword [edi + 32 * 7], ymm7
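
        ; 224-byte cpy (7 ymm loads/stores)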
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu ymm1, yword [esi + 32 * 1]
        vmovdqu ymm2, yword [esi + 32 * 2]
        vmovdqu ymm3, yword [esi + 32 * 3]
        vmovdqu ymm4, yword [esi + 32 * 4]
        vmovdqu ymm5, yword [esi + 32 * 5]
        vmovdqu ymm6, yword [esi + 32 * 6]

        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
        vmovdqu yword [edi + 32 * 5], ymm5
        vmovdqu yword [edi + 32 * 6], ymm6
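
        ; 192-byte cpy (6 ymm loads/stores)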
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu ymm1, yword [esi + 32 * 1]
        vmovdqu ymm2, yword [esi + 32 * 2]
        vmovdqu ymm3, yword [esi + 32 * 3]
        vmovdqu ymm4, yword [esi + 32 * 4]
        vmovdqu ymm5, yword [esi + 32 * 5]

        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
        vmovdqu yword [edi + 32 * 5], ymm5
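
        ; 160-byte cpy (5 ymm loads/stores)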
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu ymm1, yword [esi + 32 * 1]
        vmovdqu ymm2, yword [esi + 32 * 2]
        vmovdqu ymm3, yword [esi + 32 * 3]
        vmovdqu ymm4, yword [esi + 32 * 4]

        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
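
        ; 128-byte cpy (4 ymm loads/stores)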
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu ymm1, yword [esi + 32 * 1]
        vmovdqu ymm2, yword [esi + 32 * 2]
        vmovdqu ymm3, yword [esi + 32 * 3]

        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
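
        ; 96-byte cpy (3 ymm loads/stores)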
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu ymm1, yword [esi + 32 * 1]
        vmovdqu ymm2, yword [esi + 32 * 2]

        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
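
        ; 64-byte cpy (2 ymm loads/stores)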
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu ymm1, yword [esi + 32 * 1]

        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
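
        ; 32-byte cpy (1 ymm load/store)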
        vmovdqu ymm0, yword [esi + 32 * 0]
        vmovdqu yword [edi + 32 * 0], ymm0
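
        ; 16-byte cpy (1 xmm load/store)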
        vmovdqu xmm0, xword [esi]
        vmovdqu xword [edi], xmm0
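
        ; 8-byte cpy (two dword movs through eax/edx)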
        mov eax, dword [esi + 4 * 0]
        mov edx, dword [esi + 4 * 1]
        mov dword [edi + 4 * 0], eax
        mov dword [edi + 4 * 1], edx
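
        ; line done: step both pointers from the end of the copied width to the start of the rect width on the next line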
        add edi, dword [dst.adjust]
        add esi, dword [src.adjust]
        vzeroupper ; end of AVX2 code
end namespace ; glTexXImage2D_avx2
; ***************************************************************************************************
public clearcolor_avx2

clearcolor_avx2: namespace clearcolor_avx2
; ---------------------------------------------------------------------------------------------------
        dst.width.pixs_n := 4
        dst.line.bytes_n := 8
        dst.height.lines_n := 12
; ---------------------------------------------------------------------------------------------------
        define ebx_save esp + 4 * -4
        define edi_save esp + 4 * -3
        define esi_save esp + 4 * -2
        define ebp_save esp + 4 * -1

        define ret_addr esp + 4 * 0
        define input esp + 4 * 1
; ---------------------------------------------------------------------------------------------------
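        ; broadcast the 32-bit clear color (in xmm0) to all 8 dwords of ymm0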
        vpbroadcastd ymm0, xmm0
        ; input (right after ebx, edi, esi, ebp, "return addr")
        mov ebx, dword [input]
        mov edi, dword [ebx + dst]
        ; dst --------------------------------------------------------------------------------------
        mov ebp, [ebx + dst.width.pixs_n]
        mov esi, [ebx + dst.line.bytes_n]
        shl ebp, 2 ; = dst.width.bytes_n
        ; adjust value from the end pix (past the last pix) of a dst rect width to the first pix of the dst rect width on the next line
        sub esi, ebp ; = dst.line.bytes_n - dst.width.bytes_n = adjust value
        ; ------------------------------------------------------------------------------------------
        mov ebx, [ebx + dst.height.lines_n]
        mov ecx, ebp ; = dst.width.bytes_n

        ; aggressive unrolled line fill (useless from AMD Zen 3 on, which has optimized REP instructions)
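        ; (a minimal non-unrolled alternative, assuming a CPU with fast REP STOS and the
        ; clear color in eax: mov ecx, [ebx + dst.width.pixs_n] then a single "rep stosd")
        ; the blocks below store 256, 224, 192, 160, 128, 96, 64, 32, 16, 8 and 4 bytes
        ; per pass; ymm1..ymm7 presumably hold the same broadcast color as ymm0
        ; 256-byte fill (8 ymm stores)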
        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
        vmovdqu yword [edi + 32 * 5], ymm5
        vmovdqu yword [edi + 32 * 6], ymm6
        vmovdqu yword [edi + 32 * 7], ymm7
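
        ; 224-byte fill (7 ymm stores)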
        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
        vmovdqu yword [edi + 32 * 5], ymm5
        vmovdqu yword [edi + 32 * 6], ymm6
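
        ; 192-byte fill (6 ymm stores)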
        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
        vmovdqu yword [edi + 32 * 5], ymm5
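
        ; 160-byte fill (5 ymm stores)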
        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
        vmovdqu yword [edi + 32 * 4], ymm4
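
        ; 128-byte fill (4 ymm stores)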
        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
        vmovdqu yword [edi + 32 * 3], ymm3
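
        ; 96-byte fill (3 ymm stores)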
        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
        vmovdqu yword [edi + 32 * 2], ymm2
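
        ; 64-byte fill (2 ymm stores)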
        vmovdqu yword [edi + 32 * 0], ymm0
        vmovdqu yword [edi + 32 * 1], ymm1
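
        ; 32-byte fill (1 ymm store)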
        vmovdqu yword [edi + 32 * 0], ymm0
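
        ; 16-byte fill (1 xmm store)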
        vmovdqu xword [edi], xmm0
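
        ; 8-byte fill; note: stores the immediate 0xff252525 rather than reusing the color in xmm0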
        mov dword [edi + 4 * 0], 0xff252525
        mov dword [edi + 4 * 1], 0xff252525
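
        ; 4-byte fill (single pixel), same hard-coded color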
        mov dword [edi], 0xff252525
        add edi, esi ; adjust
        vzeroupper ; end of AVX2 code
end namespace ; clearcolor_avx2
; ***************************************************************************************************
minmax_avx2: namespace minmax_avx2
; ---------------------------------------------------------------------------------------------------
        minmax_scale := 16 * 4
        minmax_scale_x := 16 * 4
        minmax_scale_y := 17 * 4
        minmax_scale_s := 18 * 4
        minmax_scale_t := 19 * 4
; ---------------------------------------------------------------------------------------------------
        define ret_addr esp + 4 * 0
        define ctx esp + 4 * 1
; ---------------------------------------------------------------------------------------------------
        vmovups xmm0, [eax + v0]
        vmovups xmm1, [eax + v1]
        vmovups xmm2, [eax + v2]
        vmovups xmm3, [eax + v3]
        vmovups xmm6, [eax + minmax_scale]
        vminps xmm5, xmm0, xmm1
        vminps xmm5, xmm5, xmm2
        vminps xmm5, xmm5, xmm3
        vcvtdq2ps xmm6, xmm6 ; to f32
        vmulps xmm5, xmm5, xmm6

        vmovups [eax + min], xmm5
        vmaxps xmm4, xmm0, xmm1
        vmaxps xmm4, xmm4, xmm2
        vmaxps xmm4, xmm4, xmm3
        vmulps xmm4, xmm4, xmm6

        vmovups [eax + max], xmm4
        vzeroupper ; end of AVX2 code
end namespace ; minmax_avx2
; ***************************************************************************************************
; TODO: test if the steam client is actually using argb (the blue seems to go away)
public alphablend_rgba_avx2

alphablend_rgba_avx2: namespace alphablend_rgba_avx2
; ---------------------------------------------------------------------------------------------------
        dst_adjust_bytes_n := 4
        src_adjust_bytes_n := 12
; ---------------------------------------------------------------------------------------------------
        define ebx_save esp + 4 * -4
        define edi_save esp + 4 * -3
        define esi_save esp + 4 * -2
        define ebp_save esp + 4 * -1

        define ret_addr esp + 4 * 0
        define input esp + 4 * 1
; ---------------------------------------------------------------------------------------------------
        mov ebx, dword [input]

        mov edi, dword [ebx + dst]
        mov esi, dword [ebx + src]
        mov edx, dword [ebx + height_lines_n]
        ; CONSTANTS -- START -----------------------------------------------------------------------
        ; 0 1 2 3 4 5 6 7 8 9101112131415 1617181920212223 2425262728293031
        ; ff00ff00ff00ff00 ff00ff00ff00ff00 ff00ff00ff00ff00 ff00ff00ff00ff00
        vpcmpeqb ymm7, ymm7, ymm7
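        vpsrlw ymm7, ymm7, 8 ; assumed step: turns the all-ones compare into the ff00... word mask shown above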

        ; 0 1 2 3 4 5 6 7 8 9101112131415 1617181920212223 2425262728293031
        ; 000000ff000000ff 000000ff000000ff 000000ff000000ff 000000ff000000ff
        vpcmpeqb ymm6, ymm6, ymm6
        vpsrld ymm6, ymm6, 24
        vpslld ymm6, ymm6, 24

        mov eax, 0x808080 ; see below for why (maths)
        mov ecx, 0x807f807f ; see below for why (maths)

        ; 0 1 2 3 4 5 6 7 8 9101112131415 1617181920212223 2425262728293031
        ; 8080800080808000 8080800080808000 8080800080808000 8080800080808000
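        vmovd xmm5, eax ; assumed step: moves the 0x00808080 constant into xmm5 before the broadcast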
        vpbroadcastd ymm5, xmm5

        ; 0 1 2 3 4 5 6 7 8 9101112131415 1617181920212223 2425262728293031
        ; 7f807f807f807f80 7f807f807f807f80 7f807f807f807f80 7f807f807f807f80
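        vmovd xmm4, ecx ; assumed step: moves the 0x807f807f constant into xmm4 before the broadcast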
        vpbroadcastd ymm4, xmm4
        ; CONSTANTS -- END -------------------------------------------------------------------------
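        ; ymm5 subtracts the 0x80 bias from r/g/b (not alpha) to get the signed f/t values;
        ; ymm4 is the per-word +0x807f term of the blend maths below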
        mov ecx, dword [ebx + width_pixs_n]
        ; load 4 pixels from the src and 4 pixels from the dst
        vmovdqu xmm0, xword [edi] ; clear hi xmm from ymm
        vmovdqu xmm1, xword [esi] ; clear hi xmm from ymm
        ; from positive to translated and signed values, b = B - 0x80, but _not_ for alpha
        ; WE DO THAT BECAUSE WE WILL USE THE VPMADDUBSW INSTRUCTION BELOW AND A XOR 0xFF
        ; 0 1 2 3 4 5 6 7 8 9101112131415 1617181920212223 2425262728293031
        ; from: R0G0B0A0R1G1B1A1 R2G2B2A2R3G3B3A3 0000000000000000 0000000000000000
        ; to:   r0g0b0A0r1g1b1A1 r2g2b2A2r3g3b3A3 0000000000000000 0000000000000000
        vpsubb xmm0, xmm0, xmm5
        vpsubb xmm1, xmm1, xmm5
        ; 0 1 2 3 4 5 6 7 8 9101112131415 1617181920212223 2425262728293031
        ; from: r0g0b0A0r1g1b1A1 r2g2b2A2r3g3b3A3 0000000000000000 0000000000000000
        ; to:   r0g0b0A0r1g1b1A1 0000000000000000 r2g2b2A2r3g3b3A3 0000000000000000
        vpermq ymm0, ymm0, 10011000b
        vpermq ymm1, ymm1, 10011000b
        ; f -> framebuffer, t -> texture
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: r0f g0f b0f A0f r1f g1f b1f A1f 00 00 00 00 00 00 00 00 r2f g2f b2f A2f r3f g3f b3f A3f 00 00 00 00 00 00 00 00
        ;       r0t g0t b0t A0t r1t g1t b1t A1t 00 00 00 00 00 00 00 00 r2t g2t b2t A2t r3t g3t b3t A3t 00 00 00 00 00 00 00 00
        ; to:   r0f r0t g0f g0t b0f b0t A0f A0t r1f r1t g1f g1t b1f b1t A1f A1t r2f r2t g2f g2t b2f b2t A2f A2t r3f r3t g3f g3t b3f b3t A3f A3t
        vpunpcklbw ymm0, ymm0, ymm1
        ; rgba -> bgra, switch r and b
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: r0f r0t g0f g0t b0f b0t A0f A0t r1f r1t g1f g1t b1f b1t A1f A1t r2f r2t g2f g2t b2f b2t A2f A2t r3f r3t g3f g3t b3f b3t A3f A3t
        ; to:   b0f b0t g0f g0t r0f r0t A0f A0t b1f b1t g1f g1t r1f r1t A1f A1t b2f b2t g2f g2t r2f r2t A2f A2t b3f b3t g3f g3t r3f r3t A3f A3t
        vpshuflw ymm0, ymm0, 11000110b
        vpshufhw ymm0, ymm0, 11000110b
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: b0f b0t g0f g0t r0f r0t A0f A0t b1f b1t g1f g1t r1f r1t A1f A1t b2f b2t g2f g2t r2f r2t A2f A2t b3f b3t g3f g3t r3f r3t A3f A3t
        ; to:   A0t 00 00 00 00 00 00 00 A1t 00 00 00 00 00 00 00 A2t 00 00 00 00 00 00 00 A3t 00 00 00 00 00 00 00
        vpsrlq ymm1, ymm0, 56
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: A0t 00 00 00 00 00 00 00 A1t 00 00 00 00 00 00 00 A2t 00 00 00 00 00 00 00 A3t 00 00 00 00 00 00 00
        ; to:   00 A0t 00 00 00 00 00 00 00 A1t 00 00 00 00 00 00 00 A2t 00 00 00 00 00 00 00 A3t 00 00 00 00 00 00
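        vpsllq ymm2, ymm1, 8 ; assumed step: produces the "to" layout above and the ymm2 operand of the vpor below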
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: A0t 00 00 00 00 00 00 00 A1t 00 00 00 00 00 00 00 A2t 00 00 00 00 00 00 00 A3t 00 00 00 00 00 00 00
        ; to:   A0t A0t 00 00 00 00 00 00 A1t A1t 00 00 00 00 00 00 A2t A2t 00 00 00 00 00 00 A3t A3t 00 00 00 00 00 00
        vpor ymm1, ymm2, ymm1
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: A0t A0t 00 00 00 00 00 00 A1t A1t 00 00 00 00 00 00 A2t A2t 00 00 00 00 00 00 A3t A3t 00 00 00 00 00 00
        ; to:   A0t A0t A0t A0t A0t A0t A0t A0t A1t A1t 00 00 00 00 00 00 A2t A2t A2t A2t A2t A2t A2t A2t A3t A3t 00 00 00 00 00 00
        vpshuflw ymm1, ymm1, 0
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: A0t A0t A0t A0t A0t A0t A0t A0t A1t A1t 00 00 00 00 00 00 A2t A2t A2t A2t A2t A2t A2t A2t A3t A3t 00 00 00 00 00 00
        ; to:   A0t A0t A0t A0t A0t A0t A0t A0t A1t A1t A1t A1t A1t A1t A1t A1t A2t A2t A2t A2t A2t A2t A2t A2t A3t A3t A3t A3t A3t A3t A3t A3t
        vpshufhw ymm1, ymm1, 0
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: A0t A0t A0t A0t A0t A0t A0t A0t A1t A1t A1t A1t A1t A1t A1t A1t A2t A2t A2t A2t A2t A2t A2t A2t A3t A3t A3t A3t A3t A3t A3t A3t
        ;       ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00 ff 00
        ; to:   D = (255 - A) = xor A, 0xff
        ;       D0t A0t D0t A0t D0t A0t D0t A0t D1t A1t D1t A1t D1t A1t D1t A1t D2t A2t D2t A2t D2t A2t D2t A2t D3t A3t D3t A3t D3t A3t D3t A3t
        vpxor ymm1, ymm1, ymm7
        ; integer pixel alpha blending with a 2^8 = 0x100 divisor instead of 0xff, on a 16-bit scale
        ; F = (F * (0xff - A) + T * A + 0xff) >> 8
        ; BUT we are working with f and t, which are:
        ; f = (F - 0x80) and t = (T - 0x80)
        ; If you do the maths, you get:
        ; F = (f * (0xff - A) + t * A + 0x80 * 0xff + 0xff) >> 8
        ;   = (f * (0xff - A) + t * A + 0x807f) >> 8
        ; f * (0xff - a) + t * a
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; src1 (unsigned 8 bits): D0t A0t D0t A0t D0t A0t D0t A0t D1t A1t D1t A1t D1t A1t D1t A1t D2t A2t D2t A2t D2t A2t D2t A2t D3t A3t D3t A3t D3t A3t D3t A3t
        ; src2 (signed 8 bits):   b0f b0t g0f g0t r0f r0t A0f A0t b1f b1t g1f g1t r1f r1t A1f A1t b2f b2t g2f g2t r2f r2t A2f A2t b3f b3t g3f g3t r3f r3t A3f A3t
        ; to: b0------ g0------ r0------ a0------ b1------ g1------ r1------ a1------ b2------ g2------ r2------ a2------ b3------ g3------ r3------ a3------
        vpmaddubsw ymm0, ymm1, ymm0 ; ymm1 is unsigned, ymm0 is signed
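        ; each 16-bit word now holds ch_f * (0xff - A_t) + ch_t * A_t for one channel (ch = b, g, r or A)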
        vpaddw ymm0, ymm0, ymm4
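        vpsrlw ymm0, ymm0, 8 ; assumed step: the >> 8 divisor of the maths above, leaving the blended byte in the low byte of each word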
        ; we have to do it this way because of the weird handling of ymm regs by vpackuswb
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        ; from: b0------ g0------ r0------ A0------ b1------ g1------ r1------ A1------
        ; from: b2------ g2------ r2------ A2------ b3------ g3------ r3------ A3------
        ; to:   b0 g0 r0 A0 b1 g1 r1 A1 b2 g2 r2 A2 b3 g3 r3 A3
        vextracti128 xmm1, ymm0, 1
        vpackuswb xmm0, xmm0, xmm1
        ; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        ; from: b0 g0 r0 A0 b1 g1 r1 A1 b2 g2 r2 A2 b3 g3 r3 A3
        ; to:   b0 g0 r0 ff b1 g1 r1 ff b2 g2 r2 ff b3 g3 r3 ff
        vpor xmm0, xmm0, xmm6
        add edi, 4 * 4 ; bytes
        add esi, 4 * 4 ; bytes
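
        ; 2-pixel tail: same blend on a single qword (see the comments in the 4-pixel body above)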
        vmovq xmm0, qword [edi] ; zero extended
        vmovq xmm1, qword [esi] ; zero extended
        vpsubb xmm0, xmm0, xmm5
        vpsubb xmm1, xmm1, xmm5
        vpunpcklbw xmm0, xmm0, xmm1
        vpshuflw xmm0, xmm0, 11000110b
        vpsrlq xmm1, xmm0, 56
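        vpsllq xmm2, xmm1, 8 ; assumed step: same alpha duplication as in the 4-pixel body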
        vpor xmm1, xmm2, xmm1
        vpshuflw xmm1, xmm1, 0
        vpshufhw xmm1, xmm1, 0
        vpxor xmm1, xmm1, xmm7
        vpmaddubsw xmm0, xmm1, xmm0 ; xmm1 is unsigned, xmm0 is signed
        vpaddw xmm0, xmm0, xmm4
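        vpsrlw xmm0, xmm0, 8 ; assumed step: the >> 8 divisor, as in the 4-pixel body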
        vpackuswb xmm0, xmm0, xmm0
        vpor xmm0, xmm0, xmm6
        vmovq qword [edi], xmm0
        add edi, 2 * 4 ; bytes
        add esi, 2 * 4 ; bytes
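
        ; 1-pixel tail: same blend on a single dword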
        vmovd xmm0, dword [edi] ; zero extended
        vmovd xmm1, dword [esi] ; zero extended
        vpsubb xmm0, xmm0, xmm5
        vpsubb xmm1, xmm1, xmm5
        vpunpcklbw xmm0, xmm0, xmm1
        vpshuflw xmm0, xmm0, 11000110b
        vpsrlq xmm1, xmm0, 56
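        vpsllq xmm2, xmm1, 8 ; assumed step: same alpha duplication as in the 4-pixel body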
        vpor xmm1, xmm2, xmm1
        vpshuflw xmm1, xmm1, 0
        vpshufhw xmm1, xmm1, 0
        vpxor xmm1, xmm1, xmm7
        vpmaddubsw xmm0, xmm1, xmm0 ; xmm1 is unsigned, xmm0 is signed
        vpaddw xmm0, xmm0, xmm4
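        vpsrlw xmm0, xmm0, 8 ; assumed step: the >> 8 divisor, as in the 4-pixel body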
        vpackuswb xmm0, xmm0, xmm0
        vpor xmm0, xmm0, xmm6
        vmovd dword [edi], xmm0
        add edi, dword [ebx + dst_adjust_bytes_n]
        add esi, dword [ebx + src_adjust_bytes_n]
        vzeroupper ; end of AVX2 code
end namespace ; alphablend_rgba_avx2
; ***************************************************************************************************
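; multi-byte NOPs below: the recommended 0f 1f /0 long-NOP encodings (9, 8, 7, 6, 5 and 4
; bytes long), presumably emitted by hand as alignment padding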
db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
db 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00

db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
db 0x0f, 0x1f, 0x44, 0x00, 0x00

db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
db 0x0f, 0x1f, 0x40, 0x00

db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00

db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00

db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00

db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00

db 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00

db 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00

db 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00

db 0x0f, 0x1f, 0x44, 0x00, 0x00

db 0x0f, 0x1f, 0x40, 0x00