From 342253c45ae6ca0b3ac3828f80d3db8eaf77ac4b Mon Sep 17 00:00:00 2001 From: ketmar Date: Wed, 24 Nov 2021 01:51:36 +0000 Subject: [PATCH] egra: moved low-level SSE optimised code to separate module; also, optimised SSE code a little more ;-) FossilOrigin-Name: e6d3c9c3c8f54d168f60b92532b945c3e50a5db24f7519305b4110525bfac193 --- egra/gfx/backgl.d | 2 + egra/gfx/backx11.d | 2 + egra/gfx/base.d | 683 +---------------------------------------------- egra/gfx/config.d | 2 + egra/gfx/lowlevel.d | 751 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 758 insertions(+), 682 deletions(-) create mode 100644 egra/gfx/lowlevel.d diff --git a/egra/gfx/backgl.d b/egra/gfx/backgl.d index 0178b00..29297b1 100644 --- a/egra/gfx/backgl.d +++ b/egra/gfx/backgl.d @@ -27,6 +27,7 @@ import iv.cmdcon; import iv.egra.gfx.config; import iv.egra.gfx.base; +import iv.egra.gfx.lowlevel; // ////////////////////////////////////////////////////////////////////////// // @@ -41,6 +42,7 @@ public __gshared uint vArrowTextureId = 0; // ////////////////////////////////////////////////////////////////////////// // shared static this () { import core.stdc.stdlib : malloc; + egfxCheckCPU(); // always allocate additional 16 bytes for SSE routines vglTexBuf = cast(uint*)malloc((cast(uint)VBufWidth*cast(uint)VBufHeight)*vglTexBuf[0].sizeof+16u); if (vglTexBuf is null) { import core.exception : onOutOfMemoryErrorNoGC; onOutOfMemoryErrorNoGC(); } diff --git a/egra/gfx/backx11.d b/egra/gfx/backx11.d index 0f70959..336d17a 100644 --- a/egra/gfx/backx11.d +++ b/egra/gfx/backx11.d @@ -27,6 +27,7 @@ import iv.cmdcongl; import iv.egra.gfx.config; import iv.egra.gfx.base; +import iv.egra.gfx.lowlevel; // ////////////////////////////////////////////////////////////////////////// // @@ -38,6 +39,7 @@ public enum vArrowTextureId = 0; // ////////////////////////////////////////////////////////////////////////// // shared static this () { import core.stdc.stdlib : malloc; + egfxCheckCPU(); // always 
allocate additional 16 bytes for SSE routines vglTexBuf = cast(uint*)malloc((cast(uint)VBufWidth*cast(uint)VBufHeight)*vglTexBuf[0].sizeof+16u); if (vglTexBuf is null) { import core.exception : onOutOfMemoryErrorNoGC; onOutOfMemoryErrorNoGC(); } diff --git a/egra/gfx/base.d b/egra/gfx/base.d index aa34704..0a32a52 100644 --- a/egra/gfx/base.d +++ b/egra/gfx/base.d @@ -24,14 +24,7 @@ import iv.bclamp; import iv.cmdcon; import iv.egra.gfx.config; - - -// ////////////////////////////////////////////////////////////////////////// // -version (DigitalMars) { - version(X86) { - version = EGRA_USE_SSE_ASM; - } -} +import iv.egra.gfx.lowlevel; // ////////////////////////////////////////////////////////////////////////// // @@ -610,40 +603,6 @@ public ubyte gxGetBlue (in uint clr) pure nothrow @safe @nogc { pragma(inline, t public ubyte gxGetAlpha (in uint clr) pure nothrow @safe @nogc { pragma(inline, true); return cast(ubyte)(clr>>24); } -// ////////////////////////////////////////////////////////////////////////// // -// mix `dcvar` with ARGB (or ABGR) `colvar`; dc A is ignored (set to 255) -// main code almost never calls this with solid or transparent `colvar` -// the result will be put to `destvar` (it is written only once, at the end) -// `colvar` and `dcvar` may be read several times -// see http://stereopsis.com/doubleblend.html for the inspiration -version(none) { -// this works for solid and transparent colors too -enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ - immutable uint col_ = `~colvar~`; - immutable uint dc_ = (`~dcvar~`)&0xffffffu; - /*immutable uint a_ = 256-(col_>>24);*/ /* to not loose bits */ - immutable uint a_ = (col_>>24)+1; /* so it will work for both 0 and 255 correctly */ - immutable uint srb_ = (col_&0xff00ffu); - immutable uint sg_ = (col_&0x00ff00u); - immutable uint drb_ = (dc_&0xff00ffu); - immutable uint dg_ = (dc_&0x00ff00u); - immutable uint orb_ = (drb_+(((srb_-drb_)*a_+0x800080u)>>8))&0xff00ffu; - immutable uint 
og_ = (dg_+(((sg_-dg_)*a_+0x008000u)>>8))&0x00ff00u; - (`~destvar~`) = orb_|og_|0xff_00_00_00u; -}`; -} else { -// this works for solid and transparent colors too -enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ - immutable uint a_ = ((`~colvar~`)>>24)+1u; /* to not loose bits */ - uint rb_ = (`~dcvar~`)&0xff00ffu; - uint g_ = (`~dcvar~`)&0x00ff00u; - rb_ += ((cast(uint)((`~colvar~`)&0xff00ffu)-rb_)*a_)>>8; - g_ += ((cast(uint)((`~colvar~`)&0x00ff00u)-g_)*a_)>>8; - /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */ - (`~destvar~`) = (rb_&0xff00ffu)|(g_&0xff_00ff00u)|0xff_00_00_00u; -}`; -} - public uint gxColMix (in uint dc, in uint clr) pure nothrow @trusted @nogc { pragma(inline, true); if (gxIsSolid(clr)) return clr; @@ -1453,643 +1412,3 @@ public void gxDrawLine (int x0, int y0, int x1, int y1, in uint clr, bool lastPo public void gxDrawLine() (in auto ref GxPoint p0, in int x1, in int y1, in uint clr, in bool lastPoint=true) { pragma(inline, true); gxDrawLine(p0.x, p0.y, x1, y1, clr, lastPoint); } public void gxDrawLine() (in int x0, in int y0, auto ref GxPoint p1, in uint clr, in bool lastPoint=true) { pragma(inline, true); gxDrawLine(x0, y0, p1.x, p1.y, clr, lastPoint); } public void gxDrawLine() (in auto ref GxPoint p0, auto ref GxPoint p1, in uint clr, in bool lastPoint=true) { pragma(inline, true); gxDrawLine(p0.x, p0.y, p1.x, p1.y, clr, lastPoint); } - - -// ////////////////////////////////////////////////////////////////////////// // -// size is in dwords -version(EGRA_USE_SSE_ASM) { - -align(16) immutable ubyte[16] sseSpreadOneColor = [ -0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, -0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, -]; - -// for x86 naked functions, DMD will pass last arg in EAX -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC code) -public uint* memFillDW (uint* mptr, in uint 
value, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - cmp EAX,0; - jle done; - - mov ECX,EAX; // ECX=count (because last arg is in EAX) - mov EAX,SS:[ESP+4]; // EAX=value - - cmp ECX,8; - jc simplestore; // too small - - // load XMM0 with our color - push EAX; - movdqu XMM0,SS:[ESP]; - pshufb XMM0,[sseSpreadOneColor]; - add ESP,4; - - // if we cannot align at all, use "rep stosd" - // this should not happen, so i won't bother optimising it - test EDI,0x03; - jnz simplestore; - - // align EDI (we have at least 8 pixels to fill here, so it is safe) -alignloop: - test EDI,0x0f; - jz alignok; - stosd; - dec ECX; - jmp alignloop; - -alignok: - // ECX is never zero here - cmp ECX,4; - jc simplestore; // too small - - // save last 2 bits of counter (we'll mask them later) - movzx EDX,CL; - - // fill by 4 pixels while we can - shr ECX,2; - //align 16; // why not -alignfill: - movaps [EDI],XMM0; - add EDI,16; - dec ECX; - jnz alignfill; - - // fill last 1-3 pixels - mov ECX,EDX; - and CL,0x03; - jz done; - -simplestore: - rep; stosd; - -done: - mov EAX,EDI; // return new mptr - mov EDI,SS:[ESP+8]; // restore EDI - ret 4*2; - } -} - -// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors) -// WARNING! do not call it with fully opaque or fully transparent `clr`! -public alias memBlendColor = sseBlendColor; -/* -public uint* memBlendColor (uint* mptr, in uint clr, int count) nothrow @trusted @nogc { - pragma(inline, true); - version(all) { - if (count < 1) return mptr; - immutable int c4 = (count>>2); // it is actually unsigned - if (c4) { mptr = sseBlendColor4px(mptr, clr, cast(uint)c4); count -= (c4<<2); } - return (count ? 
memBlendColorSlow(mptr, clr, count) : mptr); - } else { - return memBlendColorSlow(mptr, clr, count); - } -} -*/ - - -align(16) immutable ubyte[16] sseSpreadAlpha = [ -0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, -0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, -]; - -align(16) immutable ubyte[16] sseMaxAlpha = [ -0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, -0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, -]; - -align(16) immutable ubyte[16] sseFullByteAlpha = [ -0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, -0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, -]; - -// mix foreground to background -// EAX is pixel count -// background = (alpha * foreground) + (1-alpha)*background -// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors) -// WARNING! do not call it with fully opaque or fully transparent `clr`! -uint* sseBlendColor (uint* dest, uint clr, uint count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - enter 0,0; - // save modified registers - push EDI; - - mov EDI,[EBP+12]; // dest - // it can be negative - cmp EAX,0; - jle done; - mov ECX,EAX; // counter - - // EAX: count - // [EBP+8]: clr - // [EBP+16]: dest - - // align stack - sub ESP,16; - and ESP,0xfffffff0u; - - mov EAX,[EBP+8]; // clr - - // we can premultiply clr first, and convert alpha to 255-alpha - - // prepare SSE data -- 2 pixels - mov SS:[ESP],EAX; - mov SS:[ESP+4],EAX; - - movdqa XMM0,SS:[ESP]; - // expand 8 ubytes to 8 ushorts - pmovzxbw XMM1,XMM0; - // XMM0: xx xx xx xx ar gb ar gb - // XMM1: 0a 0r 0g 0b 0a 0r 0g 0b - pshufb XMM0,[sseSpreadAlpha]; - // XMM0: 00 0a 0a 0a 00 0a 0a 0a - movdqa XMM7,[sseMaxAlpha]; - psubw XMM7,XMM0; // XMM7 is 255-alpha - // XMM7: 00 0a 0a 0a 00 0a 0a 0a - pmulhuw XMM0,XMM1; - // XMM0: 00 0r 0g 0b 00 0r 0g 0b - movdqa XMM6,[sseFullByteAlpha]; - - //XMM0: 2 premultiplied colors - //XMM7: 2 inverted alphas - //XMM6: destination alpha (replace value) - - // totally unaligned? 
- // this should never happen, but meh... - test EDI,0x03; - jnz slowestpath; // alas, the slowest path - - // align the address (if necessary) - test EDI,0x0f; - jz trymix8aligned; - - // we need to mix 1-3 pixels to make the address aligned - // check counter here to allow "slow, but aligned" path (see the code below) - cmp ECX,4; - jc slowestpath; // alas - - // process 4 pixels (we will drop unused ones) - movdqu XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - - // set destination alpha - por XMM1,XMM6; - - // now write 1-3 pixels to align the address - // we are guaranteed to have at least 4 pixels to mix here - // i.e. 
4 processed pixels, and at least 4 pixels in the counter - - // put in temp storage (it is aligned) - movdqa SS:[ESP],XMM1; - mov EDX,ESI; // save ESI (DMD expects it unchanged) - lea ESI,[ESP+4]; -uastoreloop: - movsd; - dec ECX; - test EDI,0x0f; - jnz uastoreloop; - mov ESI,EDX; // restore ESI - // ECX is at least 1 here, and EDI is aligned - -trymix8aligned: - // ECX is never zero here - // use "slow, but aligned" path if we have less than 8 pixels to process - cmp ECX,8; - jc slowalignedpath; - - // save last 3 bits in EAX - // we'll mask it later - movzx EAX,CL; - - // process by 8 pixels while we can - shr ECX,3; - -mix8aligned: - movdqa XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - movdqa XMM5,[EDI+16]; // 4 background pixels - pmovzxbw XMM3,XMM5; // expand 2 lower pixels to XMM3 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM4,XMM5; // expand 2 upper pixels to XMM4 - //XMM3: 2 lower pixels - //XMM4: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - pmulhuw XMM3,XMM7; // multiply by alpha - pmulhuw XMM4,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - paddusw XMM3,XMM0; // add premultiplied colors - paddusw XMM4,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - packuswb XMM3,XMM4; - - // set destination alpha - por XMM1,XMM6; - por XMM3,XMM6; - - movdqa [EDI],XMM1; - movdqa [EDI+16],XMM3; - - add EDI,32; - dec ECX; - jnz mix8aligned; - - // do last 1-7 pixels (last counter is in EAX) - // EDI is guaranteed to be aligned here - mov ECX,EAX; - and CL,0x07; - jnz slowalignedpath; - - // we're done - mov EAX,EDI; - mov EDI,[EBP-4]; // restore EDI - leave; - ret 4*2; - 
- align 16; - // mix by 4 pixels, unaligned -slowestpath: - // mix 4 pixels - movdqu XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - - // set destination alpha - por XMM1,XMM6; - - sub ECX,4; - jc slowestlast; - - movdqu [EDI],XMM1; - add EDI,16; - jecxz done; - jmp slowestpath; - - // last 1-3 pixels (never 0) -slowestlast: - // put in temp storage (it is aligned) - movdqa SS:[ESP],XMM1; - mov EDX,ESI; // save ESI (DMD expects it unchanged) - push ESI; - lea ESI,[ESP+4]; - and ECX,0x03; // left counter - rep; movsd; - mov ESI,EDX; // restore ESI - jmp done; - -done: - mov EAX,EDI; - mov EDI,[EBP-4]; // restore EDI - leave; - ret 4*2; - - - align 16; - // mix by 4 pixels, aligned (used for 1-7 pixels) -slowalignedpath: - // mix 4 pixels - movdqa XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - - // set destination alpha - por XMM1,XMM6; - - sub ECX,4; - jc slowestlast; - - movdqa [EDI],XMM1; - add EDI,16; - jecxz done; - jmp slowalignedpath; - } -} - -// EAX is `count` -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC 
code) -/+ -public uint* memBlendColorSlow (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - cmp EAX,0; - jle quit; - - push EBP; // EBP will contain the counter - push EBX; // EBX is temporary register - push ESI; // DMD expects ESI to be unmodified at exit - mov EBP,EAX; // EBP=counter - - mov EAX,SS:[ESP+16]; // EAX=clr - mov ECX,EAX; // ECX will be clrA - // clrG=clr&0x00ff00u; - and EAX,0x00ff00u; - push EAX; - // clrRB=clr&0xff00ffu; - mov EAX,ECX; - and EAX,0xff00ffu; - push EAX; - // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision - shr ECX,24; - inc ECX; - - // [ESP+0]: clrRB - // [ESP+4]: clrG - // ESI - // EBX - // EBP - // ret addr - // clr - // mptr - // EBP=counter - // EDI=mptr - // ECX=clrA - - align 16; // why not - - /+ - clrA = (clr>>24)+1; - clrRB = clr&0xff00ffu; - clrG = clr&0x00ff00u; - - rb = (*mptr)&0xff00ffu; - rb += ((clrRB-rb)*clrA)>>8; - rb &= 0xff00ffu; - - g = (*mptr)&0x00ff00u; - g += ((clrG-g)*clrA)>>8; - g &= 0x00ff00u; - - *mptr++ = rb|g|0xff000000u; - +/ - - mixloop: - // rb = (*mptr)&0xff00ffu; - // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu; - mov EBX,[EDI]; - mov ESI,EBX; // save `*mptr` - and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu - mov EAX,SS:[ESP]; // EAX=clrRB - sub EAX,EBX; // EAX=clrRB-rb - mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8 - add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8) - and EBX,0xff00ffu; // EAX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu - - // g = (*mptr)&0x00ff00u; - // g += (((clrG-g)*clrA)>>8)&0x00ff00u; - mov EDX,ESI; // EDX=*mptr - and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u - mov ESI,EDX; // save g, we well need it later - mov EAX,SS:[ESP+4]; // EAX=clrG - sub EAX,EDX; // EAX=clrG-g - mul ECX; // EAX=(clrG-g)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrG-g)*clrA)>>8 - add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g - and EAX,0x00ff00u; // 
EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u - - // mix - or EAX,EBX; - or EAX,0xff000000u; - - stosd; - dec EBP; - jnz mixloop; - - add ESP,2*4; // drop temp vars - // restore registers - pop ESI; - pop EBX; - pop EBP; - -quit: - mov EAX,EDI; // result - mov EDI,SS:[ESP+8]; // restore EDI - ret 8; - } -} -+/ - - -// for x86 naked functions, DMD will pass last arg in EAX -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC code) -// this doesn't change every 2nd pixel; `count` is count of ALL pixels -public uint* memFillDWDash (uint* mptr, in uint value, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - // it can be negative - cmp EAX,0; - jle quit; - mov ECX,EAX; // ECX=count (because last arg is in EAX) - mov EAX,SS:[ESP+4]; // EAX=value - - align 16; -simplestore: - stosd; - dec ECX; - jz quit; - add EDI,4; - dec ECX; - jnz simplestore; - -quit: - mov EAX,EDI; // return new mptr - mov EDI,SS:[ESP+8]; // restore EDI - ret 8; - } -} - - -//TODO: rewrite this with SSE -// EAX is `count` -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC code) -// this doesn't change every 2nd pixel; `count` is count of ALL pixels -public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - cmp EAX,0; - jle quit; - - push EBP; // EBP will contain the counter - push EBX; // EBX is temporary register - push ESI; // DMD expects ESI to be unmodified at exit - mov EBP,EAX; // EBP=counter - - mov EAX,SS:[ESP+16]; // EAX=clr - mov ECX,EAX; // ECX will be clrA - // clrG=clr&0x00ff00u; - and EAX,0x00ff00u; - push EAX; - // clrRB=clr&0xff00ffu; - mov EAX,ECX; - and 
EAX,0xff00ffu; - push EAX; - // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision - shr ECX,24; - inc ECX; - - // [ESP+0]: clrRB - // [ESP+4]: clrG - // ESI - // EBX - // EBP - // ret addr - // clr - // mptr - // EBP=counter - // EDI=mptr - // ECX=clrA - - align 16; // why not - - /+ - clrA = (clr>>24)+1; - clrRB = clr&0xff00ffu; - clrG = clr&0x00ff00u; - - rb = (*mptr)&0xff00ffu; - rb += ((clrRB-rb)*clrA)>>8; - rb &= 0xff00ffu; - - g = (*mptr)&0x00ff00u; - g += ((clrG-g)*clrA)>>8; - g &= 0x00ff00u; - - *mptr++ = rb|g|0xff000000u; - +/ - - mixloop: - // rb = (*mptr)&0xff00ffu; - // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu; - mov EBX,[EDI]; - mov ESI,EBX; // save `*mptr` - and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu - mov EAX,SS:[ESP]; // EAX=clrRB - sub EAX,EBX; // EAX=clrRB-rb - mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8 - add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8) - and EBX,0xff00ffu; // EAX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu - - // g = (*mptr)&0x00ff00u; - // g += (((clrG-g)*clrA)>>8)&0x00ff00u; - mov EDX,ESI; // EDX=*mptr - and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u - mov ESI,EDX; // save g, we well need it later - mov EAX,SS:[ESP+4]; // EAX=clrG - sub EAX,EDX; // EAX=clrG-g - mul ECX; // EAX=(clrG-g)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrG-g)*clrA)>>8 - add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g - and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u - - // mix - or EAX,EBX; - or EAX,0xff000000u; - - stosd; - dec EBP; - jz mixdone; - add EDI,4; - dec EBP; - jnz mixloop; - -mixdone: - add ESP,2*4; // drop temp vars - // restore registers - pop ESI; - pop EBX; - pop EBP; - -quit: - mov EAX,EDI; // result - mov EDI,SS:[ESP+8]; // restore EDI - ret 8; - } -} - -} else { -// no SSE -public uint* memFillDW (uint* ptr, in uint value, in int count) nothrow @trusted @nogc { - pragma(inline, true); - if (count > 0) { - ptr[0..cast(usize)count] = value; - ptr += cast(usize)count; - } - return ptr; -} 
-public uint* memFillDWDash (uint* ptr, in uint value, in int count) nothrow @trusted @nogc { - pragma(inline, true); - foreach (immutable c; 0..count) { if (!(c&1)) *ptr++ = value; else ++ptr; } - if (count > 0) { - ptr[0..cast(usize)count] = value; - ptr += cast(usize)count; - } - return ptr; -} -public uint* memBlendColor (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - foreach (immutable _; 0..count) { mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); } - return mptr; -} -public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - foreach (immutable c; 0..count) { if (!(c&1)) mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); else ++mptr; } - return mptr; -} -} diff --git a/egra/gfx/config.d b/egra/gfx/config.d index 08e75b8..48e6654 100644 --- a/egra/gfx/config.d +++ b/egra/gfx/config.d @@ -18,8 +18,10 @@ */ module iv.egra.gfx.config /*is aliced*/; +// uncomment this to use OpenGL backend instead of X11 //version = egfx_opengl_backend; + version(egfx_opengl_backend) { public enum EGfxOpenGLBackend = true; } else { diff --git a/egra/gfx/lowlevel.d b/egra/gfx/lowlevel.d new file mode 100644 index 0000000..27f32c2 --- /dev/null +++ b/egra/gfx/lowlevel.d @@ -0,0 +1,751 @@ +/* + * Simple Framebuffer Gfx/GUI lib + * + * coded by Ketmar // Invisible Vector + * Understanding is not required. Only obedience. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License ONLY. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +module iv.egra.gfx.lowlevel /*is aliced*/; +private: + +// uncomment this to disable SSE4.1 optimisations +//version = egfx_disable_sse41; + + +version(egfx_disable_sse41) { + version(egfx_use_sse41) { + static assert(false, "EGRA: SSE4.1 is both forced and disabled. wtf?!"); + } +} else { + version(DigitalMars) { + version(X86) { + version = egfx_use_sse41; + } else { + version(egfx_use_sse41) { + static assert(false, "EGRA: SSE4.1 is not supported on 64-bit architectures."); + } + } + } else { + version(egfx_use_sse41) { + static assert(false, "EGRA: SSE4.1 is not supported on non-DMD compilers."); + } + } +} + +version(egfx_use_sse41) { + public enum EGfxUseSSE41 = true; +} else { + public enum EGfxUseSSE41 = false; +} + + +// ////////////////////////////////////////////////////////////////////////// // +public void egfxCheckCPU () nothrow @trusted @nogc { + version(egfx_use_sse41) { + import core.cpuid : sse41; + if (!sse41) { + import core.stdc.stdio : stderr, fprintf; + fprintf(stderr, "ERROR: EGRA requires CPU with SSE4.1 support!"); + assert(0, "ERROR: EGRA requires CPU with SSE4.1 support!"); + } + } +} + + +// ////////////////////////////////////////////////////////////////////////// // +// mix `dcvar` with ARGB (or ABGR) `colvar`; dc A is ignored (set to 255) +// main code almost never calls this with solid or transparent `colvar` +// the result will be put to `destvar` (it is written only once, at the end) +// `colvar` and `dcvar` may be read several times +// see http://stereopsis.com/doubleblend.html for the inspiration +version(none) { +// this works for solid and transparent colors too +public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ + immutable uint col_ = `~colvar~`; + immutable uint dc_ = (`~dcvar~`)&0xffffffu; + /*immutable uint a_ = 256-(col_>>24);*/ /* to not loose bits */ + immutable uint a_ = (col_>>24)+1; /* so it will work for both 0 and 255 correctly */ + immutable uint srb_ = (col_&0xff00ffu); + 
immutable uint sg_ = (col_&0x00ff00u); + immutable uint drb_ = (dc_&0xff00ffu); + immutable uint dg_ = (dc_&0x00ff00u); + immutable uint orb_ = (drb_+(((srb_-drb_)*a_+0x800080u)>>8))&0xff00ffu; + immutable uint og_ = (dg_+(((sg_-dg_)*a_+0x008000u)>>8))&0x00ff00u; + (`~destvar~`) = orb_|og_|0xff_00_00_00u; +}`; +} else { +// this works for solid and transparent colors too +public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ + immutable uint a_ = ((`~colvar~`)>>24)+1u; /* to not loose bits */ + uint rb_ = (`~dcvar~`)&0xff00ffu; + uint g_ = (`~dcvar~`)&0x00ff00u; + rb_ += ((cast(uint)((`~colvar~`)&0xff00ffu)-rb_)*a_)>>8; + g_ += ((cast(uint)((`~colvar~`)&0x00ff00u)-g_)*a_)>>8; + /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */ + (`~destvar~`) = (rb_&0xff00ffu)|(g_&0xff_00ff00u)|0xff_00_00_00u; +}`; +} + + +// ////////////////////////////////////////////////////////////////////////// // +// size is in dwords +version(egfx_use_sse41) { + //pragma(msg," !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
"); + +/* +align(16) immutable ubyte[16] sseSpreadOneColor = [ +0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, +0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, +]; +*/ + +// for x86 naked functions, DMD will pass last arg in EAX +// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right +// we need to preserve ESI and EDI (and EBX in case of PIC code) +public uint* memFillDW (uint* mptr, in uint value, in int count) nothrow @trusted @nogc { + asm nothrow @trusted @nogc { + naked; + xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI + cmp EAX,0; + jle done; + + mov ECX,EAX; // ECX=count (because last arg is in EAX) + mov EAX,SS:[ESP+4]; // EAX=value + + cmp ECX,8; + jc simplestore; // too small + + // load XMM0 with our color + push EAX; + push EAX; + // used `movdqu`, because it indicates int type + // this doesn't matter, it just looks nicer + // also, `movlps` is one byte shorter + movlps XMM0,SS:[ESP]; + movlhps XMM0,XMM0; // copy low 64 bits of XMM0 to high 64 bits of XMM0 + //movdqu XMM0,SS:[ESP]; + //pshufb XMM0,[sseSpreadOneColor]; + add ESP,8; + + // if we cannot align at all, use "rep stosd" + // this should not happen, so i won't bother optimising it + test EDI,0x03; + jnz simplestore; + + // align EDI (we have at least 8 pixels to fill here, so it is safe) +alignloop: + test EDI,0x0f; + jz alignok; + stosd; + dec ECX; + jmp alignloop; + +alignok: + // ECX is never zero here + cmp ECX,4; + jc simplestore; // too small + + // save last 2 bits of counter (we'll mask them later) + movzx EDX,CL; + + // fill by 4 pixels while we can + shr ECX,2; + //align 16; // why not +alignfill: + movaps [EDI],XMM0; + add EDI,16; + dec ECX; + jnz alignfill; + + // fill last 1-3 pixels + mov ECX,EDX; + and CL,0x03; + jz done; + +simplestore: + rep; stosd; + +done: + mov EAX,EDI; // return new mptr + mov EDI,SS:[ESP+8]; // restore EDI + ret 4*2; + } +} + +// WARNING! 
this function is not quite right (0 and 255 alphas will still modify the colors) +// WARNING! do not call it with fully opaque or fully transparent `clr`! +public alias memBlendColor = sseBlendColor; +/* +public uint* memBlendColor (uint* mptr, in uint clr, int count) nothrow @trusted @nogc { + pragma(inline, true); + version(all) { + if (count < 1) return mptr; + immutable int c4 = (count>>2); // it is actually unsigned + if (c4) { mptr = sseBlendColor4px(mptr, clr, cast(uint)c4); count -= (c4<<2); } + return (count ? memBlendColorSlow(mptr, clr, count) : mptr); + } else { + return memBlendColorSlow(mptr, clr, count); + } +} +*/ + + +align(16) immutable ubyte[16] sseSpreadAlpha = [ +0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, +0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, +]; + +align(16) immutable ubyte[16] sseMaxAlpha = [ +0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, +0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, +]; + +align(16) immutable ubyte[16] sseFullByteAlpha = [ +0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, +0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, +]; + +// mix foreground to background +// EAX is pixel count +// background = (alpha * foreground) + (1-alpha)*background +// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors) +// WARNING! do not call it with fully opaque or fully transparent `clr`! 
+public uint* sseBlendColor (uint* dest, uint clr, uint count) nothrow @trusted @nogc { + asm nothrow @trusted @nogc { + naked; + enter 0,0; + // save modified registers + push EDI; + + mov EDI,[EBP+12]; // dest + // it can be negative + cmp EAX,0; + jle done; + mov ECX,EAX; // counter + + // EAX: count + // [EBP+8]: clr + // [EBP+16]: dest + + // align stack + sub ESP,16; + and ESP,0xfffffff0u; + + mov EAX,[EBP+8]; // clr + + // we can premultiply clr first, and convert alpha to 255-alpha + + // prepare SSE data -- 2 pixels + mov SS:[ESP],EAX; + mov SS:[ESP+4],EAX; + + // used `movdqa`, because it indicates int type + // this doesn't matter, it just looks nicer + // also, `movlps` is one byte shorter + movlps XMM0,SS:[ESP]; + //movdqa XMM0,SS:[ESP]; + // expand 8 ubytes to 8 ushorts + pmovzxbw XMM1,XMM0; + // XMM0: xx xx xx xx ar gb ar gb + // XMM1: 0a 0r 0g 0b 0a 0r 0g 0b + pshufb XMM0,[sseSpreadAlpha]; + // XMM0: 00 0a 0a 0a 00 0a 0a 0a + movdqa XMM7,[sseMaxAlpha]; + psubw XMM7,XMM0; // XMM7 is 255-alpha + // XMM7: 00 0a 0a 0a 00 0a 0a 0a + pmulhuw XMM0,XMM1; + // XMM0: 00 0r 0g 0b 00 0r 0g 0b + movdqa XMM6,[sseFullByteAlpha]; + + //XMM0: 2 premultiplied colors + //XMM7: 2 inverted alphas + //XMM6: destination alpha (replace value) + + // totally unaligned? + // this should never happen, but meh... 
+ test EDI,0x03; + jnz slowestpath; // alas, the slowest path + + // align the address (if necessary) + test EDI,0x0f; + jz trymix8aligned; + + // we need to mix 1-3 pixels to make the address aligned + // check counter here to allow "slow, but aligned" path (see the code below) + cmp ECX,4; + jc slowestpath; // alas + + // process 4 pixels (we will drop unused ones) + movdqu XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + + // set destination alpha + por XMM1,XMM6; + + // now write 1-3 pixels to align the address + // we are guaranteed to have at least 4 pixels to mix here + // i.e. 
4 processed pixels, and at least 4 pixels in the counter + + // put in temp storage (it is aligned) + movdqa SS:[ESP],XMM1; + mov EDX,ESI; // save ESI (DMD expects it unchanged) + lea ESI,[ESP+4]; +uastoreloop: + movsd; + dec ECX; + test EDI,0x0f; + jnz uastoreloop; + mov ESI,EDX; // restore ESI + // ECX is at least 1 here, and EDI is aligned + +trymix8aligned: + // ECX is never zero here + // use "slow, but aligned" path if we have less than 8 pixels to process + cmp ECX,8; + jc slowalignedpath; + + // save last 3 bits in EAX + // we'll mask it later + movzx EAX,CL; + + // process by 8 pixels while we can + shr ECX,3; + +mix8aligned: + movdqa XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + movdqa XMM5,[EDI+16]; // 4 background pixels + pmovzxbw XMM3,XMM5; // expand 2 lower pixels to XMM3 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM4,XMM5; // expand 2 upper pixels to XMM4 + //XMM3: 2 lower pixels + //XMM4: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + pmulhuw XMM3,XMM7; // multiply by alpha + pmulhuw XMM4,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + paddusw XMM3,XMM0; // add premultiplied colors + paddusw XMM4,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + packuswb XMM3,XMM4; + + // set destination alpha + por XMM1,XMM6; + por XMM3,XMM6; + + movdqa [EDI],XMM1; + movdqa [EDI+16],XMM3; + + add EDI,32; + dec ECX; + jnz mix8aligned; + + // do last 1-7 pixels (last counter is in EAX) + // EDI is guaranteed to be aligned here + mov ECX,EAX; + and CL,0x07; + jnz slowalignedpath; + + // we're done + mov EAX,EDI; + mov EDI,[EBP-4]; // restore EDI + leave; + ret 4*2; + 
+ align 16; + // mix by 4 pixels, unaligned +slowestpath: + // mix 4 pixels + movdqu XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + + // set destination alpha + por XMM1,XMM6; + + sub ECX,4; + jc slowestlast; + + movdqu [EDI],XMM1; + add EDI,16; + jecxz done; + jmp slowestpath; + + // last 1-3 pixels (never 0) +slowestlast: + // put in temp storage (it is aligned) + movdqa SS:[ESP],XMM1; + mov EDX,ESI; // save ESI (DMD expects it unchanged) + push ESI; + lea ESI,[ESP+4]; + and ECX,0x03; // left counter + rep; movsd; + mov ESI,EDX; // restore ESI + jmp done; + +done: + mov EAX,EDI; + mov EDI,[EBP-4]; // restore EDI + leave; + ret 4*2; + + + align 16; + // mix by 4 pixels, aligned (used for 1-7 pixels) +slowalignedpath: + // mix 4 pixels + movdqa XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + + // set destination alpha + por XMM1,XMM6; + + sub ECX,4; + jc slowestlast; + + movdqa [EDI],XMM1; + add EDI,16; + jecxz done; + jmp slowalignedpath; + } +} + +// EAX is `count` +// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right +// we need to preserve ESI and EDI (and EBX in case of PIC 
code)
/+
public uint* memBlendColorSlow (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;

    xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle quit;

    push EBP; // EBP will contain the counter
    push EBX; // EBX is temporary register
    push ESI; // DMD expects ESI to be unmodified at exit
    mov EBP,EAX; // EBP=counter

    mov EAX,SS:[ESP+16]; // EAX=clr
    mov ECX,EAX; // ECX will be clrA
    // clrG=clr&0x00ff00u;
    and EAX,0x00ff00u;
    push EAX;
    // clrRB=clr&0xff00ffu;
    mov EAX,ECX;
    and EAX,0xff00ffu;
    push EAX;
    // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision
    shr ECX,24;
    inc ECX;

    // [ESP+0]: clrRB
    // [ESP+4]: clrG
    // ESI
    // EBX
    // EBP
    // ret addr
    // clr
    // mptr
    // EBP=counter
    // EDI=mptr
    // ECX=clrA

    align 16; // why not

    /+
      clrA = (clr>>24)+1;
      clrRB = clr&0xff00ffu;
      clrG = clr&0x00ff00u;

      rb = (*mptr)&0xff00ffu;
      rb += ((clrRB-rb)*clrA)>>8;
      rb &= 0xff00ffu;

      g = (*mptr)&0x00ff00u;
      g += ((clrG-g)*clrA)>>8;
      g &= 0x00ff00u;

      *mptr++ = rb|g|0xff000000u;
    +/

  mixloop:
    // rb = (*mptr)&0xff00ffu;
    // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu;
    mov EBX,[EDI];
    mov ESI,EBX; // save `*mptr`
    and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu
    mov EAX,SS:[ESP]; // EAX=clrRB
    sub EAX,EBX; // EAX=clrRB-rb
    mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8
    add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8)
    and EBX,0xff00ffu; // EBX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu

    // g = (*mptr)&0x00ff00u;
    // g += (((clrG-g)*clrA)>>8)&0x00ff00u;
    mov EDX,ESI; // EDX=*mptr
    and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u
    mov ESI,EDX; // save g, we will need it later
    mov EAX,SS:[ESP+4]; // EAX=clrG
    sub EAX,EDX; // EAX=clrG-g
    mul ECX; // EAX=(clrG-g)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrG-g)*clrA)>>8
    add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g
    and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u

    // mix
    or EAX,EBX;
    or EAX,0xff000000u;

    stosd;
    dec EBP;
    jnz mixloop;

    add ESP,2*4; // drop temp vars
    // restore registers
    pop ESI;
    pop EBX;
    pop EBP;

quit:
    mov EAX,EDI; // result
    mov EDI,SS:[ESP+8]; // restore EDI
    ret 8;
  }
}
+/


/// Dashed solid fill: writes `value` into every even-indexed dword of
/// `mptr[0..count]` and leaves odd-indexed dwords untouched; `count` is the
/// count of ALL pixels, not only of the written ones.
/// Returns the pointer advanced past the processed span (new `mptr`).
// for x86 naked functions, DMD will pass last arg in EAX
// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right
// we need to preserve ESI and EDI (and EBX in case of PIC code)
// this doesn't change every 2nd pixel; `count` is count of ALL pixels
public uint* memFillDWDash (uint* mptr, in uint value, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI
    // it can be negative
    cmp EAX,0;
    jle quit;
    mov ECX,EAX; // ECX=count (because last arg is in EAX)
    mov EAX,SS:[ESP+4]; // EAX=value

    align 16;
simplestore:
    stosd; // write one dword...
    dec ECX;
    jz quit;
    add EDI,4; // ...and skip the next one
    dec ECX;
    jnz simplestore;

quit:
    mov EAX,EDI; // return new mptr
    mov EDI,SS:[ESP+8]; // restore EDI
    ret 8;
  }
}


/// Dashed alpha-blend: mixes `clr` (with its alpha, `+1` for precision) over
/// every even-indexed pixel of `mptr[0..count]`, skips odd-indexed pixels,
/// and forces the destination alpha to 0xff.  `count` counts ALL pixels.
/// Returns the pointer advanced past the processed span (new `mptr`).
//TODO: rewrite this with SSE
// EAX is `count`
// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right
// we need to preserve ESI and EDI (and EBX in case of PIC code)
// this doesn't change every 2nd pixel; `count` is count of ALL pixels
public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;

    xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle quit;

    push EBP; // EBP will contain the counter
    push EBX; // EBX is temporary register
    push ESI; // DMD expects ESI to be unmodified at exit
    mov EBP,EAX; // EBP=counter

    mov EAX,SS:[ESP+16]; // EAX=clr
    mov ECX,EAX; // ECX will be clrA
    // clrG=clr&0x00ff00u;
    and EAX,0x00ff00u;
    push EAX;
    // clrRB=clr&0xff00ffu;
    mov EAX,ECX;
    and EAX,0xff00ffu;
    push EAX;
    // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision
    shr ECX,24;
    inc ECX;

    // stack/register layout at this point:
    // [ESP+0]: clrRB
    // [ESP+4]: clrG
    // ESI
    // EBX
    // EBP
    // ret addr
    // clr
    // mptr
    // EBP=counter
    // EDI=mptr
    // ECX=clrA

    align 16; // why not

    /+
      clrA = (clr>>24)+1;
      clrRB = clr&0xff00ffu;
      clrG = clr&0x00ff00u;

      rb = (*mptr)&0xff00ffu;
      rb += ((clrRB-rb)*clrA)>>8;
      rb &= 0xff00ffu;

      g = (*mptr)&0x00ff00u;
      g += ((clrG-g)*clrA)>>8;
      g &= 0x00ff00u;

      *mptr++ = rb|g|0xff000000u;
    +/

  mixloop:
    // rb = (*mptr)&0xff00ffu;
    // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu;
    mov EBX,[EDI];
    mov ESI,EBX; // save `*mptr`
    and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu
    mov EAX,SS:[ESP]; // EAX=clrRB
    sub EAX,EBX; // EAX=clrRB-rb
    mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8
    add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8)
    and EBX,0xff00ffu; // EBX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu

    // g = (*mptr)&0x00ff00u;
    // g += (((clrG-g)*clrA)>>8)&0x00ff00u;
    mov EDX,ESI; // EDX=*mptr
    and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u
    mov ESI,EDX; // save g, we will need it later
    mov EAX,SS:[ESP+4]; // EAX=clrG
    sub EAX,EDX; // EAX=clrG-g
    mul ECX; // EAX=(clrG-g)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrG-g)*clrA)>>8
    add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g
    and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u

    // mix
    or EAX,EBX;
    or EAX,0xff000000u;

    stosd; // store blended pixel...
    dec EBP;
    jz mixdone;
    add EDI,4; // ...and skip the next one (dash)
    dec EBP;
    jnz mixloop;

mixdone:
    add ESP,2*4; // drop temp vars
    // restore registers
    pop ESI;
    pop EBX;
    pop EBP;

quit:
    mov EAX,EDI; // result
    mov EDI,SS:[ESP+8]; // restore EDI
    ret 8;
  }
}

} else {
// no SSE
/// Solid fill: `ptr[0..count] = value`; returns pointer advanced by `count`.
/// Negative or zero `count` is a no-op (matches the asm versions).
public uint* memFillDW (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  if (count > 0) {
    ptr[0..cast(usize)count] = value;
    ptr += cast(usize)count;
  }
  return ptr;
}
/// Dashed solid fill (no-SSE fallback): writes `value` into every even-indexed
/// dword of `ptr[0..count]`, leaves odd-indexed dwords untouched; `count` is
/// the count of ALL pixels.  Returns the pointer advanced past the processed
/// span, mirroring the SSE/asm `memFillDWDash`.
public uint* memFillDWDash (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  foreach (immutable c; 0..count) { if (!(c&1)) *ptr++ = value; else ++ptr; }
  // BUGFIX: the previous version also did a solid `ptr[0..count] = value`
  // fill here -- a copy-paste leftover from `memFillDW` that clobbered
  // `count` extra dwords past the dashed span and returned a pointer
  // advanced by 2*count, diverging from the asm implementation.
  return ptr;
}

/// Alpha-blend (no-SSE fallback): mixes `clr` over every pixel of
/// `mptr[0..count]` via `GxColMixMixin`; returns the advanced pointer.
public uint* memBlendColor (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable _; 0..count) { mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); }
  return mptr;
}

/// Dashed alpha-blend (no-SSE fallback): mixes `clr` over even-indexed pixels
/// only, skips odd-indexed ones; `count` counts ALL pixels.
public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable c; 0..count) { if (!(c&1)) mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); else ++mptr; }
  return mptr;
}
}