From 342253c45ae6ca0b3ac3828f80d3db8eaf77ac4b Mon Sep 17 00:00:00 2001 From: ketmar Date: Wed, 24 Nov 2021 01:51:36 +0000 Subject: [PATCH] egra: moved low-level SSE optimised code to separate module; also, optimised SSE code a little more ;-) FossilOrigin-Name: e6d3c9c3c8f54d168f60b92532b945c3e50a5db24f7519305b4110525bfac193 --- egra/gfx/backgl.d | 2 + egra/gfx/backx11.d | 2 + egra/gfx/base.d | 683 +---------------------------------------------- egra/gfx/config.d | 2 + egra/gfx/lowlevel.d | 751 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 758 insertions(+), 682 deletions(-) create mode 100644 egra/gfx/lowlevel.d diff --git a/egra/gfx/backgl.d b/egra/gfx/backgl.d index 0178b00..29297b1 100644 --- a/egra/gfx/backgl.d +++ b/egra/gfx/backgl.d @@ -27,6 +27,7 @@ import iv.cmdcon; import iv.egra.gfx.config; import iv.egra.gfx.base; +import iv.egra.gfx.lowlevel; // ////////////////////////////////////////////////////////////////////////// // @@ -41,6 +42,7 @@ public __gshared uint vArrowTextureId = 0; // ////////////////////////////////////////////////////////////////////////// // shared static this () { import core.stdc.stdlib : malloc; + egfxCheckCPU(); // always allocate additional 16 bytes for SSE routines vglTexBuf = cast(uint*)malloc((cast(uint)VBufWidth*cast(uint)VBufHeight)*vglTexBuf[0].sizeof+16u); if (vglTexBuf is null) { import core.exception : onOutOfMemoryErrorNoGC; onOutOfMemoryErrorNoGC(); } diff --git a/egra/gfx/backx11.d b/egra/gfx/backx11.d index 0f70959..336d17a 100644 --- a/egra/gfx/backx11.d +++ b/egra/gfx/backx11.d @@ -27,6 +27,7 @@ import iv.cmdcongl; import iv.egra.gfx.config; import iv.egra.gfx.base; +import iv.egra.gfx.lowlevel; // ////////////////////////////////////////////////////////////////////////// // @@ -38,6 +39,7 @@ public enum vArrowTextureId = 0; // ////////////////////////////////////////////////////////////////////////// // shared static this () { import core.stdc.stdlib : malloc; + egfxCheckCPU(); // always 
allocate additional 16 bytes for SSE routines vglTexBuf = cast(uint*)malloc((cast(uint)VBufWidth*cast(uint)VBufHeight)*vglTexBuf[0].sizeof+16u); if (vglTexBuf is null) { import core.exception : onOutOfMemoryErrorNoGC; onOutOfMemoryErrorNoGC(); } diff --git a/egra/gfx/base.d b/egra/gfx/base.d index aa34704..0a32a52 100644 --- a/egra/gfx/base.d +++ b/egra/gfx/base.d @@ -24,14 +24,7 @@ import iv.bclamp; import iv.cmdcon; import iv.egra.gfx.config; - - -// ////////////////////////////////////////////////////////////////////////// // -version (DigitalMars) { - version(X86) { - version = EGRA_USE_SSE_ASM; - } -} +import iv.egra.gfx.lowlevel; // ////////////////////////////////////////////////////////////////////////// // @@ -610,40 +603,6 @@ public ubyte gxGetBlue (in uint clr) pure nothrow @safe @nogc { pragma(inline, t public ubyte gxGetAlpha (in uint clr) pure nothrow @safe @nogc { pragma(inline, true); return cast(ubyte)(clr>>24); } -// ////////////////////////////////////////////////////////////////////////// // -// mix `dcvar` with ARGB (or ABGR) `colvar`; dc A is ignored (set to 255) -// main code almost never calls this with solid or transparent `colvar` -// the result will be put to `destvar` (it is written only once, at the end) -// `colvar` and `dcvar` may be read several times -// see http://stereopsis.com/doubleblend.html for the inspiration -version(none) { -// this works for solid and transparent colors too -enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ - immutable uint col_ = `~colvar~`; - immutable uint dc_ = (`~dcvar~`)&0xffffffu; - /*immutable uint a_ = 256-(col_>>24);*/ /* to not loose bits */ - immutable uint a_ = (col_>>24)+1; /* so it will work for both 0 and 255 correctly */ - immutable uint srb_ = (col_&0xff00ffu); - immutable uint sg_ = (col_&0x00ff00u); - immutable uint drb_ = (dc_&0xff00ffu); - immutable uint dg_ = (dc_&0x00ff00u); - immutable uint orb_ = (drb_+(((srb_-drb_)*a_+0x800080u)>>8))&0xff00ffu; - immutable uint 
og_ = (dg_+(((sg_-dg_)*a_+0x008000u)>>8))&0x00ff00u; - (`~destvar~`) = orb_|og_|0xff_00_00_00u; -}`; -} else { -// this works for solid and transparent colors too -enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ - immutable uint a_ = ((`~colvar~`)>>24)+1u; /* to not loose bits */ - uint rb_ = (`~dcvar~`)&0xff00ffu; - uint g_ = (`~dcvar~`)&0x00ff00u; - rb_ += ((cast(uint)((`~colvar~`)&0xff00ffu)-rb_)*a_)>>8; - g_ += ((cast(uint)((`~colvar~`)&0x00ff00u)-g_)*a_)>>8; - /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */ - (`~destvar~`) = (rb_&0xff00ffu)|(g_&0xff_00ff00u)|0xff_00_00_00u; -}`; -} - public uint gxColMix (in uint dc, in uint clr) pure nothrow @trusted @nogc { pragma(inline, true); if (gxIsSolid(clr)) return clr; @@ -1453,643 +1412,3 @@ public void gxDrawLine (int x0, int y0, int x1, int y1, in uint clr, bool lastPo public void gxDrawLine() (in auto ref GxPoint p0, in int x1, in int y1, in uint clr, in bool lastPoint=true) { pragma(inline, true); gxDrawLine(p0.x, p0.y, x1, y1, clr, lastPoint); } public void gxDrawLine() (in int x0, in int y0, auto ref GxPoint p1, in uint clr, in bool lastPoint=true) { pragma(inline, true); gxDrawLine(x0, y0, p1.x, p1.y, clr, lastPoint); } public void gxDrawLine() (in auto ref GxPoint p0, auto ref GxPoint p1, in uint clr, in bool lastPoint=true) { pragma(inline, true); gxDrawLine(p0.x, p0.y, p1.x, p1.y, clr, lastPoint); } - - -// ////////////////////////////////////////////////////////////////////////// // -// size is in dwords -version(EGRA_USE_SSE_ASM) { - -align(16) immutable ubyte[16] sseSpreadOneColor = [ -0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, -0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, -]; - -// for x86 naked functions, DMD will pass last arg in EAX -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC code) -public uint* memFillDW (uint* mptr, in uint 
value, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - cmp EAX,0; - jle done; - - mov ECX,EAX; // ECX=count (because last arg is in EAX) - mov EAX,SS:[ESP+4]; // EAX=value - - cmp ECX,8; - jc simplestore; // too small - - // load XMM0 with our color - push EAX; - movdqu XMM0,SS:[ESP]; - pshufb XMM0,[sseSpreadOneColor]; - add ESP,4; - - // if we cannot align at all, use "rep stosd" - // this should not happen, so i won't bother optimising it - test EDI,0x03; - jnz simplestore; - - // align EDI (we have at least 8 pixels to fill here, so it is safe) -alignloop: - test EDI,0x0f; - jz alignok; - stosd; - dec ECX; - jmp alignloop; - -alignok: - // ECX is never zero here - cmp ECX,4; - jc simplestore; // too small - - // save last 2 bits of counter (we'll mask them later) - movzx EDX,CL; - - // fill by 4 pixels while we can - shr ECX,2; - //align 16; // why not -alignfill: - movaps [EDI],XMM0; - add EDI,16; - dec ECX; - jnz alignfill; - - // fill last 1-3 pixels - mov ECX,EDX; - and CL,0x03; - jz done; - -simplestore: - rep; stosd; - -done: - mov EAX,EDI; // return new mptr - mov EDI,SS:[ESP+8]; // restore EDI - ret 4*2; - } -} - -// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors) -// WARNING! do not call it with fully opaque or fully transparent `clr`! -public alias memBlendColor = sseBlendColor; -/* -public uint* memBlendColor (uint* mptr, in uint clr, int count) nothrow @trusted @nogc { - pragma(inline, true); - version(all) { - if (count < 1) return mptr; - immutable int c4 = (count>>2); // it is actually unsigned - if (c4) { mptr = sseBlendColor4px(mptr, clr, cast(uint)c4); count -= (c4<<2); } - return (count ? 
memBlendColorSlow(mptr, clr, count) : mptr); - } else { - return memBlendColorSlow(mptr, clr, count); - } -} -*/ - - -align(16) immutable ubyte[16] sseSpreadAlpha = [ -0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, -0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, -]; - -align(16) immutable ubyte[16] sseMaxAlpha = [ -0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, -0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, -]; - -align(16) immutable ubyte[16] sseFullByteAlpha = [ -0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, -0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, -]; - -// mix foreground to background -// EAX is pixel count -// background = (alpha * foreground) + (1-alpha)*background -// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors) -// WARNING! do not call it with fully opaque or fully transparent `clr`! -uint* sseBlendColor (uint* dest, uint clr, uint count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - enter 0,0; - // save modified registers - push EDI; - - mov EDI,[EBP+12]; // dest - // it can be negative - cmp EAX,0; - jle done; - mov ECX,EAX; // counter - - // EAX: count - // [EBP+8]: clr - // [EBP+16]: dest - - // align stack - sub ESP,16; - and ESP,0xfffffff0u; - - mov EAX,[EBP+8]; // clr - - // we can premultiply clr first, and convert alpha to 255-alpha - - // prepare SSE data -- 2 pixels - mov SS:[ESP],EAX; - mov SS:[ESP+4],EAX; - - movdqa XMM0,SS:[ESP]; - // expand 8 ubytes to 8 ushorts - pmovzxbw XMM1,XMM0; - // XMM0: xx xx xx xx ar gb ar gb - // XMM1: 0a 0r 0g 0b 0a 0r 0g 0b - pshufb XMM0,[sseSpreadAlpha]; - // XMM0: 00 0a 0a 0a 00 0a 0a 0a - movdqa XMM7,[sseMaxAlpha]; - psubw XMM7,XMM0; // XMM7 is 255-alpha - // XMM7: 00 0a 0a 0a 00 0a 0a 0a - pmulhuw XMM0,XMM1; - // XMM0: 00 0r 0g 0b 00 0r 0g 0b - movdqa XMM6,[sseFullByteAlpha]; - - //XMM0: 2 premultiplied colors - //XMM7: 2 inverted alphas - //XMM6: destination alpha (replace value) - - // totally unaligned? 
- // this should never happen, but meh... - test EDI,0x03; - jnz slowestpath; // alas, the slowest path - - // align the address (if necessary) - test EDI,0x0f; - jz trymix8aligned; - - // we need to mix 1-3 pixels to make the address aligned - // check counter here to allow "slow, but aligned" path (see the code below) - cmp ECX,4; - jc slowestpath; // alas - - // process 4 pixels (we will drop unused ones) - movdqu XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - - // set destination alpha - por XMM1,XMM6; - - // now write 1-3 pixels to align the address - // we are guaranteed to have at least 4 pixels to mix here - // i.e. 
4 processed pixels, and at least 4 pixels in the counter - - // put in temp storage (it is aligned) - movdqa SS:[ESP],XMM1; - mov EDX,ESI; // save ESI (DMD expects it unchanged) - lea ESI,[ESP+4]; -uastoreloop: - movsd; - dec ECX; - test EDI,0x0f; - jnz uastoreloop; - mov ESI,EDX; // restore ESI - // ECX is at least 1 here, and EDI is aligned - -trymix8aligned: - // ECX is never zero here - // use "slow, but aligned" path if we have less than 8 pixels to process - cmp ECX,8; - jc slowalignedpath; - - // save last 3 bits in EAX - // we'll mask it later - movzx EAX,CL; - - // process by 8 pixels while we can - shr ECX,3; - -mix8aligned: - movdqa XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - movdqa XMM5,[EDI+16]; // 4 background pixels - pmovzxbw XMM3,XMM5; // expand 2 lower pixels to XMM3 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM4,XMM5; // expand 2 upper pixels to XMM4 - //XMM3: 2 lower pixels - //XMM4: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - pmulhuw XMM3,XMM7; // multiply by alpha - pmulhuw XMM4,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - paddusw XMM3,XMM0; // add premultiplied colors - paddusw XMM4,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - packuswb XMM3,XMM4; - - // set destination alpha - por XMM1,XMM6; - por XMM3,XMM6; - - movdqa [EDI],XMM1; - movdqa [EDI+16],XMM3; - - add EDI,32; - dec ECX; - jnz mix8aligned; - - // do last 1-7 pixels (last counter is in EAX) - // EDI is guaranteed to be aligned here - mov ECX,EAX; - and CL,0x07; - jnz slowalignedpath; - - // we're done - mov EAX,EDI; - mov EDI,[EBP-4]; // restore EDI - leave; - ret 4*2; - 
- align 16; - // mix by 4 pixels, unaligned -slowestpath: - // mix 4 pixels - movdqu XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - - // set destination alpha - por XMM1,XMM6; - - sub ECX,4; - jc slowestlast; - - movdqu [EDI],XMM1; - add EDI,16; - jecxz done; - jmp slowestpath; - - // last 1-3 pixels (never 0) -slowestlast: - // put in temp storage (it is aligned) - movdqa SS:[ESP],XMM1; - mov EDX,ESI; // save ESI (DMD expects it unchanged) - push ESI; - lea ESI,[ESP+4]; - and ECX,0x03; // left counter - rep; movsd; - mov ESI,EDX; // restore ESI - jmp done; - -done: - mov EAX,EDI; - mov EDI,[EBP-4]; // restore EDI - leave; - ret 4*2; - - - align 16; - // mix by 4 pixels, aligned (used for 1-7 pixels) -slowalignedpath: - // mix 4 pixels - movdqa XMM5,[EDI]; // 4 background pixels - pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 - // copy high part of XMM5 to low part of XMM5 - movhlps XMM5,XMM5; - pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 - //XMM1: 2 lower pixels - //XMM2: 2 upper pixels - - pmulhuw XMM1,XMM7; // multiply by alpha - pmulhuw XMM2,XMM7; // multiply by alpha - - paddusw XMM1,XMM0; // add premultiplied colors - paddusw XMM2,XMM0; // add premultiplied colors - - packuswb XMM1,XMM2; - - // set destination alpha - por XMM1,XMM6; - - sub ECX,4; - jc slowestlast; - - movdqa [EDI],XMM1; - add EDI,16; - jecxz done; - jmp slowalignedpath; - } -} - -// EAX is `count` -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC 
code) -/+ -public uint* memBlendColorSlow (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - cmp EAX,0; - jle quit; - - push EBP; // EBP will contain the counter - push EBX; // EBX is temporary register - push ESI; // DMD expects ESI to be unmodified at exit - mov EBP,EAX; // EBP=counter - - mov EAX,SS:[ESP+16]; // EAX=clr - mov ECX,EAX; // ECX will be clrA - // clrG=clr&0x00ff00u; - and EAX,0x00ff00u; - push EAX; - // clrRB=clr&0xff00ffu; - mov EAX,ECX; - and EAX,0xff00ffu; - push EAX; - // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision - shr ECX,24; - inc ECX; - - // [ESP+0]: clrRB - // [ESP+4]: clrG - // ESI - // EBX - // EBP - // ret addr - // clr - // mptr - // EBP=counter - // EDI=mptr - // ECX=clrA - - align 16; // why not - - /+ - clrA = (clr>>24)+1; - clrRB = clr&0xff00ffu; - clrG = clr&0x00ff00u; - - rb = (*mptr)&0xff00ffu; - rb += ((clrRB-rb)*clrA)>>8; - rb &= 0xff00ffu; - - g = (*mptr)&0x00ff00u; - g += ((clrG-g)*clrA)>>8; - g &= 0x00ff00u; - - *mptr++ = rb|g|0xff000000u; - +/ - - mixloop: - // rb = (*mptr)&0xff00ffu; - // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu; - mov EBX,[EDI]; - mov ESI,EBX; // save `*mptr` - and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu - mov EAX,SS:[ESP]; // EAX=clrRB - sub EAX,EBX; // EAX=clrRB-rb - mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8 - add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8) - and EBX,0xff00ffu; // EAX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu - - // g = (*mptr)&0x00ff00u; - // g += (((clrG-g)*clrA)>>8)&0x00ff00u; - mov EDX,ESI; // EDX=*mptr - and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u - mov ESI,EDX; // save g, we well need it later - mov EAX,SS:[ESP+4]; // EAX=clrG - sub EAX,EDX; // EAX=clrG-g - mul ECX; // EAX=(clrG-g)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrG-g)*clrA)>>8 - add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g - and EAX,0x00ff00u; // 
EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u - - // mix - or EAX,EBX; - or EAX,0xff000000u; - - stosd; - dec EBP; - jnz mixloop; - - add ESP,2*4; // drop temp vars - // restore registers - pop ESI; - pop EBX; - pop EBP; - -quit: - mov EAX,EDI; // result - mov EDI,SS:[ESP+8]; // restore EDI - ret 8; - } -} -+/ - - -// for x86 naked functions, DMD will pass last arg in EAX -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC code) -// this doesn't change every 2nd pixel; `count` is count of ALL pixels -public uint* memFillDWDash (uint* mptr, in uint value, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - // it can be negative - cmp EAX,0; - jle quit; - mov ECX,EAX; // ECX=count (because last arg is in EAX) - mov EAX,SS:[ESP+4]; // EAX=value - - align 16; -simplestore: - stosd; - dec ECX; - jz quit; - add EDI,4; - dec ECX; - jnz simplestore; - -quit: - mov EAX,EDI; // return new mptr - mov EDI,SS:[ESP+8]; // restore EDI - ret 8; - } -} - - -//TODO: rewrite this with SSE -// EAX is `count` -// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right -// we need to preserve ESI and EDI (and EBX in case of PIC code) -// this doesn't change every 2nd pixel; `count` is count of ALL pixels -public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - asm nothrow @trusted @nogc { - naked; - - xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI - cmp EAX,0; - jle quit; - - push EBP; // EBP will contain the counter - push EBX; // EBX is temporary register - push ESI; // DMD expects ESI to be unmodified at exit - mov EBP,EAX; // EBP=counter - - mov EAX,SS:[ESP+16]; // EAX=clr - mov ECX,EAX; // ECX will be clrA - // clrG=clr&0x00ff00u; - and EAX,0x00ff00u; - push EAX; - // clrRB=clr&0xff00ffu; - mov EAX,ECX; - and 
EAX,0xff00ffu; - push EAX; - // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision - shr ECX,24; - inc ECX; - - // [ESP+0]: clrRB - // [ESP+4]: clrG - // ESI - // EBX - // EBP - // ret addr - // clr - // mptr - // EBP=counter - // EDI=mptr - // ECX=clrA - - align 16; // why not - - /+ - clrA = (clr>>24)+1; - clrRB = clr&0xff00ffu; - clrG = clr&0x00ff00u; - - rb = (*mptr)&0xff00ffu; - rb += ((clrRB-rb)*clrA)>>8; - rb &= 0xff00ffu; - - g = (*mptr)&0x00ff00u; - g += ((clrG-g)*clrA)>>8; - g &= 0x00ff00u; - - *mptr++ = rb|g|0xff000000u; - +/ - - mixloop: - // rb = (*mptr)&0xff00ffu; - // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu; - mov EBX,[EDI]; - mov ESI,EBX; // save `*mptr` - and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu - mov EAX,SS:[ESP]; // EAX=clrRB - sub EAX,EBX; // EAX=clrRB-rb - mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8 - add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8) - and EBX,0xff00ffu; // EAX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu - - // g = (*mptr)&0x00ff00u; - // g += (((clrG-g)*clrA)>>8)&0x00ff00u; - mov EDX,ESI; // EDX=*mptr - and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u - mov ESI,EDX; // save g, we well need it later - mov EAX,SS:[ESP+4]; // EAX=clrG - sub EAX,EDX; // EAX=clrG-g - mul ECX; // EAX=(clrG-g)*clrA (EDX is dead) - shr EAX,8; // EAX=((clrG-g)*clrA)>>8 - add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g - and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u - - // mix - or EAX,EBX; - or EAX,0xff000000u; - - stosd; - dec EBP; - jz mixdone; - add EDI,4; - dec EBP; - jnz mixloop; - -mixdone: - add ESP,2*4; // drop temp vars - // restore registers - pop ESI; - pop EBX; - pop EBP; - -quit: - mov EAX,EDI; // result - mov EDI,SS:[ESP+8]; // restore EDI - ret 8; - } -} - -} else { -// no SSE -public uint* memFillDW (uint* ptr, in uint value, in int count) nothrow @trusted @nogc { - pragma(inline, true); - if (count > 0) { - ptr[0..cast(usize)count] = value; - ptr += cast(usize)count; - } - return ptr; -} 
-public uint* memFillDWDash (uint* ptr, in uint value, in int count) nothrow @trusted @nogc { - pragma(inline, true); - foreach (immutable c; 0..count) { if (!(c&1)) *ptr++ = value; else ++ptr; } - if (count > 0) { - ptr[0..cast(usize)count] = value; - ptr += cast(usize)count; - } - return ptr; -} -public uint* memBlendColor (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - foreach (immutable _; 0..count) { mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); } - return mptr; -} -public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc { - foreach (immutable c; 0..count) { if (!(c&1)) mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); else ++mptr; } - return mptr; -} -} diff --git a/egra/gfx/config.d b/egra/gfx/config.d index 08e75b8..48e6654 100644 --- a/egra/gfx/config.d +++ b/egra/gfx/config.d @@ -18,8 +18,10 @@ */ module iv.egra.gfx.config /*is aliced*/; +// uncomment this to use OpenGL backend instead of X11 //version = egfx_opengl_backend; + version(egfx_opengl_backend) { public enum EGfxOpenGLBackend = true; } else { diff --git a/egra/gfx/lowlevel.d b/egra/gfx/lowlevel.d new file mode 100644 index 0000000..27f32c2 --- /dev/null +++ b/egra/gfx/lowlevel.d @@ -0,0 +1,751 @@ +/* + * Simple Framebuffer Gfx/GUI lib + * + * coded by Ketmar // Invisible Vector + * Understanding is not required. Only obedience. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License ONLY. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +module iv.egra.gfx.lowlevel /*is aliced*/; +private: + +// uncomment this to disable SSE4.1 optimisations +//version = egfx_disable_sse41; + + +version(egfx_disable_sse41) { + version(egfx_use_sse41) { + static assert(false, "EGRA: SSE4.1 is both forced and disabled. wtf?!"); + } +} else { + version(DigitalMars) { + version(X86) { + version = egfx_use_sse41; + } else { + version(egfx_use_sse41) { + static assert(false, "EGRA: SSE4.1 is not supported on 64-bit architectures."); + } + } + } else { + version(egfx_use_sse41) { + static assert(false, "EGRA: SSE4.1 is not supported on non-DMD compilers."); + } + } +} + +version(egfx_use_sse41) { + public enum EGfxUseSSE41 = true; +} else { + public enum EGfxUseSSE41 = false; +} + + +// ////////////////////////////////////////////////////////////////////////// // +public void egfxCheckCPU () nothrow @trusted @nogc { + version(egfx_use_sse41) { + import core.cpuid : sse41; + if (!sse41) { + import core.stdc.stdio : stderr, fprintf; + fprintf(stderr, "ERROR: EGRA requires CPU with SSE4.1 support!"); + assert(0, "ERROR: EGRA requires CPU with SSE4.1 support!"); + } + } +} + + +// ////////////////////////////////////////////////////////////////////////// // +// mix `dcvar` with ARGB (or ABGR) `colvar`; dc A is ignored (set to 255) +// main code almost never calls this with solid or transparent `colvar` +// the result will be put to `destvar` (it is written only once, at the end) +// `colvar` and `dcvar` may be read several times +// see http://stereopsis.com/doubleblend.html for the inspiration +version(none) { +// this works for solid and transparent colors too +public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ + immutable uint col_ = `~colvar~`; + immutable uint dc_ = (`~dcvar~`)&0xffffffu; + /*immutable uint a_ = 256-(col_>>24);*/ /* to not loose bits */ + immutable uint a_ = (col_>>24)+1; /* so it will work for both 0 and 255 correctly */ + immutable uint srb_ = (col_&0xff00ffu); + 
immutable uint sg_ = (col_&0x00ff00u); + immutable uint drb_ = (dc_&0xff00ffu); + immutable uint dg_ = (dc_&0x00ff00u); + immutable uint orb_ = (drb_+(((srb_-drb_)*a_+0x800080u)>>8))&0xff00ffu; + immutable uint og_ = (dg_+(((sg_-dg_)*a_+0x008000u)>>8))&0x00ff00u; + (`~destvar~`) = orb_|og_|0xff_00_00_00u; +}`; +} else { +// this works for solid and transparent colors too +public enum GxColMixMixin(string destvar, string dcvar, string colvar) = `{ + immutable uint a_ = ((`~colvar~`)>>24)+1u; /* to not loose bits */ + uint rb_ = (`~dcvar~`)&0xff00ffu; + uint g_ = (`~dcvar~`)&0x00ff00u; + rb_ += ((cast(uint)((`~colvar~`)&0xff00ffu)-rb_)*a_)>>8; + g_ += ((cast(uint)((`~colvar~`)&0x00ff00u)-g_)*a_)>>8; + /* g is mixed with solid alpha; replace "0xff_" with other alpha if you want to */ + (`~destvar~`) = (rb_&0xff00ffu)|(g_&0xff_00ff00u)|0xff_00_00_00u; +}`; +} + + +// ////////////////////////////////////////////////////////////////////////// // +// size is in dwords +version(egfx_use_sse41) { + //pragma(msg," !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
"); + +/* +align(16) immutable ubyte[16] sseSpreadOneColor = [ +0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, +0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, +]; +*/ + +// for x86 naked functions, DMD will pass last arg in EAX +// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right +// we need to preserve ESI and EDI (and EBX in case of PIC code) +public uint* memFillDW (uint* mptr, in uint value, in int count) nothrow @trusted @nogc { + asm nothrow @trusted @nogc { + naked; + xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI + cmp EAX,0; + jle done; + + mov ECX,EAX; // ECX=count (because last arg is in EAX) + mov EAX,SS:[ESP+4]; // EAX=value + + cmp ECX,8; + jc simplestore; // too small + + // load XMM0 with our color + push EAX; + push EAX; + // used `movdqu`, because it indicates int type + // this doesn't matter, it just looks nicer + // also, `movlps` is one byte shorter + movlps XMM0,SS:[ESP]; + movlhps XMM0,XMM0; // copy low 64 bits of XMM0 to high 64 bits of XMM0 + //movdqu XMM0,SS:[ESP]; + //pshufb XMM0,[sseSpreadOneColor]; + add ESP,8; + + // if we cannot align at all, use "rep stosd" + // this should not happen, so i won't bother optimising it + test EDI,0x03; + jnz simplestore; + + // align EDI (we have at least 8 pixels to fill here, so it is safe) +alignloop: + test EDI,0x0f; + jz alignok; + stosd; + dec ECX; + jmp alignloop; + +alignok: + // ECX is never zero here + cmp ECX,4; + jc simplestore; // too small + + // save last 2 bits of counter (we'll mask them later) + movzx EDX,CL; + + // fill by 4 pixels while we can + shr ECX,2; + //align 16; // why not +alignfill: + movaps [EDI],XMM0; + add EDI,16; + dec ECX; + jnz alignfill; + + // fill last 1-3 pixels + mov ECX,EDX; + and CL,0x03; + jz done; + +simplestore: + rep; stosd; + +done: + mov EAX,EDI; // return new mptr + mov EDI,SS:[ESP+8]; // restore EDI + ret 4*2; + } +} + +// WARNING! 
this function is not quite right (0 and 255 alphas will still modify the colors) +// WARNING! do not call it with fully opaque or fully transparent `clr`! +public alias memBlendColor = sseBlendColor; +/* +public uint* memBlendColor (uint* mptr, in uint clr, int count) nothrow @trusted @nogc { + pragma(inline, true); + version(all) { + if (count < 1) return mptr; + immutable int c4 = (count>>2); // it is actually unsigned + if (c4) { mptr = sseBlendColor4px(mptr, clr, cast(uint)c4); count -= (c4<<2); } + return (count ? memBlendColorSlow(mptr, clr, count) : mptr); + } else { + return memBlendColorSlow(mptr, clr, count); + } +} +*/ + + +align(16) immutable ubyte[16] sseSpreadAlpha = [ +0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, +0xff, 0x03, 0xff, 0x03, 0xff, 0x03, 0xff, 0xff, +]; + +align(16) immutable ubyte[16] sseMaxAlpha = [ +0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, +0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0x00, +]; + +align(16) immutable ubyte[16] sseFullByteAlpha = [ +0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, +0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, +]; + +// mix foreground to background +// EAX is pixel count +// background = (alpha * foreground) + (1-alpha)*background +// WARNING! this function is not quite right (0 and 255 alphas will still modify the colors) +// WARNING! do not call it with fully opaque or fully transparent `clr`! 
+public uint* sseBlendColor (uint* dest, uint clr, uint count) nothrow @trusted @nogc { + asm nothrow @trusted @nogc { + naked; + enter 0,0; + // save modified registers + push EDI; + + mov EDI,[EBP+12]; // dest + // it can be negative + cmp EAX,0; + jle done; + mov ECX,EAX; // counter + + // EAX: count + // [EBP+8]: clr + // [EBP+16]: dest + + // align stack + sub ESP,16; + and ESP,0xfffffff0u; + + mov EAX,[EBP+8]; // clr + + // we can premultiply clr first, and convert alpha to 255-alpha + + // prepare SSE data -- 2 pixels + mov SS:[ESP],EAX; + mov SS:[ESP+4],EAX; + + // used `movdqa`, because it indicates int type + // this doesn't matter, it just looks nicer + // also, `movlps` is one byte shorter + movlps XMM0,SS:[ESP]; + //movdqa XMM0,SS:[ESP]; + // expand 8 ubytes to 8 ushorts + pmovzxbw XMM1,XMM0; + // XMM0: xx xx xx xx ar gb ar gb + // XMM1: 0a 0r 0g 0b 0a 0r 0g 0b + pshufb XMM0,[sseSpreadAlpha]; + // XMM0: 00 0a 0a 0a 00 0a 0a 0a + movdqa XMM7,[sseMaxAlpha]; + psubw XMM7,XMM0; // XMM7 is 255-alpha + // XMM7: 00 0a 0a 0a 00 0a 0a 0a + pmulhuw XMM0,XMM1; + // XMM0: 00 0r 0g 0b 00 0r 0g 0b + movdqa XMM6,[sseFullByteAlpha]; + + //XMM0: 2 premultiplied colors + //XMM7: 2 inverted alphas + //XMM6: destination alpha (replace value) + + // totally unaligned? + // this should never happen, but meh... 
+ test EDI,0x03; + jnz slowestpath; // alas, the slowest path + + // align the address (if necessary) + test EDI,0x0f; + jz trymix8aligned; + + // we need to mix 1-3 pixels to make the address aligned + // check counter here to allow "slow, but aligned" path (see the code below) + cmp ECX,4; + jc slowestpath; // alas + + // process 4 pixels (we will drop unused ones) + movdqu XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + + // set destination alpha + por XMM1,XMM6; + + // now write 1-3 pixels to align the address + // we are guaranteed to have at least 4 pixels to mix here + // i.e. 
4 processed pixels, and at least 4 pixels in the counter + + // put in temp storage (it is aligned) + movdqa SS:[ESP],XMM1; + mov EDX,ESI; // save ESI (DMD expects it unchanged) + lea ESI,[ESP+4]; +uastoreloop: + movsd; + dec ECX; + test EDI,0x0f; + jnz uastoreloop; + mov ESI,EDX; // restore ESI + // ECX is at least 1 here, and EDI is aligned + +trymix8aligned: + // ECX is never zero here + // use "slow, but aligned" path if we have less than 8 pixels to process + cmp ECX,8; + jc slowalignedpath; + + // save last 3 bits in EAX + // we'll mask it later + movzx EAX,CL; + + // process by 8 pixels while we can + shr ECX,3; + +mix8aligned: + movdqa XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + movdqa XMM5,[EDI+16]; // 4 background pixels + pmovzxbw XMM3,XMM5; // expand 2 lower pixels to XMM3 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM4,XMM5; // expand 2 upper pixels to XMM4 + //XMM3: 2 lower pixels + //XMM4: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + pmulhuw XMM3,XMM7; // multiply by alpha + pmulhuw XMM4,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + paddusw XMM3,XMM0; // add premultiplied colors + paddusw XMM4,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + packuswb XMM3,XMM4; + + // set destination alpha + por XMM1,XMM6; + por XMM3,XMM6; + + movdqa [EDI],XMM1; + movdqa [EDI+16],XMM3; + + add EDI,32; + dec ECX; + jnz mix8aligned; + + // do last 1-7 pixels (last counter is in EAX) + // EDI is guaranteed to be aligned here + mov ECX,EAX; + and CL,0x07; + jnz slowalignedpath; + + // we're done + mov EAX,EDI; + mov EDI,[EBP-4]; // restore EDI + leave; + ret 4*2; + 
+ align 16; + // mix by 4 pixels, unaligned +slowestpath: + // mix 4 pixels + movdqu XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + + // set destination alpha + por XMM1,XMM6; + + sub ECX,4; + jc slowestlast; + + movdqu [EDI],XMM1; + add EDI,16; + jecxz done; + jmp slowestpath; + + // last 1-3 pixels (never 0) +slowestlast: + // put in temp storage (it is aligned) + movdqa SS:[ESP],XMM1; + mov EDX,ESI; // save ESI (DMD expects it unchanged) + push ESI; + lea ESI,[ESP+4]; + and ECX,0x03; // left counter + rep; movsd; + mov ESI,EDX; // restore ESI + jmp done; + +done: + mov EAX,EDI; + mov EDI,[EBP-4]; // restore EDI + leave; + ret 4*2; + + + align 16; + // mix by 4 pixels, aligned (used for 1-7 pixels) +slowalignedpath: + // mix 4 pixels + movdqa XMM5,[EDI]; // 4 background pixels + pmovzxbw XMM1,XMM5; // expand 2 lower pixels to XMM1 + // copy high part of XMM5 to low part of XMM5 + movhlps XMM5,XMM5; + pmovzxbw XMM2,XMM5; // expand 2 upper pixels to XMM2 + //XMM1: 2 lower pixels + //XMM2: 2 upper pixels + + pmulhuw XMM1,XMM7; // multiply by alpha + pmulhuw XMM2,XMM7; // multiply by alpha + + paddusw XMM1,XMM0; // add premultiplied colors + paddusw XMM2,XMM0; // add premultiplied colors + + packuswb XMM1,XMM2; + + // set destination alpha + por XMM1,XMM6; + + sub ECX,4; + jc slowestlast; + + movdqa [EDI],XMM1; + add EDI,16; + jecxz done; + jmp slowalignedpath; + } +} + +// EAX is `count` +// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right +// we need to preserve ESI and EDI (and EBX in case of PIC 
code)
/+
public uint* memBlendColorSlow (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;

    xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle quit;

    push EBP; // EBP will contain the counter
    push EBX; // EBX is temporary register
    push ESI; // DMD expects ESI to be unmodified at exit
    mov EBP,EAX; // EBP=counter

    mov EAX,SS:[ESP+16]; // EAX=clr
    mov ECX,EAX; // ECX will be clrA
    // clrG=clr&0x00ff00u;
    and EAX,0x00ff00u;
    push EAX;
    // clrRB=clr&0xff00ffu;
    mov EAX,ECX;
    and EAX,0xff00ffu;
    push EAX;
    // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision
    shr ECX,24;
    inc ECX;

    // [ESP+0]: clrRB
    // [ESP+4]: clrG
    // ESI
    // EBX
    // EBP
    // ret addr
    // clr
    // mptr
    // EBP=counter
    // EDI=mptr
    // ECX=clrA

    align 16; // why not

    /+
      clrA = (clr>>24)+1;
      clrRB = clr&0xff00ffu;
      clrG = clr&0x00ff00u;

      rb = (*mptr)&0xff00ffu;
      rb += ((clrRB-rb)*clrA)>>8;
      rb &= 0xff00ffu;

      g = (*mptr)&0x00ff00u;
      g += ((clrG-g)*clrA)>>8;
      g &= 0x00ff00u;

      *mptr++ = rb|g|0xff000000u;
    +/

  mixloop:
    // rb = (*mptr)&0xff00ffu;
    // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu;
    mov EBX,[EDI];
    mov ESI,EBX; // save `*mptr`
    and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu
    mov EAX,SS:[ESP]; // EAX=clrRB
    sub EAX,EBX; // EAX=clrRB-rb
    mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8
    add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8)
    and EBX,0xff00ffu; // EBX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu

    // g = (*mptr)&0x00ff00u;
    // g += (((clrG-g)*clrA)>>8)&0x00ff00u;
    mov EDX,ESI; // EDX=*mptr
    and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u
    mov ESI,EDX; // save g, we will need it later
    mov EAX,SS:[ESP+4]; // EAX=clrG
    sub EAX,EDX; // EAX=clrG-g
    mul ECX; // EAX=(clrG-g)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrG-g)*clrA)>>8
    add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g
    and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u

    // mix
    or EAX,EBX;
    or EAX,0xff000000u;

    stosd;
    dec EBP;
    jnz mixloop;

    add ESP,2*4; // drop temp vars
    // restore registers
    pop ESI;
    pop EBX;
    pop EBP;

quit:
    mov EAX,EDI; // result
    mov EDI,SS:[ESP+8]; // restore EDI
    ret 8;
  }
}
+/


/// Dashed solid fill: writes `value` into every even-indexed dword of
/// `mptr[0..count]` and leaves odd-indexed dwords untouched; `count` is the
/// count of ALL pixels, not only of the written ones.
/// Returns the pointer advanced past the processed span (new `mptr`).
// for x86 naked functions, DMD will pass last arg in EAX
// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right
// we need to preserve ESI and EDI (and EBX in case of PIC code)
// this doesn't change every 2nd pixel; `count` is count of ALL pixels
public uint* memFillDWDash (uint* mptr, in uint value, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;
    xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI
    // it can be negative
    cmp EAX,0;
    jle quit;
    mov ECX,EAX; // ECX=count (because last arg is in EAX)
    mov EAX,SS:[ESP+4]; // EAX=value

    align 16;
simplestore:
    stosd; // write one dword...
    dec ECX;
    jz quit;
    add EDI,4; // ...and skip the next one
    dec ECX;
    jnz simplestore;

quit:
    mov EAX,EDI; // return new mptr
    mov EDI,SS:[ESP+8]; // restore EDI
    ret 8;
  }
}


/// Dashed alpha-blend: mixes `clr` (with its alpha, `+1` for precision) over
/// every even-indexed pixel of `mptr[0..count]`, skips odd-indexed pixels,
/// and forces the destination alpha to 0xff.  `count` counts ALL pixels.
/// Returns the pointer advanced past the processed span (new `mptr`).
//TODO: rewrite this with SSE
// EAX is `count`
// sadly, with -O DMD makes some assumptions about dead registers, and nothing is working right
// we need to preserve ESI and EDI (and EBX in case of PIC code)
// this doesn't change every 2nd pixel; `count` is count of ALL pixels
public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  asm nothrow @trusted @nogc {
    naked;

    xchg EDI,SS:[ESP+8]; // EDI=mptr; also, save old EDI
    cmp EAX,0;
    jle quit;

    push EBP; // EBP will contain the counter
    push EBX; // EBX is temporary register
    push ESI; // DMD expects ESI to be unmodified at exit
    mov EBP,EAX; // EBP=counter

    mov EAX,SS:[ESP+16]; // EAX=clr
    mov ECX,EAX; // ECX will be clrA
    // clrG=clr&0x00ff00u;
    and EAX,0x00ff00u;
    push EAX;
    // clrRB=clr&0xff00ffu;
    mov EAX,ECX;
    and EAX,0xff00ffu;
    push EAX;
    // ECX=clrA=(clr>>24)+1; -- `+1` to keep some precision
    shr ECX,24;
    inc ECX;

    // stack/register layout at this point:
    // [ESP+0]: clrRB
    // [ESP+4]: clrG
    // ESI
    // EBX
    // EBP
    // ret addr
    // clr
    // mptr
    // EBP=counter
    // EDI=mptr
    // ECX=clrA

    align 16; // why not

    /+
      clrA = (clr>>24)+1;
      clrRB = clr&0xff00ffu;
      clrG = clr&0x00ff00u;

      rb = (*mptr)&0xff00ffu;
      rb += ((clrRB-rb)*clrA)>>8;
      rb &= 0xff00ffu;

      g = (*mptr)&0x00ff00u;
      g += ((clrG-g)*clrA)>>8;
      g &= 0x00ff00u;

      *mptr++ = rb|g|0xff000000u;
    +/

  mixloop:
    // rb = (*mptr)&0xff00ffu;
    // rb += (((clrRB-rb)*clrA)>>8)&0xff00ffu;
    mov EBX,[EDI];
    mov ESI,EBX; // save `*mptr`
    and EBX,0xff00ffu; // EBX=rb=(*mptr)&0xff00ffu
    mov EAX,SS:[ESP]; // EAX=clrRB
    sub EAX,EBX; // EAX=clrRB-rb
    mul ECX; // EAX=(clrRB-rb)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrRB-rb)*clrA)>>8
    add EBX,EAX; // EBX=rb+(((clrRB-rb)*clrA)>>8)
    and EBX,0xff00ffu; // EBX=(rb+(((clrRB-rb)*clrA)>>8))&0xff00ffu

    // g = (*mptr)&0x00ff00u;
    // g += (((clrG-g)*clrA)>>8)&0x00ff00u;
    mov EDX,ESI; // EDX=*mptr
    and EDX,0x00ff00u; // EDX=g=(*mptr)&0x00ff00u
    mov ESI,EDX; // save g, we will need it later
    mov EAX,SS:[ESP+4]; // EAX=clrG
    sub EAX,EDX; // EAX=clrG-g
    mul ECX; // EAX=(clrG-g)*clrA (EDX is dead)
    shr EAX,8; // EAX=((clrG-g)*clrA)>>8
    add EAX,ESI; // EAX=(((clrG-g)*clrA)>>8)+g
    and EAX,0x00ff00u; // EAX=((((clrG-g)*clrA)>>8)+g)&0x00ff00u

    // mix
    or EAX,EBX;
    or EAX,0xff000000u;

    stosd; // store blended pixel...
    dec EBP;
    jz mixdone;
    add EDI,4; // ...and skip the next one (dash)
    dec EBP;
    jnz mixloop;

mixdone:
    add ESP,2*4; // drop temp vars
    // restore registers
    pop ESI;
    pop EBX;
    pop EBP;

quit:
    mov EAX,EDI; // result
    mov EDI,SS:[ESP+8]; // restore EDI
    ret 8;
  }
}

} else {
// no SSE
/// Solid fill: `ptr[0..count] = value`; returns pointer advanced by `count`.
/// Negative or zero `count` is a no-op (matches the asm versions).
public uint* memFillDW (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  if (count > 0) {
    ptr[0..cast(usize)count] = value;
    ptr += cast(usize)count;
  }
  return ptr;
}
/// Dashed solid fill (no-SSE fallback): writes `value` into every even-indexed
/// dword of `ptr[0..count]`, leaves odd-indexed dwords untouched; `count` is
/// the count of ALL pixels.  Returns the pointer advanced past the processed
/// span, mirroring the SSE/asm `memFillDWDash`.
public uint* memFillDWDash (uint* ptr, in uint value, in int count) nothrow @trusted @nogc {
  pragma(inline, true);
  foreach (immutable c; 0..count) { if (!(c&1)) *ptr++ = value; else ++ptr; }
  // BUGFIX: the previous version also did a solid `ptr[0..count] = value`
  // fill here -- a copy-paste leftover from `memFillDW` that clobbered
  // `count` extra dwords past the dashed span and returned a pointer
  // advanced by 2*count, diverging from the asm implementation.
  return ptr;
}

/// Alpha-blend (no-SSE fallback): mixes `clr` over every pixel of
/// `mptr[0..count]` via `GxColMixMixin`; returns the advanced pointer.
public uint* memBlendColor (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable _; 0..count) { mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); }
  return mptr;
}

/// Dashed alpha-blend (no-SSE fallback): mixes `clr` over even-indexed pixels
/// only, skips odd-indexed ones; `count` counts ALL pixels.
public uint* memBlendColorDash (uint* mptr, in uint clr, in int count) nothrow @trusted @nogc {
  foreach (immutable c; 0..count) { if (!(c&1)) mixin(GxColMixMixin!("*mptr++", "*mptr", "clr")); else ++mptr; }
  return mptr;
}
}