/*
 * optimized memcpy for x86_64 with AVX and ERMSB.
 * beats musl 0.9.11, 1.1.24 C&ASM version,
 * beats cosmopolitan for all sizes
 * tested with GCC 6.5.0 -O3 -march=native.
 *
 * (C) 2021 rofl0r - licensed under the standard MIT license,
 * as shipped with musl.
 */
/*
 * Access types used by the copy macros in this file.
 *
 * The digit in each name is the width in BYTES (u16 = 16 bytes, u8 = 8 bytes,
 * ...), not in bits.
 *
 * - __may_alias__ exempts accesses through these types from type-based alias
 *   analysis, so arbitrary memory can be read/written through them.
 * - aligned(1) lowers the required alignment to one byte, so dereferencing a
 *   pointer to these types at an arbitrary address is not a misaligned access.
 */

/* 16-byte vector of uint8_t. */
typedef uint8_t u16 __attribute__((vector_size(16), aligned(1), __may_alias__));
/* 8-byte scalar. */
typedef uint64_t u8 __attribute__((aligned(1), __may_alias__));
/* 4-byte scalar. */
typedef uint32_t u4 __attribute__((aligned(1), __may_alias__));
/* 2-byte scalar. */
typedef uint16_t u2 __attribute__((aligned(1), __may_alias__));
/* 1-byte scalar (already byte-aligned by nature). */
typedef uint8_t u1 __attribute__((__may_alias__));
/*
 * Block-copy primitives.
 *
 * COPY16(D, S)  copies 16 bytes from S to D.
 * COPY32(D, S)  copies 32 bytes from S to D using the 256-bit unaligned
 *               load/store builtins (only defined in the modern-compiler
 *               branch).
 * UNREACHABLE() marks a switch default that cannot be taken.
 *
 * NOTE(review): this listing showed both definition sets back to back with no
 * conditional, which redefines COPY16 and UNREACHABLE with different bodies.
 * The guard below is reconstructed from the gcc 3.4.6 remark - confirm the
 * exact condition against upstream.
 */
#if defined(__GNUC__) && __GNUC__ < 4
/* gcc 3.4.6 generates movaps instead of movups, ignoring the aligned(1) */
#define COPY16(D, S) __builtin_ia32_storeups((void*)(D), __builtin_ia32_loadups((void*)(S)))
#define UNREACHABLE() break
#else
#define COPY32(D, S) do { __builtin_ia32_storedqu256((void*)(D), __builtin_ia32_loaddqu256((void*)(S))); } while(0)
#define COPY16(D, S) do { *(u16*)(D) = *(u16*)(S); } while(0)
#define UNREACHABLE() __builtin_unreachable()
#endif

/*
 * OPn(N): copy n bytes from s+N to d+N with a single load/store through the
 * n-byte access type. These expand to uses of `d` and `s`, which must be in
 * scope at the point of use (presumably byte pointers to destination and
 * source - see the caller). N is parenthesized so expression arguments
 * expand safely.
 */
#define OP8(N) do { *(u8*)(d+(N)) = *(u8*)(s+(N)); } while(0)
#define OP4(N) do { *(u4*)(d+(N)) = *(u4*)(s+(N)); } while(0)
#define OP2(N) do { *(u2*)(d+(N)) = *(u2*)(s+(N)); } while(0)
#define OP1(N) do { *(u1*)(d+(N)) = *(u1*)(s+(N)); } while(0)
35 void *mymemcpy(void *__restrict dest
, const void *__restrict src
, size_t count
)
43 case 15: { OP8(0); OP4(8); OP2(12); OP1(14); } break;
44 case 14: { OP8(0); OP4(8); OP2(12); } break;
45 case 13: { OP8(0); OP4(8); OP1(12); } break;
46 case 12: { OP8(0); OP4(8); } break;
47 case 11: { OP8(0); OP2(8); OP1(10);} break;
48 case 10: { OP8(0); OP2(8); } break;
49 case 9: { OP8(0); OP1(8); } break;
50 case 8: { OP8(0); } break;
51 case 7: { OP4(0); OP2(4); OP1(6);} break;
52 case 6: { OP4(0); OP2(4); } break;
53 case 5: { OP4(0); OP1(4); } break;
54 case 4: { OP4(0); } break;
55 case 3: { OP2(0); OP1(2); } break;
56 case 2: { OP2(0); } break;
57 case 1: { OP1(0); } break;
59 default: UNREACHABLE();
62 if(n
>= 1024 && n
<= 128*1024) {
66 __asm__
__volatile__("cld ; rep movsq" : "=S"(s
), "=D"(d
) : "S"(s
),"D"(d
),"c"(n
) : "memory");
76 for( ; c
< n
; c
+= 32) COPY32(d
+c
, s
+c
);