2 * optimized memcpy for x86_64 with AVX.
3 * beats musl 0.9.11, 1.1.24 C&ASM version,
4 * beats cosmopolitan for all sizes
5 * tested with GCC 6.5.0 -O2 -mavx.
7 * (C) 2021 rofl0r - licensed under the standard MIT license,
8 * as shipped with musl.
/*
 * Access types used by the copy macros. The numeric suffix is the width of
 * the type in bytes (u16 = 16 bytes, u8 = 8 bytes, ...), not in bits.
 * Every type carries __may_alias__ so that loads/stores through casted
 * pointers of these types are exempt from strict-aliasing assumptions.
 */
/* u16: vector of 16 uint8_t lanes (16 bytes); aligned(1) requests byte
 * alignment so the type can be used on unaligned addresses. */
14 typedef uint8_t u16
__attribute__ ((vector_size (16), aligned (1), __may_alias__
));
/* u8: 8-byte scalar (uint64_t). */
15 typedef uint64_t u8
__attribute__((__may_alias__
));
/* u4: 4-byte scalar (uint32_t). */
16 typedef uint32_t u4
__attribute__((__may_alias__
));
/* u2: 2-byte scalar (uint16_t). */
17 typedef uint16_t u2
__attribute__((__may_alias__
));
/* u1: single byte (uint8_t). */
18 typedef uint8_t u1
__attribute__((__may_alias__
));
/*
 * COPY16(D, S) copies 16 bytes from S to D; UNREACHABLE() marks a switch
 * arm that should never execute. Two alternative definitions of each follow.
 * NOTE(review): presumably a compiler-version check selects between them -
 * confirm against the surrounding preprocessor conditionals.
 */
/* Variant 1: explicit unaligned load/store builtins; UNREACHABLE() is a
 * plain `break`. */
21 /* gcc 3.4.6 generates movaps instead of movups, ignoring the aligned(1) */
22 #define COPY16(D, S) __builtin_ia32_storeups((void*)(D), __builtin_ia32_loadups((void*)(S)))
23 #define UNREACHABLE() break
/* Variant 2: 16-byte copy expressed as an assignment through u16 pointers;
 * UNREACHABLE() expands to __builtin_unreachable(). */
25 #define COPY16(D, S) do { *(u16*)(D) = *(u16*)(S); } while(0)
26 #define UNREACHABLE() __builtin_unreachable()
/* COPY32(D, S): copies 32 bytes from S to D using the 256-bit unaligned
 * load/store builtins (loaddqu256/storedqu256). */
30 #define COPY32(D, S) do { __builtin_ia32_storedqu256((void*)(D), __builtin_ia32_loaddqu256((void*)(S))); } while(0)
/*
 * OPk(N): copy one k-byte scalar from s+N to d+N. These macros are not
 * self-contained: they reference variables named `d` and `s` that must
 * exist in the scope where they are expanded.
 */
33 #define OP8(N) do {*(u8*)(d+N) = *(u8*)(s+N); } while(0)
34 #define OP4(N) do {*(u4*)(d+N) = *(u4*)(s+N); } while(0)
35 #define OP2(N) do {*(u2*)(d+N) = *(u2*)(s+N); } while(0)
36 #define OP1(N) do {*(u1*)(d+N) = *(u1*)(s+N); } while(0)
38 void *mymemcpy(void *__restrict dest
, const void *__restrict src
, size_t count
)
40 register u1
* d
/*__asm__("rdi")*/ = dest
;
41 register const u1
* s
/*__asm__("rsi")*/ = src
;
42 register size_t n
/*__asm__("rdx")*/ = count
;
43 register size_t c
/*__asm__("rcx")*/ = n
& 0xf;
46 case 15: { OP8(0); OP4(8); OP2(12); OP1(14); } break;
47 case 14: { OP8(0); OP4(8); OP2(12); } break;
48 case 13: { OP8(0); OP4(8); OP1(12); } break;
49 case 12: { OP8(0); OP4(8); } break;
50 case 11: { OP8(0); OP2(8); OP1(10);} break;
51 case 10: { OP8(0); OP2(8); } break;
52 case 9: { OP8(0); OP1(8); } break;
53 case 8: { OP8(0); } break;
54 case 7: { OP4(0); OP2(4); OP1(6);} break;
55 case 6: { OP4(0); OP2(4); } break;
56 case 5: { OP4(0); OP1(4); } break;
57 case 4: { OP4(0); } break;
58 case 3: { OP2(0); OP1(2); } break;
59 case 2: { OP2(0); } break;
60 case 1: { OP1(0); } break;
62 default: UNREACHABLE();
71 for( ; c
< n
; c
+= 32) COPY32(d
+c
, s
+c
);