add amd memcpy
[rofl0r-memcpy-test.git] / rofl_ult.c
blob2d4adb945ed4a9436fca3cfab1408fd9919513d0
1 /*
2 * optimized memcpy for x86_64 with AVX and ERMSB.
3 * beats musl 0.9.11, 1.1.24 C&ASM version,
4 * beats cosmopolitan for all sizes
5 * tested with GCC 6.5.0 -O3 -march=native.
7 * (C) 2021 rofl0r - licensed under the standard MIT license,
8 * as shipped with musl.
9 */
11 #include <string.h>
12 #include <stdint.h>
/*
 * Access types, named after their size in BYTES (u8 is 8 bytes wide, not
 * 8 bits). All of them are may_alias so they can be laid over arbitrary
 * caller memory, and aligned(1) because the copy routine dereferences them
 * at addresses with no alignment guarantee.
 */
typedef uint8_t u16 __attribute__ ((vector_size (16), aligned (1), __may_alias__));
typedef uint64_t u8 __attribute__((aligned (1), __may_alias__));
typedef uint32_t u4 __attribute__((aligned (1), __may_alias__));
typedef uint16_t u2 __attribute__((aligned (1), __may_alias__));
typedef uint8_t u1 __attribute__((__may_alias__));

#if __GNUC__ +0 <= 3
/* gcc 3.4.6 generates movaps instead of movups, ignoring the aligned(1) */
#define COPY16(D, S) __builtin_ia32_storeups((void*)(D), __builtin_ia32_loadups((void*)(S)))
#define UNREACHABLE() break
#else
#if defined(__AVX__) && !defined(__clang__)
/* AVX enabled on GCC: keep the explicit unaligned 256-bit load/store builtins. */
#define COPY32(D, S) do { __builtin_ia32_storedqu256((void*)(D), __builtin_ia32_loaddqu256((void*)(S))); } while(0)
#else
/*
 * The builtins above are rejected when AVX is not enabled (and are not
 * provided by clang), so fall back to a plain unaligned 32-byte vector
 * assignment and let the compiler pick the instructions.
 */
typedef uint8_t u32 __attribute__ ((vector_size (32), aligned (1), __may_alias__));
#define COPY32(D, S) do { *(u32*)(D) = *(const u32*)(S); } while(0)
#endif
#define COPY16(D, S) do { *(u16*)(D) = *(const u16*)(S); } while(0)
#define UNREACHABLE() __builtin_unreachable()
#endif

/*
 * OPk(N): copy k bytes from s+N to d+N.
 * These expand to references to the locals `d` and `s` of the enclosing
 * function, so they are only usable inside mymemcpy().
 */
#define OP8(N) do {*(u8*)(d+(N)) = *(const u8*)(s+(N)); } while(0)
#define OP4(N) do {*(u4*)(d+(N)) = *(const u4*)(s+(N)); } while(0)
#define OP2(N) do {*(u2*)(d+(N)) = *(const u2*)(s+(N)); } while(0)
#define OP1(N) do {*(u1*)(d+(N)) = *(const u1*)(s+(N)); } while(0)

/*
 * Copy `count` bytes from `src` to `dest`; the regions must not overlap
 * (both pointers are restrict-qualified). Returns `dest`.
 *
 * Strategy:
 *   1. The first (count mod 16) bytes are copied with a fixed sequence of
 *      8/4/2/1-byte moves, leaving a remainder that is a multiple of 16.
 *   2. For 1 KiB <= count <= 128 KiB the remainder is copied with
 *      `rep movsq`.
 *   3. Otherwise one 16-byte copy is done if bit 4 of count is set, which
 *      leaves a multiple of 32 for the 32-byte (or 2x16-byte) loop.
 */
void *mymemcpy(void *__restrict dest, const void *__restrict src, size_t count)
{
	u1 *d = dest;
	const u1 *s = src;
	size_t n = count;
	size_t c = n & 0xf;	/* bytes handled by the switch below */

	switch(c) {
	case 15: { OP8(0); OP4(8); OP2(12); OP1(14); } break;
	case 14: { OP8(0); OP4(8); OP2(12); } break;
	case 13: { OP8(0); OP4(8); OP1(12); } break;
	case 12: { OP8(0); OP4(8); } break;
	case 11: { OP8(0); OP2(8); OP1(10);} break;
	case 10: { OP8(0); OP2(8); } break;
	case 9: { OP8(0); OP1(8); } break;
	case 8: { OP8(0); } break;
	case 7: { OP4(0); OP2(4); OP1(6);} break;
	case 6: { OP4(0); OP2(4); } break;
	case 5: { OP4(0); OP1(4); } break;
	case 4: { OP4(0); } break;
	case 3: { OP2(0); OP1(2); } break;
	case 2: { OP2(0); } break;
	case 1: { OP1(0); } break;
	case 0: break;
	/* c is masked with 0xf, so no other value can occur */
	default: UNREACHABLE();
	}

	if(n >= 1024 && n <= 128*1024) {
		d += c;
		s += c;
		/* n - c is a multiple of 16, so this is an exact quadword count */
		n = (n - c) >> 3;
		/*
		 * rep movsq advances RSI/RDI and counts RCX down to zero, so all
		 * three must be declared read-write; an input-only "c" operand
		 * would let the compiler assume RCX still holds n afterwards.
		 */
		__asm__ __volatile__("cld ; rep movsq" : "+S"(s), "+D"(d), "+c"(n) : : "memory");
		return dest;
	}

	if(n & 0x10) {
		COPY16(d+c, s+c);
		c += 16;
	}

	/* n - c is now a multiple of 32 */
#if __GNUC__+0 >= 4
	for( ; c < n ; c += 32) COPY32(d+c, s+c);
#else
	for( ; c < n ; ) {
		COPY16(d+c, s+c);
		c += 16;
		COPY16(d+c, s+c);
		c += 16;
	}
#endif
	return dest;
}