add amd memcpy
[rofl0r-memcpy-test.git] / rofl_avx.c
blob1fe16ca1dfb4526db86f5c24d1eaa427b753c6c1
/*
 * optimized memcpy for x86_64 with AVX.
 * beats musl 0.9.11, 1.1.24 C&ASM version,
 * beats cosmopolitan for all sizes
 * tested with GCC 6.5.0 -O2 -mavx.
 *
 * (C) 2021 rofl0r - licensed under the standard MIT license,
 * as shipped with musl.
 */
11 #include <string.h>
12 #include <stdint.h>
/*
 * Access types used by the copy macros below.
 *
 * The digit in each name is the access width in BYTES, not bits:
 * u16 is a 16-byte vector, u8 an 8-byte integer, and so on.
 *
 * __may_alias__ allows these types to access storage of any effective type.
 * aligned(1) states that a pointer to the type may have any alignment, so
 * the compiler must emit an access that is valid for unaligned addresses.
 * The copy routines dereference these types at arbitrary offsets into the
 * caller's buffers, so every multi-byte type needs it; dereferencing a
 * naturally-aligned type at a misaligned address is undefined behaviour.
 */
typedef uint8_t u16 __attribute__((vector_size(16), aligned(1), __may_alias__));
typedef uint64_t u8 __attribute__((aligned(1), __may_alias__));
typedef uint32_t u4 __attribute__((aligned(1), __may_alias__));
typedef uint16_t u2 __attribute__((aligned(1), __may_alias__));
typedef uint8_t u1 __attribute__((__may_alias__));

/*
 * COPY16(D, S): copy 16 bytes from S to D, either pointer may be unaligned.
 * UNREACHABLE(): marks a switch arm that can never be taken.
 *
 * The first branch is also selected when __GNUC__ is not defined at all,
 * because an undefined identifier evaluates to 0 inside #if.
 */
#if __GNUC__ + 0 <= 3
/* gcc 3.4.6 generates movaps instead of movups, ignoring the aligned(1) */
#define COPY16(D, S) __builtin_ia32_storeups((void*)(D), __builtin_ia32_loadups((void*)(S)))
#define UNREACHABLE() break
#else
#define COPY16(D, S) do { *(u16*)(D) = *(u16*)(S); } while(0)
#define UNREACHABLE() __builtin_unreachable()
#endif

/*
 * COPY32(D, S): copy 32 bytes from S to D with one unaligned load/store pair.
 * Only defined when the compiler targets AVX; users test `#ifdef COPY32`.
 * NOTE(review): these are GCC builtin names; other compilers that define
 * __AVX__ may not provide them - confirm before building with non-GCC.
 */
#ifdef __AVX__
#define COPY32(D, S) do { __builtin_ia32_storedqu256((void*)(D), __builtin_ia32_loaddqu256((void*)(S))); } while(0)
#endif

/*
 * OPn(N): copy n bytes from s+N to d+N with a single n-byte load and store.
 * These macros deliberately capture the identifiers `d` (destination) and
 * `s` (source) from the scope in which they are expanded.
 */
#define OP8(N) do { *(u8*)(d+(N)) = *(u8*)(s+(N)); } while(0)
#define OP4(N) do { *(u4*)(d+(N)) = *(u4*)(s+(N)); } while(0)
#define OP2(N) do { *(u2*)(d+(N)) = *(u2*)(s+(N)); } while(0)
#define OP1(N) do { *(u1*)(d+(N)) = *(u1*)(s+(N)); } while(0)
38 void *mymemcpy(void *__restrict dest, const void *__restrict src, size_t count)
40 register u1 * d /*__asm__("rdi")*/ = dest;
41 register const u1 * s /*__asm__("rsi")*/ = src;
42 register size_t n /*__asm__("rdx")*/ = count;
43 register size_t c /*__asm__("rcx")*/ = n & 0xf;
45 switch(c) {
46 case 15: { OP8(0); OP4(8); OP2(12); OP1(14); } break;
47 case 14: { OP8(0); OP4(8); OP2(12); } break;
48 case 13: { OP8(0); OP4(8); OP1(12); } break;
49 case 12: { OP8(0); OP4(8); } break;
50 case 11: { OP8(0); OP2(8); OP1(10);} break;
51 case 10: { OP8(0); OP2(8); } break;
52 case 9: { OP8(0); OP1(8); } break;
53 case 8: { OP8(0); } break;
54 case 7: { OP4(0); OP2(4); OP1(6);} break;
55 case 6: { OP4(0); OP2(4); } break;
56 case 5: { OP4(0); OP1(4); } break;
57 case 4: { OP4(0); } break;
58 case 3: { OP2(0); OP1(2); } break;
59 case 2: { OP2(0); } break;
60 case 1: { OP1(0); } break;
61 case 0: break;
62 default: UNREACHABLE();
65 if(n & 0x10) {
66 COPY16(d+c, s+c);
67 c += 16;
70 #ifdef COPY32
71 for( ; c < n ; c += 32) COPY32(d+c, s+c);
72 #else
73 for( ; c < n ; ) {
74 COPY16(d+c, s+c);
75 c += 16;
76 COPY16(d+c, s+c);
77 c += 16;
79 #endif
80 return dest;