add amd memcpy
[rofl0r-memcpy-test.git] / memcpy_test.c
blob2204b47dc6253e4965e9076bc018ee7405ba0767
1 #include <stdio.h>
2 #include <string.h>
3 #include <unistd.h>
4 #include <stdlib.h>
5 #include <assert.h>
6 #include <sys/time.h>
/*
 * Return the number of microseconds that have elapsed between the
 * time stored in *t and the current wall-clock time.
 *
 * t: a timestamp previously obtained with gettimeofday().
 *
 * The difference is computed by hand instead of with timersub(), which
 * is a BSD extension that strict -std=c99/-std=c11 builds do not expose.
 */
unsigned long long microsecpassed(struct timeval* t) {
	struct timeval now;
	long long sec_delta, usec_delta;

	gettimeofday(&now, NULL);

	/*
	 * Widen to 64 bits before scaling: where time_t is 32 bits wide,
	 * multiplying seconds by 1000000 overflows after roughly 2147 s.
	 */
	sec_delta = (long long)now.tv_sec - (long long)t->tv_sec;
	usec_delta = (long long)now.tv_usec - (long long)t->tv_usec;

	return (unsigned long long)(sec_delta * 1000000LL + usec_delta);
}
15 #if defined(__i386__)
/*
 * Read the x86 time-stamp counter.
 *
 * CPUID (leaf 0) is executed immediately before RDTSC inside the same
 * asm statement -- presumably so that earlier instructions have retired
 * before the counter is sampled.
 *
 * Returns the 64-bit counter assembled from EDX:EAX.
 *
 * Declared "static inline" with a (void) prototype, like every other
 * rdtsc variant in this file: a plain "inline" definition emits no
 * out-of-line symbol under C99 rules, so a call that is not inlined
 * would fail to link.
 */
static inline unsigned long long rdtsc(void) {
	unsigned int lo, hi;
	__asm__ volatile (
		"cpuid \n"
		"rdtsc"
		: "=a"(lo), "=d"(hi) /* outputs */
		: "a"(0) /* inputs */
		: "%ebx", "%ecx"); /* clobbers*/
	return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}
27 #elif 0
/*
 * Alternative 32-bit x86 time-stamp reader; compiled out by the
 * enclosing "#elif 0".
 *
 * It runs CPUID, then RDTSC, then CPUID again, as three separate asm
 * statements.  %ebx is pushed in the first statement and popped in the
 * third.
 * NOTE(review): the stack pointer is therefore changed across asm
 * statements without the compiler being told -- likely the reason this
 * variant is disabled; confirm before re-enabling.
 */
static inline unsigned long long rdtsc(void) {
	unsigned long long tsc_hi, tsc_lo;

	/* CPUID leaf 0 ahead of the read; saves %ebx on the stack. */
	__asm__ __volatile__(
		"xorl %%eax, %%eax;\n\t"
		"push %%ebx;"
		"cpuid\n\t"
		::
		:"%eax", "%ebx", "%ecx", "%edx");

	/* The actual counter read: low half in EAX, high half in EDX. */
	__asm__ __volatile__(
		"rdtsc;"
		: "=a" (tsc_lo), "=d" (tsc_hi)
		::);

	/* CPUID leaf 0 after the read; restores %ebx. */
	__asm__ __volatile__(
		"xorl %%eax, %%eax; cpuid;"
		"pop %%ebx;"
		::
		:"%eax", "%ebx", "%ecx", "%edx");

	return ((unsigned long long)tsc_hi << 32) | tsc_lo;
}
49 #elif 0
/*
 * Alternative time-stamp reader; compiled out by the enclosing
 * "#elif 0".
 *
 * Emits the two opcode bytes 0x0f 0x31 directly and takes the result
 * through the "A" constraint.
 * NOTE(review): per GCC's x86 constraint documentation "A" names the
 * edx:eax pair only for double-word values on 32-bit targets -- do not
 * enable this variant for x86-64 without checking.
 */
static inline unsigned long long rdtsc(void)
{
	unsigned long long int tsc;

	__asm__ volatile (".byte 0x0f, 0x31" : "=A" (tsc));

	return tsc;
}
56 #elif defined(__x86_64__)
57 /*static inline unsigned long long rdtsc(void) {
58 unsigned long long hi, lo;
59 __asm__ __volatile__(
60 "xorl %%eax, %%eax;\n\t"
61 "push %%rbx;"
62 "cpuid\n\t"
64 :"%rax", "%rbx", "%rcx", "%rdx");
65 __asm__ __volatile__(
66 "rdtsc;"
67 : "=a" (lo), "=d" (hi)
68 ::);
69 __asm__ __volatile__(
70 "xorl %%eax, %%eax; cpuid;"
71 "pop %%rbx;"
73 :"%rax", "%rbx", "%rcx", "%rdx");
75 return (unsigned long long)hi << 32 | lo;
77 #elif 0
/*
 * Read the time-stamp counter with a bare RDTSC instruction.
 *
 * RDTSC delivers the low 32 bits in EAX and the high 32 bits in EDX;
 * the two halves are combined into one 64-bit value.  No ordering
 * instruction is issued here -- callers use serialize() for that.
 */
static inline unsigned long long rdtsc(void)
{
	unsigned int low_half, high_half;
	unsigned long long ticks;

	__asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

	ticks = high_half;
	ticks <<= 32;
	ticks |= low_half;
	return ticks;
}
/*
 * Execute CPUID with EAX = 0; the results are discarded.  The benchmark
 * calls this before and after the timed loop.
 *
 * CPUID overwrites EAX as well as EBX, ECX and EDX.  EAX is therefore
 * passed as a read-write operand ("+a"): declaring it input-only would
 * let the compiler assume the register still holds 0 afterwards.
 */
static inline void serialize(void) {
	unsigned int eax = 0;

	__asm__ __volatile__ ("cpuid" : "+a"(eax) : : "ebx", "ecx", "edx" );
}
91 #elif defined(__powerpc__)
/*
 * PowerPC counterpart of rdtsc(): read the time base.
 *
 * The upper half is read (mftbu), then the lower half (mftb), then the
 * upper half again; if the two upper reads differ the sequence is
 * retried, so the returned halves belong together.
 *
 * Returns (upper << 32) | lower.
 */
static __inline__ unsigned long long rdtsc(void)
{
	unsigned long int tb_upper, tb_lower, tb_upper_again;

	__asm__ volatile(
		"0: \n"
		"\tmftbu %0 \n"
		"\tmftb %1 \n"
		"\tmftbu %2 \n"
		"\tcmpw %2,%0 \n"
		"\tbne 0b \n"
		: "=r"(tb_upper),"=r"(tb_lower),"=r"(tb_upper_again)
		);

	return ((unsigned long long)tb_upper << 32) | tb_lower;
}
114 #endif
117 extern void *mymemcpy(void *dest, const void *src, size_t n);
118 extern void fillmem(void* mem, size_t size);
119 extern int dummy_access(void* mem, size_t size);
121 #include <sched.h>
/*
 * Pin the calling process to CPU 0 so that every time-stamp reading is
 * taken on the same core.
 *
 * Aborts the program (after printing the reason) if the affinity cannot
 * be set.  The system call is made outside of assert(): with NDEBUG
 * defined an assert() argument is not evaluated at all, which would
 * silently skip the pinning.
 *
 * The earlier sched_getaffinity() call was dropped: its result was
 * overwritten immediately by CPU_ZERO_S.
 */
static void lock_affinity(void) {
	cpu_set_t cs;

	CPU_ZERO_S(sizeof(cs), &cs);
	CPU_SET_S(0, sizeof(cs), &cs);

	if (sched_setaffinity(0, sizeof(cs), &cs) != 0) {
		perror("sched_setaffinity");
		abort();
	}
}
/*
 * Time one mymemcpy() call of `size` bytes from src to dst.
 *
 * fillmem() is called before and dummy_access() after the copy so the
 * compiler can neither guess the buffer contents nor assume the
 * destination is never read.  Only the mymemcpy() call sits between the
 * two counter reads.
 *
 * Returns the difference of the two rdtsc() readings.  Aborts if
 * mymemcpy() does not return dst (assert) or if dummy_access()
 * returns 1.
 */
static inline unsigned long test(size_t size, char* src, char* dst) {
	unsigned long start_ticks, end_ticks;
	void* copied;

	/* dummy call so that gcc can not guess the mem contents. */
	fillmem(src, size);

	start_ticks = rdtsc();
	copied = mymemcpy(dst, src, size);
	end_ticks = rdtsc();

	assert(copied == dst);

	/* dummy call so that gcc can not assume mem content is never accessed. */
	if (dummy_access(dst, size) == 1)
		abort();

	return end_ticks - start_ticks;
}
141 int main(int argc, char** argv) {
142 #define K(X) (1024UL * X)
143 #define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
144 #define MIN(A,B) ((A) > (B) ? (B) : (A))
146 const size_t testsizes[] = {
147 0, 1, 2, 3 ,4 ,5, 6, 7,
148 8, 9, 10, 11, 12, 13, 14,
149 15, 16,
150 23, 24, 25,
151 31, 32, 33,
152 63, 64, 65,
153 79, 80, 81,
154 95, 96, 97,
155 127, 128, 129,
156 159, 160, 161,
157 191, 192, 193,
158 224,
159 255, 256, 257,
160 288, 320, 348,
161 383, 384, 385,
162 416, 448, 476,
163 511, 512, 513,
164 548, 640, 732,
165 767, 768, 769,
166 1023, 1024, 1025, 1152, 1280, 1408,
167 1535, 1536, 1537,
168 2048, 4096, 8192,
169 16384, 32768, 65536,
170 K(128), K(160), K(192), K(208), K(216), K(220), K(224), K(240),
171 K(256), K(384), K(512),
172 K(1024), K(1280), K(1536), K(1792), K(2048),
173 K(2560), K(3072), K(3584), K(4096),
174 K(4352), K(4608), K(4864),
175 K(5120), K(5376), K(5632), K(5888),
176 K(6144), K(6400), K(6656), K(6912), K(7168),
177 K(8192), K(16384), K(32768), K(65536),
180 lock_affinity();
182 unsigned long x, y, ymax;
183 unsigned long long smallest;
184 double res;
186 // warm up
187 for(x = 1 << 28; x > 0; --x);
189 FILE *f = fopen("/dev/urandom", "r");
190 for (x = 0 ; x < ARRAY_SIZE(testsizes); x++) {
191 char *src, *dst;
193 //smallest = 0xffffffff;
194 y = testsizes[x] ? testsizes[x] : 1;
195 ymax = (K(65536)*(100ULL - (MIN(99ULL, ARRAY_SIZE(testsizes) - x -1ULL))))/y;
196 if(testsizes[x] >= 64) ymax*=2;
197 if(testsizes[x] >= 1024) ymax*=2;
199 src = malloc(testsizes[x] + 64);
200 dst = malloc(testsizes[x] + 64);
201 /* check that the function works correctly -
202 the +1 stuff is to get unaligned start offset,
203 the 0xaa/bb stuff to check whether it writes off bounds. */
204 dst[0] = src[0] = 0xee;
205 memset(src+1+testsizes[x], 0xbb, 32);
206 memset(dst+1+testsizes[x], 0xaa, 32);
207 fread(src+1, 1, testsizes[x], f);
208 if(testsizes[x] >= 4) {
209 /* make sure we copy into the right direction */
210 src[1] = 'a'; src[2] = 'b'; src[3] = 'c'; src[4] = 'd';
211 dst[1] = '0'; dst[2] = '1'; dst[3] = '2'; dst[4] = '3';
213 mymemcpy(dst+1, src+1, testsizes[x]);
214 memset(src+1+testsizes[x], 0xaa, 32);
215 if(memcmp(src+1, dst+1, testsizes[x]+32) || (testsizes[x] >= 4 && memcmp(dst+1, "abcd", 4))) {
216 fprintf(stderr, "warning: %s didn't pass self-test with size %zu!\n", FILENAME, testsizes[x]);
219 unsigned long long curr, total, best = -1ULL;
221 serialize();
222 for(y = 0; y < ymax; y++) {
223 //__builtin_ia32_clflush(src);
224 //__builtin_ia32_clflush(dst);
225 curr = test(testsizes[x], src, dst);
226 if(curr < best) best = curr;
227 total += curr;
229 serialize();
231 fprintf(stdout, "%-8zu\t%llu\n", testsizes[x], best);
232 fflush(stdout);
233 free(src);
234 free(dst);
236 fclose(f);
237 return 0;