// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
// ualgobase.cc
//
// Copy and fill optimizations are here.
//

#ifndef NDEBUG	// Optimized code here. Asserts slow it down and are checked elsewhere.
#define NDEBUG
#endif

#include "ualgo.h"

namespace ustl {

// Generic version for implementing fill_nX_fast on non-i386 architectures.
template <typename T> static inline void stosv (T*& p, size_t n, T v)
    { while (n--) *p++ = v; }
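
// Illustrative sketch only (not part of the library): stosv stores v n times
// and leaves p advanced past the filled range, so callers can continue from it.
//	uint32_t a[4];
//	uint32_t* p = a;
//	stosv (p, 4, 0xDEADBEEFu);	// a[0..3] == 0xDEADBEEF, p == a + 4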

#if defined(__i386__) || defined(__x86_64__)

//----------------------------------------------------------------------
// Copy functions
//----------------------------------------------------------------------

#if __GNUC__ >= 3
static inline void movsb_dir_up (void) __attribute__((always_inline));
static inline void movsb_dir_down (void) __attribute__((always_inline));
static inline void movsb (const void*& src, size_t nBytes, void*& dest) __attribute__((always_inline));
static inline void movsd (const void*& src, size_t nWords, void*& dest) __attribute__((always_inline));
#endif

static inline void movsb_dir_up (void) { asm volatile ("cld"); }
static inline void movsb_dir_down (void) { asm volatile ("std"); }
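
// Note: cld/std set the x86 direction flag, which makes rep movs/stos walk
// forward or backward through memory. copy_backward_fast below relies on std
// and restores the forward direction with movsb_dir_up() before returning.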

static inline void movsb (const void*& src, size_t nBytes, void*& dest)
{
    asm volatile ("rep;\n\tmovsb"
	: "=&S"(src), "=&D"(dest), "=&c"(nBytes)
	: "0"(src), "1"(dest), "2"(nBytes)
	: "memory");
}

static inline void movsd (const void*& src, size_t nWords, void*& dest)
{
    asm volatile ("rep;\n\tmovsl"
	: "=&S"(src), "=&D"(dest), "=&c"(nWords)
	: "0"(src), "1"(dest), "2"(nWords)
	: "memory");
}
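
// Because src, dest, and the count are bound to ESI/EDI/ECX and written back
// through the output constraints, movsb/movsd leave the pointers moved by the
// amount copied (forward or backward, depending on the direction flag).
// Illustrative sketch with hypothetical names, not part of this file:
//	const void* s = buf;
//	void* d = out;
//	movsb (s, nHeadBytes, d);	// s and d now point just past the head
//	movsd (s, nWords, d);		// continue with 4-byte words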

template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v)
{ asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v)
{ asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v)
{ asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }

#if CPU_HAS_MMX
#define MMX_ALIGN	16U	// Data must be aligned on this grain
#define MMX_BS		32U	// Assembly routines copy data this many bytes at a time.

static inline void simd_block_copy (const void* src, void* dest) __attribute__((always_inline));
static inline void simd_block_store (uint8_t* dest) __attribute__((always_inline));
static inline void simd_block_cleanup (void) __attribute__((always_inline));

static inline void simd_block_copy (const void* src, void* dest)
{
    const char* csrc ((const char*) src);
    char* cdest ((char*) dest);
#if CPU_HAS_SSE
    asm (
	"movaps\t%2, %%xmm0	\n\t"
	"movaps\t%3, %%xmm1	\n\t"
	"movntps\t%%xmm0, %0	\n\t"
	"movntps\t%%xmm1, %1"
	: "=m"(cdest[0]), "=m"(cdest[16])
	: "m"(csrc[0]), "m"(csrc[16])
	: "xmm0", "xmm1", "memory");
#else
    asm (
	"movq	%4, %%mm0	\n\t"
	"movq	%5, %%mm1	\n\t"
	"movq	%6, %%mm2	\n\t"
	"movq	%7, %%mm3	\n\t"
	"movq	%%mm0, %0	\n\t"
	"movq	%%mm1, %1	\n\t"
	"movq	%%mm2, %2	\n\t"
	"movq	%%mm3, %3"
	: "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24])
	: "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24])
	: "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)", "memory");
#endif
}

static inline void simd_block_store (uint8_t* dest)
{
#if CPU_HAS_SSE
    asm volatile (
	"movntq %%mm0, %0\n\t"
	"movntq %%mm0, %1\n\t"
	"movntq %%mm0, %2\n\t"
	"movntq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
	:: "memory");
#else
    asm volatile (
	"movq %%mm0, %0\n\t"
	"movq %%mm0, %1\n\t"
	"movq %%mm0, %2\n\t"
	"movq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24])
	:: "memory");
#endif
}

static inline void simd_block_cleanup (void)
{
#if !CPU_HAS_SSE
    simd::reset_mmx();
#endif
    asm volatile ("sfence");
}
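
// simd_block_cleanup runs once after a streak of block stores: sfence orders
// the weakly-ordered non-temporal (movnt*) writes with respect to later stores,
// and on the no-SSE path simd::reset_mmx() (presumably an emms wrapper) clears
// the MMX register state so subsequent x87 floating-point code works.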

/// The fastest optimized raw memory copy.
void copy_n_fast (const void* src, size_t nBytes, void* dest) throw()
{
    movsb_dir_up();
    size_t nHeadBytes = Align(uintptr_t(src), MMX_ALIGN) - uintptr_t(src);
    nHeadBytes = min (nHeadBytes, nBytes);
    movsb (src, nHeadBytes, dest);
    nBytes -= nHeadBytes;
    if (!(uintptr_t(dest) % MMX_ALIGN)) {
	const size_t nMiddleBlocks = nBytes / MMX_BS;
	for (uoff_t i = 0; i < nMiddleBlocks; ++ i) {
	    prefetch (advance (src, 512), 0, 0);
	    simd_block_copy (src, dest);
	    src = advance (src, MMX_BS);
	    dest = advance (dest, MMX_BS);
	}
	simd_block_cleanup();
	nBytes %= MMX_BS;
    }
    movsb (src, nBytes, dest);
}
#endif // CPU_HAS_MMX
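
// copy_n_fast copies an unaligned head with rep movsb, then, when the
// destination is MMX_ALIGN-aligned, streams MMX_BS-byte blocks through the
// SIMD copy and finishes the tail with rep movsb. Illustrative use
// (hypothetical buffers, not part of this file):
//	char src[4096], dst[4096];
//	copy_n_fast (src, sizeof(src), dst);	// byte-for-byte copy of 4096 bytes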

/// The fastest optimized backwards raw memory copy.
void copy_backward_fast (const void* first, const void* last, void* result) throw()
{
    prefetch (first, 0, 0);
    prefetch (result, 1, 0);
    size_t nBytes (distance (first, last));
    movsb_dir_down();
    size_t nHeadBytes = uintptr_t(last) % 4;
    last = advance (last, -1);
    result = advance (result, -1);
    movsb (last, nHeadBytes, result);
    nBytes -= nHeadBytes;
    if (uintptr_t(result) % 4 == 3) {
	const size_t nMiddleBlocks = nBytes / 4;
	last = advance (last, -3);
	result = advance (result, -3);
	movsd (last, nMiddleBlocks, result);
	nBytes %= 4;
    }
    movsb (last, nBytes, result);
    movsb_dir_up();
}
#endif // __i386__
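
// copy_backward_fast copies [first, last) downward from the end, which is what
// an overlapping move to a higher address needs; result is the end of the
// destination range. Illustrative sketch (hypothetical buffer, not in the file):
//	char buf[100];
//	// shift the first 90 bytes up by 10; the ranges overlap
//	copy_backward_fast (buf, buf + 90, buf + 100);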

//----------------------------------------------------------------------
// Fill functions
//----------------------------------------------------------------------

#if CPU_HAS_MMX
template <typename T> static inline void build_block (T) {}
template <> inline void build_block (uint8_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint16_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint32_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
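
// build_block broadcasts the fill value across %mm0, which simd_block_store
// then writes out 32 bytes at a time. For example, build_block (uint8_t(0xAB))
// leaves %mm0 == 0xABABABABABABABAB, and build_block (uint16_t(0x1234)) leaves
// %mm0 == 0x1234123412341234.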

static inline void simd_block_fill_loop (uint8_t*& dest, size_t count)
{
    prefetch (advance (dest, 512), 1, 0);
    for (uoff_t i = 0; i < count; ++ i, dest += MMX_BS)
	simd_block_store (dest);
    simd_block_cleanup();
    simd::reset_mmx();
}

template <typename T>
static inline void fill_n_fast (T* dest, size_t count, T v)
{
    size_t nHead = (Align(uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest)) / sizeof(T);
    nHead = min (nHead, count);
    stosv (dest, nHead, v);
    count -= nHead;
    build_block (v);
    uint8_t* bdest = (uint8_t*) dest;
    simd_block_fill_loop (bdest, count * sizeof(T) / MMX_BS);
    count %= MMX_BS / sizeof(T);
    dest = (T*) bdest;
    stosv (dest, count, v);
}

void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw()
    { fill_n_fast (dest, count, v); }
#else
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw() { memset (dest, v, count); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw() { stosv (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw() { stosv (dest, count, v); }
#endif // CPU_HAS_MMX
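
// The fill_nX_fast entry points fill count elements of the given width with v,
// e.g. (illustrative, hypothetical buffer, not part of this file):
//	uint16_t pixels[640];
//	fill_n16_fast (pixels, 640, 0x7FFF);	// set every element to 0x7FFF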

/// Exchanges ranges [first, middle) and [middle, last)
void rotate_fast (void* first, void* middle, void* last) throw()
{
#ifdef HAVE_ALLOCA_H
    const size_t half1 (distance (first, middle)), half2 (distance (middle, last));
    const size_t hmin (min (half1, half2));
    if (!hmin)
	return;
    void* buf = alloca (hmin);
    if (buf) {
	if (half2 < half1) {
	    copy_n_fast (middle, half2, buf);
	    copy_backward_fast (first, middle, last);
	    copy_n_fast (buf, half2, first);
	} else {
	    copy_n_fast (first, half1, buf);
	    copy_n_fast (middle, half2, first);
	    copy_n_fast (buf, half1, advance (first, half2));
	}
    } else
#else
    if (first == middle || middle == last)
	return;
#endif
    {
	char* f = (char*) first;
	char* m = (char*) middle;
	char* l = (char*) last;
	reverse (f, m);
	reverse (m, l);
	while (f != m && m != l)
	    iter_swap (f++, --l);
	reverse (f, (f == m ? l : m));
    }
}
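
// rotate_fast is a raw-memory rotate: after the call the bytes of [middle, last)
// come first, followed by the bytes of [first, middle). Illustrative sketch
// (hypothetical buffer, not part of this file):
//	char s[6] = { 'a','b','c','d','e','f' };
//	rotate_fast (s, s + 2, s + 6);	// s now holds 'c','d','e','f','a','b'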

#if __GNUC__ < 4
size_t popcount (uint32_t v)
{
    const uint32_t w = v - ((v >> 1) & 0x55555555);	// Algorithm from AMD optimization guide
    const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return ((((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24);
}
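
// The three steps above sum bits in 2-bit, then 4-bit fields; the 0x01010101
// multiply adds the four byte counts into the top byte, which >> 24 extracts.
// For example, popcount (0xF00F0001) == 9 and popcount (0xFFFFFFFF) == 32.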

#if HAVE_INT64_T
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint64_t v)
{
    v -= (v >> 1) & UINT64_C(0x5555555555555555);	// Algorithm from Wikipedia
    v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333));
    v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F);
    return ((v * UINT64_C(0x0101010101010101)) >> 56);
}
#endif // HAVE_INT64_T
#endif // __GNUC__ < 4
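
// The 64-bit overload is the same parallel bit count widened to eight bytes;
// the final multiply folds all eight byte sums into the top byte, so the >> 56
// extracts the total, e.g. popcount (UINT64_C(0xF0F0F0F0F0F0F0F0)) == 32.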

//----------------------------------------------------------------------
// Miscellaneous instantiated stuff from headers which don't have enough
// to warrant creation of a separate .cc file.
//----------------------------------------------------------------------

// Used in uspecial to print printable characters
const char _FmtPrtChr[2][8] = { "'%c'", "%d" };

} // namespace ustl