// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
// ualgobase.cc
//
// Copy and fill optimizations are here.

#ifndef NDEBUG	// Optimized code here; asserts slow it down and are checked elsewhere.
#define NDEBUG
#endif

#include "ualgo.h"
#ifdef HAVE_ALLOCA_H
    #include <alloca.h>
#endif

// Generic version for implementing fill_nX_fast on non-i386 architectures.
template <typename T> static inline void stosv (T*& p, size_t n, T v)
    { while (n--) *p++ = v; }
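
// Illustrative use (the buffer here is hypothetical, not part of the library):
//	uint8_t buf [16];
//	uint8_t* p = buf;
//	stosv (p, 16, uint8_t(0));	// fills buf with zeros; p ends at buf+16
// p is taken by reference and is left one past the filled range, matching
// how rep stos advances EDI in the x86 specializations below.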

#if defined(__i386__) || defined(__x86_64__)

//----------------------------------------------------------------------
// Copy functions
//----------------------------------------------------------------------

static inline void movsb_dir_up (void) __attribute__((always_inline));
static inline void movsb_dir_down (void) __attribute__((always_inline));
static inline void movsb (const void*& src, size_t nBytes, void*& dest) __attribute__((always_inline));
static inline void movsd (const void*& src, size_t nWords, void*& dest) __attribute__((always_inline));

static inline void movsb_dir_up (void) { asm volatile ("cld"); }
static inline void movsb_dir_down (void) { asm volatile ("std"); }
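
// cld makes rep movs/stos advance ESI/EDI upward through memory; std makes
// them walk downward, which copy_backward_fast relies on to copy
// overlapping ranges right to left.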

static inline void movsb (const void*& src, size_t nBytes, void*& dest)
{
    asm volatile ("rep;\n\tmovsb"
	: "=&S"(src), "=&D"(dest), "=&c"(nBytes)
	: "0"(src), "1"(dest), "2"(nBytes)
	: "memory");
}
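
// Constraint notes: "S", "D", and "c" pin src, dest, and the count to ESI,
// EDI, and ECX, the fixed operands of rep movs. Writing them as early-clobber
// outputs ("=&") with matching "0"/"1"/"2" inputs tells the compiler the
// registers are consumed and updated by the instruction.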

static inline void movsd (const void*& src, size_t nWords, void*& dest)
{
    asm volatile ("rep;\n\tmovsl"
	: "=&S"(src), "=&D"(dest), "=&c"(nWords)
	: "0"(src), "1"(dest), "2"(nWords)
	: "memory");
}

template <> inline void stosv (uint8_t*& p, size_t n, uint8_t v)
    { asm volatile ("rep;\n\tstosb" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint16_t*& p, size_t n, uint16_t v)
    { asm volatile ("rep;\n\tstosw" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
template <> inline void stosv (uint32_t*& p, size_t n, uint32_t v)
    { asm volatile ("rep;\n\tstosl" : "=&D"(p), "=c"(n) : "0"(p), "1"(n), "a"(v) : "memory"); }
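
// The fill value rides in EAX (the "a" constraint) because rep stos stores
// AL/AX/EAX; the count and pointer land in ECX and EDI as in movsb above.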

#if CPU_HAS_MMX
#define MMX_ALIGN	16U	// Data must be aligned on this grain
#define MMX_BS		32U	// Assembly routines copy data this many bytes at a time.

static inline void simd_block_copy (const void* src, void* dest) __attribute__((always_inline));
static inline void simd_block_store (uint8_t* dest) __attribute__((always_inline));
static inline void simd_block_cleanup (void) __attribute__((always_inline));

static inline void simd_block_copy (const void* src, void* dest)
{
    const char* csrc ((const char*) src);
    char* cdest ((char*) dest);
#if CPU_HAS_SSE
    asm (
	"movaps\t%2, %%xmm0	\n\t"
	"movaps\t%3, %%xmm1	\n\t"
	"movntps\t%%xmm0, %0	\n\t"
	"movntps\t%%xmm1, %1"
	: "=m"(cdest[0]), "=m"(cdest[16])
	: "m"(csrc[0]), "m"(csrc[16])
	: "xmm0", "xmm1", "memory");
#else
    // MMX-only path: plain movq loads and stores (movnt requires SSE)
    asm (
	"movq\t%4, %%mm0	\n\t"
	"movq\t%5, %%mm1	\n\t"
	"movq\t%6, %%mm2	\n\t"
	"movq\t%7, %%mm3	\n\t"
	"movq\t%%mm0, %0	\n\t"
	"movq\t%%mm1, %1	\n\t"
	"movq\t%%mm2, %2	\n\t"
	"movq\t%%mm3, %3"
	: "=m"(cdest[0]), "=m"(cdest[8]), "=m"(cdest[16]), "=m"(cdest[24])
	: "m"(csrc[0]), "m"(csrc[8]), "m"(csrc[16]), "m"(csrc[24])
	: "mm0", "mm1", "mm2", "mm3", "st", "st(1)", "st(2)", "st(3)", "memory");
#endif
}
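
// movaps requires the 16-byte alignment promised by MMX_ALIGN; movntps is a
// non-temporal store that bypasses the cache, so bulk copies do not evict
// the working set. Non-temporal stores are also why an sfence is needed
// before the destination is read (see simd_block_cleanup below).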

static inline void simd_block_store (uint8_t* dest)
{
#if CPU_HAS_SSE
    asm volatile (
	"movntq %%mm0, %0\n\t"
	"movntq %%mm0, %1\n\t"
	"movntq %%mm0, %2\n\t"
	"movntq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]));
#else
    asm volatile (
	"movq %%mm0, %0\n\t"
	"movq %%mm0, %1\n\t"
	"movq %%mm0, %2\n\t"
	"movq %%mm0, %3"
	: "=m"(dest[0]), "=m"(dest[8]), "=m"(dest[16]), "=m"(dest[24]));
#endif
}
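
// simd_block_store assumes %mm0 already holds the 8-byte fill pattern;
// build_block in the fill section below is what loads it.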

static inline void simd_block_cleanup (void)
{
#if !CPU_HAS_SSE
    asm volatile ("emms");	// Clear MMX state so the FPU is usable again
#endif
    asm volatile ("sfence");	// Drain the non-temporal store buffers
}

/// The fastest optimized raw memory copy.
void copy_n_fast (const void* src, size_t nBytes, void* dest) throw()
{
    movsb_dir_up();
    size_t nHeadBytes = Align (uintptr_t(src), MMX_ALIGN) - uintptr_t(src);
    nHeadBytes = min (nHeadBytes, nBytes);
    movsb (src, nHeadBytes, dest);
    nBytes -= nHeadBytes;
    if (!(uintptr_t(dest) % MMX_ALIGN)) {
	const size_t nMiddleBlocks = nBytes / MMX_BS;
	for (uoff_t i = 0; i < nMiddleBlocks; ++ i) {
	    prefetch (advance (src, 512), 0, 0);
	    simd_block_copy (src, dest);
	    src = advance (src, MMX_BS);
	    dest = advance (dest, MMX_BS);
	}
	simd_block_cleanup();
	nBytes %= MMX_BS;
    }
    movsb (src, nBytes, dest);
}
#endif // CPU_HAS_MMX
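
// Illustrative call (the buffers are hypothetical):
//	char a [256], b [256];
//	copy_n_fast (a, sizeof(a), b);
// Head bytes up to the first MMX_ALIGN boundary go through movsb, the
// aligned middle through 32-byte simd_block_copy blocks, and the tail
// through movsb again.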

/// The fastest optimized backwards raw memory copy.
void copy_backward_fast (const void* first, const void* last, void* result) throw()
{
    prefetch (first, 0, 0);
    prefetch (result, 1, 0);
    size_t nBytes (distance (first, last));
    movsb_dir_down();
    size_t nHeadBytes = uintptr_t(last) % 4;
    last = advance (last, -1);
    result = advance (result, -1);
    movsb (last, nHeadBytes, result);
    nBytes -= nHeadBytes;
    if (uintptr_t(result) % 4 == 3) {
	const size_t nMiddleBlocks = nBytes / 4;
	last = advance (last, -3);
	result = advance (result, -3);
	movsd (last, nMiddleBlocks, result);
	last = advance (last, 3);	// movsd leaves the pointers a full dword
	result = advance (result, 3);	// down; re-point at the highest tail byte
	nBytes %= 4;
    }
    movsb (last, nBytes, result);
    movsb_dir_up();
}
#endif // __i386__ || __x86_64__
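
// Because the copy runs downward, it is safe when source and destination
// overlap with the destination above the source, e.g. (hypothetical):
//	char v [8];
//	copy_backward_fast (v, v + 7, v + 8);	// v[1..7] = old v[0..6]
// Note that result names the end of the destination range, not its start.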

#if CPU_HAS_MMX
//----------------------------------------------------------------------
// Fill functions
//----------------------------------------------------------------------

template <typename T> static inline void build_block (T) {}
template <> inline void build_block (uint8_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpcklbw %%mm0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint16_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpshufw $0, %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
template <> inline void build_block (uint32_t v)
{
    asm volatile (
	"movd %0, %%mm0\n\tpunpckldq %%mm0, %%mm0"
	: : "g"(uint32_t(v)) : "mm0");
}
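
// For example, build_block (uint16_t(0xABCD)) leaves 0xABCDABCDABCDABCD in
// %mm0: movd loads the zero-extended value and pshufw $0 broadcasts the low
// word into all four lanes; the uint8_t version first doubles the byte into
// a word with punpcklbw.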

static inline void simd_block_fill_loop (uint8_t*& dest, size_t count)
{
    prefetch (advance (dest, 512), 1, 0);
    for (uoff_t i = 0; i < count; ++ i, dest += MMX_BS)
	simd_block_store (dest);
    simd_block_cleanup();
}

template <typename T>
static inline void fill_n_fast (T* dest, size_t count, T v)
{
    size_t nHead = (Align (uintptr_t(dest), MMX_ALIGN) - uintptr_t(dest)) / sizeof(T);
    nHead = min (nHead, count);
    stosv (dest, nHead, v);
    count -= nHead;
    build_block (v);	// Load the 8-byte fill pattern into %mm0
    uint8_t* bdest = (uint8_t*) dest;
    simd_block_fill_loop (bdest, count * sizeof(T) / MMX_BS);
    count %= MMX_BS / sizeof(T);
    dest = (T*) bdest;
    stosv (dest, count, v);
}
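
// The fill mirrors copy_n_fast: stosv up to the first MMX_ALIGN boundary,
// 32-byte non-temporal blocks through the middle, and stosv again for the
// leftover count % (MMX_BS / sizeof(T)) elements.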

void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw()
    { fill_n_fast (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw()
    { fill_n_fast (dest, count, v); }
#else
void fill_n8_fast (uint8_t* dest, size_t count, uint8_t v) throw() { memset (dest, v, count); }
void fill_n16_fast (uint16_t* dest, size_t count, uint16_t v) throw() { stosv (dest, count, v); }
void fill_n32_fast (uint32_t* dest, size_t count, uint32_t v) throw() { stosv (dest, count, v); }
#endif // CPU_HAS_MMX
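
// Illustrative call (the buffer is hypothetical); the same signature works
// whether the MMX path or the fallback above was compiled in:
//	uint16_t pattern [64];
//	fill_n16_fast (pattern, 64, 0xFFFF);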

/// Exchanges ranges [first, middle) and [middle, last)
void rotate_fast (void* first, void* middle, void* last) throw()
{
#ifdef HAVE_ALLOCA_H
    const size_t half1 (distance (first, middle)), half2 (distance (middle, last));
    const size_t hmin (min (half1, half2));
    if (!hmin)
	return;
    void* buf = alloca (hmin);
    if (buf) {
	if (half2 < half1) {
	    copy_n_fast (middle, half2, buf);
	    copy_backward_fast (first, middle, last);
	    copy_n_fast (buf, half2, first);
	} else {
	    copy_n_fast (first, half1, buf);
	    copy_n_fast (middle, half2, first);
	    copy_n_fast (buf, half1, advance (first, half2));
	}
    } else
#else
    if (first == middle || middle == last)
	return;
#endif
    {
	char* f = (char*) first;
	char* m = (char*) middle;
	char* l = (char*) last;
	reverse (f, m);
	reverse (m, l);
	while (f != m && m != l)
	    iter_swap (f++, --l);
	reverse (f, (f == m ? l : m));
    }
}
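
// Worked example (hypothetical buffer): rotating "abcdef" around 'c' with
//	char s [] = { 'a','b','c','d','e','f' };
//	rotate_fast (s, s + 2, s + 6);
// leaves "cdefab": the two subranges exchange places, with the shorter half
// staged through the alloca buffer when one is available.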

/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint32_t v)
{
    const uint32_t w = v - ((v >> 1) & 0x55555555);	// Algorithm from AMD optimization guide
    const uint32_t x = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    return ((((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24);
}
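
// Worked example for v = 0xFF: w = 0xFF - 0x55 = 0xAA, so each bit pair
// holds its own count (binary 10 10 10 10); x = 0x22 + 0x22 = 0x44, each
// nibble holding 4; then (0x44 + 0x04) & 0x0F0F0F0F = 0x08, and multiplying
// by 0x01010101 sums the bytes into the top byte, giving 8 after the shift.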

#if HAVE_INT64_T
/// \brief Returns the number of 1s in \p v in binary.
size_t popcount (uint64_t v)
{
    v -= (v >> 1) & UINT64_C(0x5555555555555555);	// Algorithm from Wikipedia
    v = (v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333));
    v = (v + (v >> 4)) & UINT64_C(0x0F0F0F0F0F0F0F0F);
    return ((v * UINT64_C(0x0101010101010101)) >> 56);
}
#endif // HAVE_INT64_T

//----------------------------------------------------------------------
// Miscellaneous instantiated stuff from headers which don't have enough
// to warrant creation of a separate file.cc
//----------------------------------------------------------------------

// Used in uspecial to print printable characters
const char _FmtPrtChr [2][8] = { "'%c'", "%d" };
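
// Illustrative use (the character c is hypothetical):
//	printf (_FmtPrtChr [!isprint (c)], c);
// prints a printable character quoted ("'%c'") and anything else as a
// number ("%d").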