NEWS: Mention the MinGW include/valgrind.h fix
[valgrind.git] / cachegrind / cg_arch.c
blob57570dd6387c38a40d87b08b3e6d29b41c10cb27
1 /*--------------------------------------------------------------------*/
2 /*--- Cachegrind: cache configuration. cg_arch.c ---*/
3 /*--------------------------------------------------------------------*/
5 /*
6 This file is part of Cachegrind, a Valgrind tool for cache
7 profiling programs.
9 Copyright (C) 2011-2017 Nicholas Nethercote
10 njn@valgrind.org
12 This program is free software; you can redistribute it and/or
13 modify it under the terms of the GNU General Public License as
14 published by the Free Software Foundation; either version 2 of the
15 License, or (at your option) any later version.
17 This program is distributed in the hope that it will be useful, but
18 WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program; if not, see <http://www.gnu.org/licenses/>.
25 The GNU General Public License is contained in the file COPYING.
28 #include "pub_tool_basics.h"
29 #include "pub_tool_libcassert.h"
30 #include "pub_tool_libcbase.h"
31 #include "pub_tool_libcprint.h"
32 #include "pub_tool_options.h"
33 #include "pub_tool_machine.h"
35 #include "cg_arch.h"
37 static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
38 Bool all_caches_clo_defined);
40 // Checks cache config is ok. Returns NULL if ok, or a pointer to an error
41 // string otherwise.
42 static const HChar* check_cache(cache_t* cache)
44 // Simulator requires set count to be a power of two.
45 if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
46 (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
48 return "Cache set count is not a power of two.\n";
51 // Simulator requires line size to be a power of two.
52 if (-1 == VG_(log2)(cache->line_size)) {
53 return "Cache line size is not a power of two.\n";
56 // Then check line size >= 16 -- any smaller and a single instruction could
57 // straddle three cache lines, which breaks a simulation assertion and is
58 // stupid anyway.
59 if (cache->line_size < MIN_LINE_SIZE) {
60 return "Cache line size is too small.\n";
63 /* Then check cache size > line size (causes seg faults if not). */
64 if (cache->size <= cache->line_size) {
65 return "Cache size <= line size.\n";
68 /* Then check assoc <= (size / line size) (seg faults otherwise). */
69 if (cache->assoc > (cache->size / cache->line_size)) {
70 return "Cache associativity > (size / line size).\n";
73 return NULL;
77 static void parse_cache_opt ( cache_t* cache, const HChar* opt,
78 const HChar* optval )
80 Long i1, i2, i3;
81 HChar* endptr;
82 const HChar* checkRes;
84 // Option argument looks like "65536,2,64". Extract them.
85 i1 = VG_(strtoll10)(optval, &endptr); if (*endptr != ',') goto bad;
86 i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',') goto bad;
87 i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
89 // Check for overflow.
90 cache->size = (Int)i1;
91 cache->assoc = (Int)i2;
92 cache->line_size = (Int)i3;
93 if (cache->size != i1) goto overflow;
94 if (cache->assoc != i2) goto overflow;
95 if (cache->line_size != i3) goto overflow;
97 checkRes = check_cache(cache);
98 if (checkRes) {
99 VG_(fmsg)("%s", checkRes);
100 goto bad;
103 return;
105 bad:
106 VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval);
108 overflow:
109 VG_(fmsg_bad_option)(opt,
110 "One of the cache parameters was too large and overflowed.\n");
114 Bool VG_(str_clo_cache_opt)(const HChar *arg,
115 cache_t* clo_I1c,
116 cache_t* clo_D1c,
117 cache_t* clo_LLc)
119 const HChar* tmp_str;
121 if VG_STR_CLO(arg, "--I1", tmp_str) {
122 parse_cache_opt(clo_I1c, arg, tmp_str);
123 return True;
124 } else if VG_STR_CLO(arg, "--D1", tmp_str) {
125 parse_cache_opt(clo_D1c, arg, tmp_str);
126 return True;
127 } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
128 VG_STR_CLO(arg, "--LL", tmp_str)) {
129 parse_cache_opt(clo_LLc, arg, tmp_str);
130 return True;
131 } else
132 return False;
135 static void umsg_cache_img(const HChar* desc, cache_t* c)
137 VG_(umsg)(" %s: %'d B, %d-way, %d B lines\n", desc,
138 c->size, c->assoc, c->line_size);
141 // Verifies if c is a valid cache.
142 // An invalid value causes an assert, unless clo_redefined is True.
143 static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
145 const HChar* checkRes;
147 checkRes = check_cache(c);
148 if (checkRes) {
149 VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
150 desc, checkRes);
151 umsg_cache_img(desc, c);
152 if (!clo_redefined) {
153 VG_(umsg)("As it probably should be supported, please report a bug!\n");
154 VG_(umsg)("Bypass this message by using option --%s=...\n", desc);
155 tl_assert(0);
161 /* If the LL cache config isn't something the simulation functions
162 can handle, try to adjust it so it is. Caches are characterised
163 by (total size T, line size L, associativity A), and then we
164 have
166 number of sets S = T / (L * A)
168 The required constraints are:
170 * L must be a power of 2, but it always is in practice, so
171 no problem there
173 * A can be any value >= 1
175 * T can be any value, but ..
177 * S must be a power of 2.
179 That sometimes gives a problem. For example, some Core iX based
180 Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
181 sets. Some AMD cpus have T = 5MB, A = 48, L = 64, which gives
182 1706.667 sets (!).
184 The "fix" is to force S down to the nearest power of two below its
185 original value, and increase A proportionately, so as to keep the
186 total cache size the same. In fact to be safe we recalculate the
187 cache size afterwards anyway, to guarantee that it divides exactly
188 between the new number of sets.
190 The "fix" is "justified" (cough, cough) by alleging that
191 increases of associativity above about 4 have very little effect
192 on the actual miss rate. It would be far more inaccurate to
193 fudge this by changing the size of the simulated cache --
194 changing the associativity is a much better option.
197 /* (Helper function) Returns the largest power of 2 that is <= |x|.
198 Even works when |x| == 0. */
199 static UInt floor_power_of_2 ( UInt x )
201 x = x | (x >> 1);
202 x = x | (x >> 2);
203 x = x | (x >> 4);
204 x = x | (x >> 8);
205 x = x | (x >> 16);
206 return x - (x >> 1);
209 static void
210 maybe_tweak_LLc(cache_t *LLc)
212 if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0)
213 return;
215 tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0);
217 UInt old_size = (UInt)LLc->size;
218 UInt old_assoc = (UInt)LLc->assoc;
219 UInt old_line_size = (UInt)LLc->line_size;
221 UInt new_size = old_size;
222 UInt new_assoc = old_assoc;
223 UInt new_line_size = old_line_size;
225 UInt old_nSets = old_size / (old_assoc * old_line_size);
226 if (old_nSets == 0) {
227 /* This surely can't happen; but would cause chaos with the maths
228 * below if it did. Just give up if it does. */
229 return;
232 if (-1 != VG_(log2_64)(old_nSets)) {
233 /* The number of sets is already a power of 2. Make sure that
234 the size divides exactly between the sets. Almost all of the
235 time this will have no effect. */
236 new_size = old_line_size * old_assoc * old_nSets;
237 } else {
238 /* The number of sets isn't a power of two. Calculate some
239 scale-down factor which causes the number of sets to become a
240 power of two. Then, increase the associativity by that
241 factor. Finally, re-calculate the total size so as to make
242 sure it divides exactly between the sets. */
243 tl_assert(old_nSets >= 0);
244 UInt new_nSets = floor_power_of_2 ( old_nSets );
245 tl_assert(new_nSets > 0 && new_nSets < old_nSets);
246 Double factor = (Double)old_nSets / (Double)new_nSets;
247 tl_assert(factor >= 1.0);
249 new_assoc = (UInt)(0.5 + factor * (Double)old_assoc);
250 tl_assert(new_assoc >= old_assoc);
252 new_size = old_line_size * new_assoc * new_nSets;
255 tl_assert(new_line_size == old_line_size); /* we never change this */
256 if (new_size == old_size && new_assoc == old_assoc)
257 return;
259 VG_(dmsg)("warning: "
260 "specified LL cache: line_size %u assoc %u total_size %'u\n",
261 old_line_size, old_assoc, old_size);
262 VG_(dmsg)("warning: "
263 "simulated LL cache: line_size %u assoc %u total_size %'u\n",\
264 new_line_size, new_assoc, new_size);
266 LLc->size = new_size;
267 LLc->assoc = new_assoc;
268 LLc->line_size = new_line_size;
271 void VG_(post_clo_init_configure_caches)(cache_t* I1c,
272 cache_t* D1c,
273 cache_t* LLc,
274 cache_t* clo_I1c,
275 cache_t* clo_D1c,
276 cache_t* clo_LLc)
278 #define DEFINED(L) (-1 != L->size || -1 != L->assoc || -1 != L->line_size)
280 // Count how many were defined on the command line.
281 Bool all_caches_clo_defined =
282 (DEFINED(clo_I1c) &&
283 DEFINED(clo_D1c) &&
284 DEFINED(clo_LLc));
286 // Set the cache config (using auto-detection, if supported by the
287 // architecture).
288 configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
290 maybe_tweak_LLc( LLc );
292 // Check the default/auto-detected values.
293 // Allow the user to override invalid auto-detected caches
294 // with command line.
295 check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
296 check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
297 check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));
299 // Then replace with any defined on the command line. (Already checked in
300 // VG(parse_clo_cache_opt)().)
301 if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; }
302 if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; }
303 if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; }
305 if (VG_(clo_verbosity) >= 2) {
306 VG_(umsg)("Cache configuration used:\n");
307 umsg_cache_img ("I1", I1c);
308 umsg_cache_img ("D1", D1c);
309 umsg_cache_img ("LL", LLc);
311 #undef DEFINED
314 void VG_(print_cache_clo_opts)()
316 VG_(printf)(
317 " --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
318 " --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
319 " --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
324 // Traverse the cache info and return a cache of the given kind and level.
325 // Return NULL if no such cache exists.
326 static const VexCache *
327 locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
329 const VexCache *c;
331 for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
332 if (c->level == level && c->kind == kind) {
333 return c;
336 return NULL; // not found
340 // Gives the auto-detected configuration of I1, D1 and LL caches. They get
341 // overridden by any cache configurations specified on the command line.
342 static void
343 configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
344 Bool all_caches_clo_defined)
346 VexArchInfo vai;
347 const VexCacheInfo *ci;
348 const VexCache *i1, *d1, *ll;
350 VG_(machine_get_VexArchInfo)(NULL, &vai);
351 ci = &vai.hwcache_info;
353 // Extract what we need
354 i1 = locate_cache(ci, INSN_CACHE, 1);
355 d1 = locate_cache(ci, DATA_CACHE, 1);
356 ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);
358 if (ci->num_caches > 0 && ll == NULL) {
359 VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
362 if (ll && ci->num_levels > 2) {
363 VG_(dmsg)("warning: L%u cache found, using its data for the "
364 "LL simulation.\n", ci->num_levels);
367 if (i1 && d1 && ll) {
368 if (i1->is_trace_cache) {
369 /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
370 * conversion to byte size is a total guess; treat the 12K and 16K
371 * cases the same since the cache byte size must be a power of two for
372 * everything to work!. Also guessing 32 bytes for the line size...
374 UInt adjusted_size, guessed_line_size = 32;
376 if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) {
377 adjusted_size = 16 * 1024;
378 } else {
379 adjusted_size = 32 * 1024;
381 VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n",
382 i1->sizeB / 1024);
383 VG_(dmsg)(" Simulating a %u KB I-cache with %u B lines\n",
384 adjusted_size / 1024, guessed_line_size);
386 *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size };
387 } else {
388 *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
390 *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
391 *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };
393 return;
396 // Cache information could not be queried; choose some default
397 // architecture specific default setting.
399 #if defined(VGA_ppc32)
401 // Default cache configuration
402 *I1c = (cache_t) { 65536, 2, 64 };
403 *D1c = (cache_t) { 65536, 2, 64 };
404 *LLc = (cache_t) { 262144, 8, 64 };
406 #elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
408 // Default cache configuration
409 *I1c = (cache_t) { 65536, 2, 64 };
410 *D1c = (cache_t) { 65536, 2, 64 };
411 *LLc = (cache_t) { 262144, 8, 64 };
413 #elif defined(VGA_arm)
415 // Set caches to default (for Cortex-A8 ?)
416 *I1c = (cache_t) { 16384, 4, 64 };
417 *D1c = (cache_t) { 16384, 4, 64 };
418 *LLc = (cache_t) { 262144, 8, 64 };
420 #elif defined(VGA_arm64)
422 // Copy the 32-bit ARM version until such time as we have
423 // some real hardware to run on
424 *I1c = (cache_t) { 16384, 4, 64 };
425 *D1c = (cache_t) { 16384, 4, 64 };
426 *LLc = (cache_t) { 262144, 8, 64 };
428 #elif defined(VGA_s390x)
430 // Here is the cache data from older machine models:
432 // I1 D1 I/D L2
433 // z900 256k/256/4 256k/256/4 16MB
434 // z800 256k/256/4 256k/256/4 8MB
435 // z990 256k/256/4 256k/256/4 32MB
436 // z890 256k/256/4 256k/256/4 32MB
437 // z9 256k/256/4 256k/256/4 40MB
439 // Sources:
440 // (1) IBM System z9 109 Technical Introduction
441 // www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf
442 // (2) The microarchitecture of the IBM eServer z900 processor
443 // IBM Journal of Research and Development
444 // Volume 46, Number 4/5, pp 381-395, July/September 2002
445 // (3) The IBM eServer z990 microprocessor
446 // IBM Journal of Research and Development
447 // Volume 48, Number 3/4, pp 295-309, May/July 2004
448 // (4) Charles Webb, IBM
450 // L2 data is unfortunately incomplete. Otherwise, we could support
451 // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps).
453 // Default cache configuration is z10-EC (Source: ECAG insn)
454 *I1c = (cache_t) { 65536, 4, 256 };
455 *D1c = (cache_t) { 131072, 8, 256 };
456 *LLc = (cache_t) { 50331648, 24, 256 };
458 #elif defined(VGA_mips32) || defined(VGA_nanomips)
460 // Set caches to default (for MIPS32-r2(mips 74kc))
461 *I1c = (cache_t) { 32768, 4, 32 };
462 *D1c = (cache_t) { 32768, 4, 32 };
463 *LLc = (cache_t) { 524288, 8, 32 };
465 #elif defined(VGA_mips64)
467 // Set caches to default (for MIPS64 - 5kc)
468 *I1c = (cache_t) { 32768, 4, 32 };
469 *D1c = (cache_t) { 32768, 4, 32 };
470 *LLc = (cache_t) { 524288, 8, 32 };
472 #elif defined(VGA_x86) || defined(VGA_amd64)
474 *I1c = (cache_t) { 65536, 2, 64 };
475 *D1c = (cache_t) { 65536, 2, 64 };
476 *LLc = (cache_t) { 262144, 8, 64 };
478 #else
480 #error "Unknown arch"
482 #endif
484 if (!all_caches_clo_defined) {
485 const HChar warning[] =
486 "Warning: Cannot auto-detect cache config, using defaults.\n"
487 " Run with -v to see.\n";
488 VG_(dmsg)("%s", warning);
492 /*--------------------------------------------------------------------*/
493 /*--- end ---*/
494 /*--------------------------------------------------------------------*/