tests/vg_regtest: Always evaluate prerequisite expressions with sh
[valgrind.git] / cachegrind / cg_arch.c
blob376d79e33b97f54fcab0463450d2567aa698ede0
/*--------------------------------------------------------------------*/
/*--- Cachegrind: cache configuration.                   cg_arch.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Cachegrind, a Valgrind tool for cache
   profiling programs.

   Copyright (C) 2011-2013 Nicholas Nethercote
      njn@valgrind.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/
30 #include "pub_tool_basics.h"
31 #include "pub_tool_libcassert.h"
32 #include "pub_tool_libcbase.h"
33 #include "pub_tool_libcprint.h"
34 #include "pub_tool_options.h"
35 #include "pub_tool_machine.h"
37 #include "cg_arch.h"
39 static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
40 Bool all_caches_clo_defined);
42 // Checks cache config is ok. Returns NULL if ok, or a pointer to an error
43 // string otherwise.
44 static const HChar* check_cache(cache_t* cache)
46 // Simulator requires set count to be a power of two.
47 if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
48 (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
50 return "Cache set count is not a power of two.\n";
53 // Simulator requires line size to be a power of two.
54 if (-1 == VG_(log2)(cache->line_size)) {
55 return "Cache line size is not a power of two.\n";
58 // Then check line size >= 16 -- any smaller and a single instruction could
59 // straddle three cache lines, which breaks a simulation assertion and is
60 // stupid anyway.
61 if (cache->line_size < MIN_LINE_SIZE) {
62 return "Cache line size is too small.\n";
65 /* Then check cache size > line size (causes seg faults if not). */
66 if (cache->size <= cache->line_size) {
67 return "Cache size <= line size.\n";
70 /* Then check assoc <= (size / line size) (seg faults otherwise). */
71 if (cache->assoc > (cache->size / cache->line_size)) {
72 return "Cache associativity > (size / line size).\n";
75 return NULL;
79 static void parse_cache_opt ( cache_t* cache, const HChar* opt,
80 const HChar* optval )
82 Long i1, i2, i3;
83 HChar* endptr;
84 const HChar* checkRes;
86 // Option argument looks like "65536,2,64". Extract them.
87 i1 = VG_(strtoll10)(optval, &endptr); if (*endptr != ',') goto bad;
88 i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',') goto bad;
89 i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
91 // Check for overflow.
92 cache->size = (Int)i1;
93 cache->assoc = (Int)i2;
94 cache->line_size = (Int)i3;
95 if (cache->size != i1) goto overflow;
96 if (cache->assoc != i2) goto overflow;
97 if (cache->line_size != i3) goto overflow;
99 checkRes = check_cache(cache);
100 if (checkRes) {
101 VG_(fmsg)("%s", checkRes);
102 goto bad;
105 return;
107 bad:
108 VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval);
110 overflow:
111 VG_(fmsg_bad_option)(opt,
112 "One of the cache parameters was too large and overflowed.\n");
116 Bool VG_(str_clo_cache_opt)(const HChar *arg,
117 cache_t* clo_I1c,
118 cache_t* clo_D1c,
119 cache_t* clo_LLc)
121 const HChar* tmp_str;
123 if VG_STR_CLO(arg, "--I1", tmp_str) {
124 parse_cache_opt(clo_I1c, arg, tmp_str);
125 return True;
126 } else if VG_STR_CLO(arg, "--D1", tmp_str) {
127 parse_cache_opt(clo_D1c, arg, tmp_str);
128 return True;
129 } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
130 VG_STR_CLO(arg, "--LL", tmp_str)) {
131 parse_cache_opt(clo_LLc, arg, tmp_str);
132 return True;
133 } else
134 return False;
137 static void umsg_cache_img(const HChar* desc, cache_t* c)
139 VG_(umsg)(" %s: %'d B, %d-way, %d B lines\n", desc,
140 c->size, c->assoc, c->line_size);
143 // Verifies if c is a valid cache.
144 // An invalid value causes an assert, unless clo_redefined is True.
145 static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
147 const HChar* checkRes;
149 checkRes = check_cache(c);
150 if (checkRes) {
151 VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
152 desc, checkRes);
153 umsg_cache_img(desc, c);
154 if (!clo_redefined) {
155 VG_(umsg)("As it probably should be supported, please report a bug!\n");
156 VG_(umsg)("Bypass this message by using option --%s=...\n", desc);
157 tl_assert(0);
/* If the LL cache config isn't something the simulation functions
   can handle, try to adjust it so it is.  Caches are characterised
   by (total size T, line size L, associativity A), and then we
   have

     number of sets S = T / (L * A)

   The required constraints are:

   * L must be a power of 2, but it always is in practice, so
     no problem there

   * A can be any value >= 1

   * T can be any value, but ..

   * S must be a power of 2.

   That sometimes gives a problem.  For example, some Core iX based
   Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
   sets.  Some AMD CPUs have T = 5MB, A = 48, L = 64, which gives
   1706.667 sets (!).

   The "fix" is to force S down to the nearest power of two below its
   original value, and increase A proportionately, so as to keep the
   total cache size the same.  In fact to be safe we recalculate the
   cache size afterwards anyway, to guarantee that it divides exactly
   between the new number of sets.

   The "fix" is "justified" (cough, cough) by alleging that
   increases of associativity above about 4 have very little effect
   on the actual miss rate.  It would be far more inaccurate to
   fudge this by changing the size of the simulated cache --
   changing the associativity is a much better option.
*/
199 /* (Helper function) Returns the largest power of 2 that is <= |x|.
200 Even works when |x| == 0. */
201 static UInt floor_power_of_2 ( UInt x )
203 x = x | (x >> 1);
204 x = x | (x >> 2);
205 x = x | (x >> 4);
206 x = x | (x >> 8);
207 x = x | (x >> 16);
208 return x - (x >> 1);
211 static void
212 maybe_tweak_LLc(cache_t *LLc)
214 if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0)
215 return;
217 tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0);
219 UInt old_size = (UInt)LLc->size;
220 UInt old_assoc = (UInt)LLc->assoc;
221 UInt old_line_size = (UInt)LLc->line_size;
223 UInt new_size = old_size;
224 UInt new_assoc = old_assoc;
225 UInt new_line_size = old_line_size;
227 UInt old_nSets = old_size / (old_assoc * old_line_size);
228 if (old_nSets == 0) {
229 /* This surely can't happen; but would cause chaos with the maths
230 * below if it did. Just give up if it does. */
231 return;
234 if (-1 != VG_(log2_64)(old_nSets)) {
235 /* The number of sets is already a power of 2. Make sure that
236 the size divides exactly between the sets. Almost all of the
237 time this will have no effect. */
238 new_size = old_line_size * old_assoc * old_nSets;
239 } else {
240 /* The number of sets isn't a power of two. Calculate some
241 scale-down factor which causes the number of sets to become a
242 power of two. Then, increase the associativity by that
243 factor. Finally, re-calculate the total size so as to make
244 sure it divides exactly between the sets. */
245 tl_assert(old_nSets >= 0);
246 UInt new_nSets = floor_power_of_2 ( old_nSets );
247 tl_assert(new_nSets > 0 && new_nSets < old_nSets);
248 Double factor = (Double)old_nSets / (Double)new_nSets;
249 tl_assert(factor >= 1.0);
251 new_assoc = (UInt)(0.5 + factor * (Double)old_assoc);
252 tl_assert(new_assoc >= old_assoc);
254 new_size = old_line_size * new_assoc * new_nSets;
257 tl_assert(new_line_size == old_line_size); /* we never change this */
258 if (new_size == old_size && new_assoc == old_assoc)
259 return;
261 VG_(dmsg)("warning: "
262 "specified LL cache: line_size %u assoc %u total_size %'u\n",
263 old_line_size, old_assoc, old_size);
264 VG_(dmsg)("warning: "
265 "simulated LL cache: line_size %u assoc %u total_size %'u\n",\
266 new_line_size, new_assoc, new_size);
268 LLc->size = new_size;
269 LLc->assoc = new_assoc;
270 LLc->line_size = new_line_size;
273 void VG_(post_clo_init_configure_caches)(cache_t* I1c,
274 cache_t* D1c,
275 cache_t* LLc,
276 cache_t* clo_I1c,
277 cache_t* clo_D1c,
278 cache_t* clo_LLc)
280 #define DEFINED(L) (-1 != L->size || -1 != L->assoc || -1 != L->line_size)
282 // Count how many were defined on the command line.
283 Bool all_caches_clo_defined =
284 (DEFINED(clo_I1c) &&
285 DEFINED(clo_D1c) &&
286 DEFINED(clo_LLc));
288 // Set the cache config (using auto-detection, if supported by the
289 // architecture).
290 configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
292 maybe_tweak_LLc( LLc );
294 // Check the default/auto-detected values.
295 // Allow the user to override invalid auto-detected caches
296 // with command line.
297 check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
298 check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
299 check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));
301 // Then replace with any defined on the command line. (Already checked in
302 // VG(parse_clo_cache_opt)().)
303 if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; }
304 if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; }
305 if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; }
307 if (VG_(clo_verbosity) >= 2) {
308 VG_(umsg)("Cache configuration used:\n");
309 umsg_cache_img ("I1", I1c);
310 umsg_cache_img ("D1", D1c);
311 umsg_cache_img ("LL", LLc);
313 #undef DEFINED
316 void VG_(print_cache_clo_opts)()
318 VG_(printf)(
319 " --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
320 " --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
321 " --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
326 // Traverse the cache info and return a cache of the given kind and level.
327 // Return NULL if no such cache exists.
328 static const VexCache *
329 locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
331 const VexCache *c;
333 for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
334 if (c->level == level && c->kind == kind) {
335 return c;
338 return NULL; // not found
342 // Gives the auto-detected configuration of I1, D1 and LL caches. They get
343 // overridden by any cache configurations specified on the command line.
344 static void
345 configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
346 Bool all_caches_clo_defined)
348 VexArchInfo vai;
349 const VexCacheInfo *ci;
350 const VexCache *i1, *d1, *ll;
352 VG_(machine_get_VexArchInfo)(NULL, &vai);
353 ci = &vai.hwcache_info;
355 // Extract what we need
356 i1 = locate_cache(ci, INSN_CACHE, 1);
357 d1 = locate_cache(ci, DATA_CACHE, 1);
358 ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);
360 if (ci->num_caches > 0 && ll == NULL) {
361 VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
364 if (ll && ci->num_levels > 2) {
365 VG_(dmsg)("warning: L%u cache found, using its data for the "
366 "LL simulation.\n", ci->num_levels);
369 if (i1 && d1 && ll) {
370 if (i1->is_trace_cache) {
371 /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
372 * conversion to byte size is a total guess; treat the 12K and 16K
373 * cases the same since the cache byte size must be a power of two for
374 * everything to work!. Also guessing 32 bytes for the line size...
376 UInt adjusted_size, guessed_line_size = 32;
378 if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) {
379 adjusted_size = 16 * 1024;
380 } else {
381 adjusted_size = 32 * 1024;
383 VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n",
384 i1->sizeB / 1024);
385 VG_(dmsg)(" Simulating a %d KB I-cache with %d B lines\n",
386 adjusted_size / 1024, guessed_line_size);
388 *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size };
389 } else {
390 *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
392 *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
393 *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };
395 return;
398 // Cache information could not be queried; choose some default
399 // architecture specific default setting.
401 #if defined(VGA_ppc32)
403 // Default cache configuration
404 *I1c = (cache_t) { 65536, 2, 64 };
405 *D1c = (cache_t) { 65536, 2, 64 };
406 *LLc = (cache_t) { 262144, 8, 64 };
408 #elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
410 // Default cache configuration
411 *I1c = (cache_t) { 65536, 2, 64 };
412 *D1c = (cache_t) { 65536, 2, 64 };
413 *LLc = (cache_t) { 262144, 8, 64 };
415 #elif defined(VGA_arm)
417 // Set caches to default (for Cortex-A8 ?)
418 *I1c = (cache_t) { 16384, 4, 64 };
419 *D1c = (cache_t) { 16384, 4, 64 };
420 *LLc = (cache_t) { 262144, 8, 64 };
422 #elif defined(VGA_arm64)
424 // Copy the 32-bit ARM version until such time as we have
425 // some real hardware to run on
426 *I1c = (cache_t) { 16384, 4, 64 };
427 *D1c = (cache_t) { 16384, 4, 64 };
428 *LLc = (cache_t) { 262144, 8, 64 };
430 #elif defined(VGA_s390x)
432 // Here is the cache data from older machine models:
434 // I1 D1 I/D L2
435 // z900 256k/256/4 256k/256/4 16MB
436 // z800 256k/256/4 256k/256/4 8MB
437 // z990 256k/256/4 256k/256/4 32MB
438 // z890 256k/256/4 256k/256/4 32MB
439 // z9 256k/256/4 256k/256/4 40MB
441 // Sources:
442 // (1) IBM System z9 109 Technical Introduction
443 // www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf
444 // (2) The microarchitecture of the IBM eServer z900 processor
445 // IBM Journal of Research and Development
446 // Volume 46, Number 4/5, pp 381-395, July/September 2002
447 // (3) The IBM eServer z990 microprocessor
448 // IBM Journal of Research and Development
449 // Volume 48, Number 3/4, pp 295-309, May/July 2004
450 // (4) Charles Webb, IBM
452 // L2 data is unfortunately incomplete. Otherwise, we could support
453 // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps).
455 // Default cache configuration is z10-EC (Source: ECAG insn)
456 *I1c = (cache_t) { 65536, 4, 256 };
457 *D1c = (cache_t) { 131072, 8, 256 };
458 *LLc = (cache_t) { 50331648, 24, 256 };
460 #elif defined(VGA_mips32)
462 // Set caches to default (for MIPS32-r2(mips 74kc))
463 *I1c = (cache_t) { 32768, 4, 32 };
464 *D1c = (cache_t) { 32768, 4, 32 };
465 *LLc = (cache_t) { 524288, 8, 32 };
467 #elif defined(VGA_mips64)
469 // Set caches to default (for MIPS64 - 5kc)
470 *I1c = (cache_t) { 32768, 4, 32 };
471 *D1c = (cache_t) { 32768, 4, 32 };
472 *LLc = (cache_t) { 524288, 8, 32 };
474 #elif defined(VGA_x86) || defined(VGA_amd64)
476 *I1c = (cache_t) { 65536, 2, 64 };
477 *D1c = (cache_t) { 65536, 2, 64 };
478 *LLc = (cache_t) { 262144, 8, 64 };
480 #elif defined(VGA_tilegx)
482 // Set caches to default for Tilegx.
483 *I1c = (cache_t) { 0x8000, 2, 64 };
484 *D1c = (cache_t) { 0x8000, 2, 64 };
485 *LLc = (cache_t) { 0x40000, 8, 64 };
487 #else
489 #error "Unknown arch"
491 #endif
493 if (!all_caches_clo_defined) {
494 const HChar warning[] =
495 "Warning: Cannot auto-detect cache config, using defaults.\n"
496 " Run with -v to see.\n";
497 VG_(dmsg)("%s", warning);
501 /*--------------------------------------------------------------------*/
502 /*--- end ---*/
503 /*--------------------------------------------------------------------*/