2 # SPDX-License-Identifier: GPL-2.0-only
3 # Copyright (C) 2024 ARM Ltd.
5 # Utility providing smaps-like output detailing transparent hugepage usage.
# Derive the page geometry once at import time. The PMD-sized huge page size
# is read from sysfs; the base page size comes from the resource module.
with open('/sys/kernel/mm/transparent_hugepage/hpage_pmd_size') as f:
    PAGE_SIZE = resource.getpagesize()
    # log2 of the page size, used to shift between bytes and page counts.
    PAGE_SHIFT = int(math.log2(PAGE_SIZE))
    PMD_SIZE = int(f.read())
    # Order (log2 of the number of base pages) of a PMD-sized huge page.
    PMD_ORDER = int(math.log2(PMD_SIZE / PAGE_SIZE))
def align_forward(v, a):
    # Round v up to the next multiple of a (v is returned unchanged when it is
    # already a multiple). The mask arithmetic is only correct when a is a
    # power of 2.
    return (v + (a - 1)) & ~(a - 1)
37 def align_offset(v, a):
42 # Convert KB to number of pages.
43 return (kb << 10) >> PAGE_SHIFT
47 # Convert number of pages to KB.
48 return (nr << PAGE_SHIFT) >> 10
52 # Convert page order to KB.
53 return (PAGE_SIZE << order) >> 10
56 def cont_ranges_all(search, index):
57 # Given a list of arrays, find the ranges for which values are monotonically
58 # incrementing in all arrays. all arrays in search and index must be the
62 d = np.diff(search[0]) == 1
63 for dd in [np.diff(arr) == 1 for arr in search[1:]]:
67 return [np.repeat(arr, r).reshape(-1, 2) for arr in index]
class ArgException(Exception):
    # Raised when a command line argument value is not acceptable (e.g. an
    # invalid --cont size).
    pass
class FileIOException(Exception):
    # Raised when a read from a file returns fewer bytes than were requested.
    pass
# Base class used to read /proc/<pid>/pagemap and /proc/kpageflags into a
# numpy array. Use inherited class in a with clause to ensure file is
# closed when it goes out of scope.
def __init__(self, filename, element_size):
    # Open filename read-only and remember element_size, the size in bytes
    # of one element, which is used to convert element indexes into byte
    # offsets within the file.
    self.element_size = element_size
    self.filename = filename
    self.fd = os.open(self.filename, os.O_RDONLY)
93 def __exit__(self, exc_type, exc_val, exc_tb):
def _readin(self, offset, buffer):
    # Fill buffer with bytes read from the file starting at byte offset.
    # Raises FileIOException if fewer than len(buffer) bytes were read, so
    # callers never see a partially filled buffer.
    nread = os.preadv(self.fd, (buffer,), offset)
    if len(buffer) != nread:
        raise FileIOException('error: {} failed to read {} bytes at {:x}'
                        .format(self.filename, len(buffer), offset))
def _toarray(self, buf):
    # Reinterpret the raw bytes in buf as a numpy array of 64-bit unsigned
    # integers. Only 8-byte elements are supported, hence the assert.
    assert self.element_size == 8
    return np.frombuffer(buf, dtype=np.uint64)
107 vec *= self.element_size
109 lengths = (np.diff(vec) + self.element_size).reshape(len(vec))
110 buf = bytearray(int(np.sum(lengths)))
111 view = memoryview(buf)
113 for offset, length in zip(offsets, lengths):
116 self._readin(offset, view[pos:pos+length])
118 return self._toarray(buf)
def get(self, index, nr=1):
    # Read nr consecutive elements, starting at element number index, and
    # return them as an array. Both index and nr are counted in elements;
    # they are scaled by element_size to get the byte offset and length.
    offset = index * self.element_size
    length = nr * self.element_size
    buf = bytearray(length)
    self._readin(offset, buf)
    return self._toarray(buf)
# Fields of a 64-bit /proc/<pid>/pagemap entry: bit 63 is set when the page is
# present in RAM, and bits 0-54 hold the page frame number.
PM_PAGE_PRESENT = 1 << 63
PM_PFN_MASK = (1 << 55) - 1
class PageMap(BinArrayFile):
    # Read ranges of a given pid's pagemap into a numpy array.
    def __init__(self, pid='self'):
        # pid defaults to 'self', i.e. the calling process. Entries are
        # 8 bytes each.
        super().__init__(f'/proc/{pid}/pagemap', 8)
# Bits of a /proc/kpageflags entry that mark the head page and the tail pages
# of a compound page.
KPF_COMPOUND_HEAD = 1 << 15
KPF_COMPOUND_TAIL = 1 << 16
142 class KPageFlags(BinArrayFile):
143 # Read ranges of /proc/kpageflags into a numpy array.
145 super().__init__(f'/proc/kpageflags', 8)
148 vma_all_stats = set([
171 vma_min_stats = set([
179 VMA = collections.namedtuple('VMA', [
195 # A container for VMAs, parsed from /proc/<pid>/smaps. Iterate over the
196 # instance to receive VMAs.
197 def __init__(self, pid='self', stats=[]):
199 with open(f'/proc/{pid}/smaps', 'r') as file:
201 elements = line.split()
202 if '-' in elements[0]:
203 start, end = map(lambda x: int(x, 16), elements[0].split('-'))
204 major, minor = map(lambda x: int(x, 16), elements[3].split(':'))
205 self.vmas.append(VMA(
206 name=elements[5] if len(elements) == 6 else '',
209 read=elements[1][0] == 'r',
210 write=elements[1][1] == 'w',
211 execute=elements[1][2] == 'x',
212 private=elements[1][3] == 'p',
213 pgoff=int(elements[2], 16),
216 inode=int(elements[4], 16),
220 param = elements[0][:-1]
222 value = int(elements[1])
223 self.vmas[-1].stats[param] = {'type': None, 'value': value}
229 def thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads):
230 # Given 4 same-sized arrays representing a range within a page table backed
231 # by THPs (vfns: virtual frame numbers, pfns: physical frame numbers, anons:
232 # True if page is anonymous, heads: True if page is head of a THP), return a
233 # dictionary of statistics describing the mapped THPs.
237 'aligned': [0] * (PMD_ORDER + 1),
238 'unaligned': [0] * (PMD_ORDER + 1),
242 'aligned': [0] * (PMD_ORDER + 1),
243 'unaligned': [0] * (PMD_ORDER + 1),
247 for rindex, rpfn in zip(ranges[0], ranges[2]):
248 index_next = int(rindex[0])
249 index_end = int(rindex[1]) + 1
250 pfn_end = int(rpfn[1]) + 1
252 folios = indexes[index_next:index_end][heads[index_next:index_end]]
254 # Account pages for any partially mapped THP at the front. In that case,
255 # the first page of the range is a tail.
256 nr = (int(folios[0]) if len(folios) else index_end) - index_next
257 stats['anon' if anons[index_next] else 'file']['partial'] += nr
259 # Account pages for any partially mapped THP at the back. In that case,
260 # the next page after the range is a tail.
262 flags = int(kpageflags.get(pfn_end)[0])
263 if flags & KPF_COMPOUND_TAIL:
264 nr = index_end - int(folios[-1])
267 stats['anon' if anons[index_end - 1] else 'file']['partial'] += nr
269 # Account fully mapped THPs in the middle of the range.
271 folio_nrs = np.append(np.diff(folios), np.uint64(index_end - folios[-1]))
272 folio_orders = np.log2(folio_nrs).astype(np.uint64)
273 for index, order in zip(folios, folio_orders):
277 vfn = int(vfns[index])
278 align = 'aligned' if align_forward(vfn, nr) == vfn else 'unaligned'
279 anon = 'anon' if anons[index] else 'file'
280 stats[anon][align][order] += nr
# Account PMD-mapped THPs separately, so filter out of the stats. There is a
# race between acquiring the smaps stats and reading pagemap, where memory
# could be deallocated. So clamp to zero in case it would have gone negative.
285 anon_pmd_mapped = vma.stats['AnonHugePages']['value']
286 file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
287 vma.stats['FilePmdMapped']['value']
288 stats['anon']['aligned'][PMD_ORDER] = max(0, stats['anon']['aligned'][PMD_ORDER] - kbnr(anon_pmd_mapped))
289 stats['file']['aligned'][PMD_ORDER] = max(0, stats['file']['aligned'][PMD_ORDER] - kbnr(file_pmd_mapped))
292 f"anon-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
293 f"file-thp-pmd-aligned-{odkb(PMD_ORDER)}kB": {'type': 'file', 'value': file_pmd_mapped},
def flatten_sub(type, subtype, stats):
    # Add one "<type>-thp-pte-<subtype>-<size>kB" entry to rstats for each
    # order in stats, converting the per-order page count to kB. Orders 0
    # and 1 are skipped; enumeration starts at order 2.
    param = f"{type}-thp-pte-{subtype}-{{}}kB"
    for od, nr in enumerate(stats[2:], 2):
        rstats[param.format(odkb(od))] = {'type': type, 'value': nrkb(nr)}
def flatten_type(type, stats):
    # Flatten the 'aligned', 'unaligned' and 'partial' counters held in stats
    # for one memory type into rstats, with values converted to kB.
    flatten_sub(type, 'aligned', stats['aligned'])
    flatten_sub(type, 'unaligned', stats['unaligned'])
    rstats[f"{type}-thp-pte-partial"] = {'type': type, 'value': nrkb(stats['partial'])}
306 flatten_type('anon', stats['anon'])
307 flatten_type('file', stats['file'])
312 def cont_parse(vma, order, ranges, anons, heads):
# Given the index, vfn (virtual frame number) and pfn (physical frame number)
# ranges of a page table backed by THPs, plus 2 same-sized arrays (anons: True
# if page is anonymous, heads: True if page is head of a THP), return a
# dictionary of statistics describing the contiguous blocks.
321 for rindex, rvfn, rpfn in zip(*ranges):
322 index_next = int(rindex[0])
323 index_end = int(rindex[1]) + 1
324 vfn_start = int(rvfn[0])
325 pfn_start = int(rpfn[0])
327 if align_offset(pfn_start, nr_cont) != align_offset(vfn_start, nr_cont):
330 off = align_forward(vfn_start, nr_cont) - vfn_start
333 while index_next + nr_cont <= index_end:
334 folio_boundary = heads[index_next+1:index_next+nr_cont].any()
335 if not folio_boundary:
336 if anons[index_next]:
340 index_next += nr_cont
# Account blocks that are PMD-mapped separately, so filter out of the stats.
# There is a race between acquiring the smaps stats and reading pagemap,
# where memory could be deallocated. So clamp to zero in case it would have
346 anon_pmd_mapped = vma.stats['AnonHugePages']['value']
347 file_pmd_mapped = vma.stats['ShmemPmdMapped']['value'] + \
348 vma.stats['FilePmdMapped']['value']
349 nr_anon = max(0, nr_anon - kbnr(anon_pmd_mapped))
350 nr_file = max(0, nr_file - kbnr(file_pmd_mapped))
353 f"anon-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'anon', 'value': anon_pmd_mapped},
354 f"file-cont-pmd-aligned-{nrkb(nr_cont)}kB": {'type': 'file', 'value': file_pmd_mapped},
357 rstats[f"anon-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'anon', 'value': nrkb(nr_anon)}
358 rstats[f"file-cont-pte-aligned-{nrkb(nr_cont)}kB"] = {'type': 'file', 'value': nrkb(nr_file)}
363 def vma_print(vma, pid):
364 # Prints a VMA instance in a format similar to smaps. The main difference is
365 # that the pid is included as the first value.
366 print("{:010d}: {:016x}-{:016x} {}{}{}{} {:08x} {:02x}:{:02x} {:08x} {}"
368 pid, vma.start, vma.end,
369 'r' if vma.read else '-', 'w' if vma.write else '-',
370 'x' if vma.execute else '-', 'p' if vma.private else 's',
371 vma.pgoff, vma.major, vma.minor, vma.inode, vma.name
375 def stats_print(stats, tot_anon, tot_file, inc_empty):
376 # Print a statistics dictionary.
378 for label, stat in stats.items():
380 value = stat['value']
381 if value or inc_empty:
382 pad = max(0, label_field - len(label) - 1)
383 if type == 'anon' and tot_anon > 0:
384 percent = f' ({value / tot_anon:3.0%})'
385 elif type == 'file' and tot_file > 0:
386 percent = f' ({value / tot_file:3.0%})'
389 print(f"{label}:{' ' * pad}{value:8} kB{percent}")
392 def vma_parse(vma, pagemap, kpageflags, contorders):
393 # Generate thp and cont statistics for a single VMA.
394 start = vma.start >> PAGE_SHIFT
395 end = vma.end >> PAGE_SHIFT
397 pmes = pagemap.get(start, end - start)
398 present = pmes & PM_PAGE_PRESENT != 0
399 pfns = pmes & PM_PFN_MASK
401 vfns = np.arange(start, end, dtype=np.uint64)
404 pfn_vec = cont_ranges_all([pfns], [pfns])[0]
405 flags = kpageflags.getv(pfn_vec)
406 anons = flags & KPF_ANON != 0
407 heads = flags & KPF_COMPOUND_HEAD != 0
408 thps = flags & KPF_THP != 0
415 indexes = np.arange(len(vfns), dtype=np.uint64)
416 ranges = cont_ranges_all([vfns, pfns], [indexes, vfns, pfns])
418 thpstats = thp_parse(vma, kpageflags, ranges, indexes, vfns, pfns, anons, heads)
419 contstats = [cont_parse(vma, order, ranges, anons, heads) for order in contorders]
421 tot_anon = vma.stats['Anonymous']['value']
422 tot_file = vma.stats['Rss']['value'] - tot_anon
426 **{k: v for s in contstats for k, v in s.items()}
427 }, tot_anon, tot_file
438 for walk_info in os.walk(args.cgroup):
439 cgroup = walk_info[0]
440 with open(f'{cgroup}/cgroup.procs') as pidfile:
441 for line in pidfile.readlines():
442 pids.add(int(line.strip()))
445 pids = pids.union(args.pid)
448 for pid in os.listdir('/proc'):
453 print(" PID START END PROT OFFSET DEV INODE OBJECT")
457 with PageMap(pid) as pagemap:
458 with KPageFlags() as kpageflags:
459 for vma in VMAList(pid, vma_all_stats if args.inc_smaps else vma_min_stats):
460 if (vma.read or vma.write or vma.execute) and vma.stats['Rss']['value'] > 0:
461 stats, vma_anon, vma_file = vma_parse(vma, pagemap, kpageflags, args.cont)
467 stats = {**vma.stats, **stats}
469 for k, v in stats.items():
471 assert(rollup[k]['type'] == v['type'])
472 rollup[k]['value'] += v['value']
475 rollup_anon += vma_anon
476 rollup_file += vma_file
479 stats_print(stats, vma_anon, vma_file, args.inc_empty)
480 except (FileNotFoundError, ProcessLookupError, FileIOException):
485 stats_print(rollup, rollup_anon, rollup_file, args.inc_empty)
489 docs_width = shutil.get_terminal_size().columns
491 docs_width = min(80, docs_width)
494 text = re.sub(r'\s+', ' ', string)
495 text = re.sub(r'\s*\\n\s*', '\n', text)
496 paras = text.split('\n')
497 paras = [textwrap.fill(p, width=docs_width) for p in paras]
498 return '\n'.join(paras)
501 return argparse.RawDescriptionHelpFormatter(prog, width=docs_width)
503 def size2order(human):
505 "K": 2**10, "M": 2**20, "G": 2**30,
506 "k": 2**10, "m": 2**20, "g": 2**30,
509 if human[-1] in units:
510 unit = units[human[-1]]
515 raise ArgException('error: --cont value must be integer size with optional KMG unit')
517 order = int(math.log2(size / PAGE_SIZE))
519 raise ArgException('error: --cont value must be size of at least 2 pages')
520 if (1 << order) * PAGE_SIZE != size:
521 raise ArgException('error: --cont value must be size of power-of-2 pages')
522 if order > PMD_ORDER:
523 raise ArgException('error: --cont value must be less than or equal to PMD order')
526 parser = argparse.ArgumentParser(formatter_class=formatter,
527 description=format("""Prints information about how transparent huge
528 pages are mapped, either system-wide, or for a specified
529 process or cgroup.\\n
531 When run with --pid, the user explicitly specifies the set
532 of pids to scan. e.g. "--pid 10 [--pid 134 ...]". When run
533 with --cgroup, the user passes either a v1 or v2 cgroup and
534 all pids that belong to the cgroup subtree are scanned. When
535 run with neither --pid nor --cgroup, the full set of pids on
536 the system is gathered from /proc and scanned as if the user
537 had provided "--pid 1 --pid 2 ...".\\n
539 A default set of statistics is always generated for THP
540 mappings. However, it is also possible to generate
541 additional statistics for "contiguous block mappings" where
542 the block size is user-defined.\\n
544 Statistics are maintained independently for anonymous and
545 file-backed (pagecache) memory and are shown both in kB and
546 as a percentage of either total anonymous or total
547 file-backed memory as appropriate.\\n
552 Statistics are always generated for fully- and
553 contiguously-mapped THPs whose mapping address is aligned to
554 their size, for each <size> supported by the system.
555 Separate counters describe THPs mapped by PTE vs those
556 mapped by PMD. (Although note a THP can only be mapped by
557 PMD if it is PMD-sized):\\n
559 - anon-thp-pte-aligned-<size>kB\\n
560 - file-thp-pte-aligned-<size>kB\\n
561 - anon-thp-pmd-aligned-<size>kB\\n
562 - file-thp-pmd-aligned-<size>kB\\n
564 Similarly, statistics are always generated for fully- and
565 contiguously-mapped THPs whose mapping address is *not*
566 aligned to their size, for each <size> supported by the
567 system. Due to the unaligned mapping, it is impossible to
568 map by PMD, so there are only PTE counters for this case:\\n
570 - anon-thp-pte-unaligned-<size>kB\\n
571 - file-thp-pte-unaligned-<size>kB\\n
573 Statistics are also always generated for mapped pages that
574 belong to a THP but where the is THP is *not* fully- and
575 contiguously- mapped. These "partial" mappings are all
576 counted in the same counter regardless of the size of the
577 THP that is partially mapped:\\n
579 - anon-thp-pte-partial\\n
580 - file-thp-pte-partial\\n
582 Contiguous Block Statistics\\n
583 ---------------------------\\n
585 An optional, additional set of statistics is generated for
586 every contiguous block size specified with `--cont <size>`.
587 These statistics show how much memory is mapped in
588 contiguous blocks of <size> and also aligned to <size>. A
589 given contiguous block must all belong to the same THP, but
590 there is no requirement for it to be the *whole* THP.
591 Separate counters describe contiguous blocks mapped by PTE
592 vs those mapped by PMD:\\n
594 - anon-cont-pte-aligned-<size>kB\\n
595 - file-cont-pte-aligned-<size>kB\\n
596 - anon-cont-pmd-aligned-<size>kB\\n
597 - file-cont-pmd-aligned-<size>kB\\n
599 As an example, if monitoring 64K contiguous blocks (--cont
600 64K), there are a number of sources that could provide such
601 blocks: a fully- and contiguously-mapped 64K THP that is
602 aligned to a 64K boundary would provide 1 block. A fully-
603 and contiguously-mapped 128K THP that is aligned to at least
604 a 64K boundary would provide 2 blocks. Or a 128K THP that
605 maps its first 100K, but contiguously and starting at a 64K
606 boundary would provide 1 block. A fully- and
607 contiguously-mapped 2M THP would provide 32 blocks. There
608 are many other possible permutations.\\n"""),
609 epilog=format("""Requires root privilege to access pagemap and
612 group = parser.add_mutually_exclusive_group(required=False)
613 group.add_argument('--pid',
614 metavar='pid', required=False, type=int, default=[], action='append',
615 help="""Process id of the target process. Maybe issued multiple times to
616 scan multiple processes. --pid and --cgroup are mutually exclusive.
617 If neither are provided, all processes are scanned to provide
618 system-wide information.""")
620 group.add_argument('--cgroup',
621 metavar='path', required=False,
622 help="""Path to the target cgroup in sysfs. Iterates over every pid in
623 the cgroup and its children. --pid and --cgroup are mutually
624 exclusive. If neither are provided, all processes are scanned to
625 provide system-wide information.""")
627 parser.add_argument('--rollup',
628 required=False, default=False, action='store_true',
629 help="""Sum the per-vma statistics to provide a summary over the whole
630 system, process or cgroup.""")
632 parser.add_argument('--cont',
633 metavar='size[KMG]', required=False, default=[], action='append',
634 help="""Adds stats for memory that is mapped in contiguous blocks of
635 <size> and also aligned to <size>. May be issued multiple times to
636 track multiple sized blocks. Useful to infer e.g. arm64 contpte and
637 hpa mappings. Size must be a power-of-2 number of pages.""")
639 parser.add_argument('--inc-smaps',
640 required=False, default=False, action='store_true',
641 help="""Include all numerical, additive /proc/<pid>/smaps stats in the
644 parser.add_argument('--inc-empty',
645 required=False, default=False, action='store_true',
646 help="""Show all statistics including those whose value is 0.""")
648 parser.add_argument('--periodic',
649 metavar='sleep_ms', required=False, type=int,
650 help="""Run in a loop, polling every sleep_ms milliseconds.""")
652 args = parser.parse_args()
655 args.cont = [size2order(cont) for cont in args.cont]
656 except ArgException as e:
664 time.sleep(args.periodic / 1000)
669 if __name__ == "__main__":
672 except Exception as e:
673 prog = os.path.basename(sys.argv[0])
674 print(f'{prog}: {e}')