pkgs/build-support/references-by-popularity/closure-graph.py

   1 # IMPORTANT: Making changes?
   2 #
   3 # Validate your changes with python3 ./closure-graph.py --test
   4
   5
   6 # Using a simple algorithm, convert the references to a path in to a
   7 # sorted list of dependent paths based on how often they're referenced
   8 # and how deep in the tree they live. Equally-"popular" paths are then
   9 # sorted by name.
  10 #
  11 # The existing writeClosure prints the paths in a simple ascii-based
  12 # sorting of the paths.
  13 #
  14 # Sorting the paths by graph improves the chances that the difference
  15 # between two builds appear near the end of the list, instead of near
  16 # the beginning. This makes a difference for Nix builds which export a
  17 # closure for another program to consume, if that program implements its
  18 # own level of binary diffing.
  19 #
  20 # For an example, Docker Images. If each store path is a separate layer
  21 # then Docker Images can be very efficiently transfered between systems,
  22 # and we get very good cache reuse between images built with the same
  23 # version of Nixpkgs. However, since Docker only reliably supports a
  24 # small number of layers (42) it is important to pick the individual
  25 # layers carefully. By storing very popular store paths in the first 40
  26 # layers, we improve the chances that the next Docker image will share
  27 # many of those layers.*
  28 #
  29 # Given the dependency tree:
  30 #
  31 #     A - B - C - D -\
  32 #      \   \   \      \
  33 #       \   \   \      \
  34 #        \   \ - E ---- F
  35 #         \- G
  36 #
  37 # Nodes which have multiple references are duplicated:
  38 #
  39 #     A - B - C - D - F
  40 #      \   \   \
  41 #       \   \   \- E - F
  42 #        \   \
  43 #         \   \- E - F
  44 #          \
  45 #           \- G
  46 #
  47 # Each leaf node is now replaced by a counter defaulted to 1:
  48 #
  49 #     A - B - C - D - (F:1)
  50 #      \   \   \
  51 #       \   \   \- E - (F:1)
  52 #        \   \
  53 #         \   \- E - (F:1)
  54 #          \
  55 #           \- (G:1)
  56 #
  57 # Then each leaf counter is merged with its parent node, replacing the
  58 # parent node with a counter of 1, and each existing counter being
  59 # incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
  60 #
  61 #     A - B - C - (D:1, F:2)
  62 #      \   \   \
  63 #       \   \   \- (E:1, F:2)
  64 #        \   \
  65 #         \   \- (E:1, F:2)
  66 #          \
  67 #           \- (G:1)
  68 #
  69 # Then each leaf counter is merged with its parent node again, merging
  70 # any counters, then incrementing each:
  71 #
  72 #     A - B - (C:1, D:2, E:2, F:5)
  73 #      \   \
  74 #       \   \- (E:1, F:2)
  75 #        \
  76 #         \- (G:1)
  77 #
  78 # And again:
  79 #
  80 #     A - (B:1, C:2, D:3, E:4, F:8)
  81 #      \
  82 #       \- (G:1)
  83 #
  84 # And again:
  85 #
  86 #     (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
  87 #
  88 # and then paths have the following "popularity":
  89 #
  90 #     A     1
  91 #     B     2
  92 #     C     3
  93 #     D     4
  94 #     E     5
  95 #     F     9
  96 #     G     2
  97 #
  98 # and the popularity contest would result in the paths being printed as:
  99 #
 100 #     F
 101 #     E
 102 #     D
 103 #     C
 104 #     B
 105 #     G
 106 #     A
 107 #
 108 # * Note: People who have used a Dockerfile before assume Docker's
 109 # Layers are inherently ordered. However, this is not true -- Docker
 110 # layers are content-addressable and are not explicitly layered until
 111 # they are composed in to an Image.
 112
 113 import sys
 114 import json
 115 import unittest
 116
 117 from pprint import pprint
 118 from collections import defaultdict
 119
 120
 121 def debug(msg, *args, **kwargs):
 122     if False:
 123         print(
 124             "DEBUG: {}".format(
 125                 msg.format(*args, **kwargs)
 126             ),
 127             file=sys.stderr
 128         )
 129
 130
 131 # Find paths in the original dataset which are never referenced by
 132 # any other paths
 133 def find_roots(closures):
 134     roots = [];
 135
 136     for closure in closures:
 137         path = closure['path']
 138         if not any_refer_to(path, closures):
 139             roots.append(path)
 140
 141     return roots
 142
 143 class TestFindRoots(unittest.TestCase):
 144     def test_find_roots(self):
 145         self.assertCountEqual(
 146             find_roots([
 147                 {
 148                     "path": "/nix/store/foo",
 149                     "references": [
 150                         "/nix/store/foo",
 151                         "/nix/store/bar"
 152                     ]
 153                 },
 154                 {
 155                     "path": "/nix/store/bar",
 156                     "references": [
 157                         "/nix/store/bar",
 158                         "/nix/store/tux"
 159                     ]
 160                 },
 161                 {
 162                     "path": "/nix/store/hello",
 163                     "references": [
 164                     ]
 165                 }
 166             ]),
 167             ["/nix/store/foo", "/nix/store/hello"]
 168         )
 169
 170
 171 def any_refer_to(path, closures):
 172     for closure in closures:
 173         if path != closure['path']:
 174             if path in closure['references']:
 175                 return True
 176     return False
 177
 178 class TestAnyReferTo(unittest.TestCase):
 179     def test_has_references(self):
 180         self.assertTrue(
 181             any_refer_to(
 182                 "/nix/store/bar",
 183                 [
 184                     {
 185                         "path": "/nix/store/foo",
 186                         "references": [
 187                             "/nix/store/bar"
 188                         ]
 189                     },
 190                 ]
 191             ),
 192         )
 193     def test_no_references(self):
 194         self.assertFalse(
 195             any_refer_to(
 196                 "/nix/store/foo",
 197                 [
 198                     {
 199                         "path": "/nix/store/foo",
 200                         "references": [
 201                             "/nix/store/foo",
 202                             "/nix/store/bar"
 203                         ]
 204                     },
 205                 ]
 206             ),
 207         )
 208
 209 def all_paths(closures):
 210     paths = []
 211     for closure in closures:
 212         paths.append(closure['path'])
 213         paths.extend(closure['references'])
 214     paths.sort()
 215     return list(set(paths))
 216
 217
 218 class TestAllPaths(unittest.TestCase):
 219     def test_returns_all_paths(self):
 220         self.assertCountEqual(
 221             all_paths([
 222                 {
 223                     "path": "/nix/store/foo",
 224                     "references": [
 225                         "/nix/store/foo",
 226                         "/nix/store/bar"
 227                     ]
 228                 },
 229                 {
 230                     "path": "/nix/store/bar",
 231                     "references": [
 232                         "/nix/store/bar",
 233                         "/nix/store/tux"
 234                     ]
 235                 },
 236                 {
 237                     "path": "/nix/store/hello",
 238                     "references": [
 239                     ]
 240                 }
 241             ]),
 242             ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",]
 243         )
 244     def test_no_references(self):
 245         self.assertFalse(
 246             any_refer_to(
 247                 "/nix/store/foo",
 248                 [
 249                     {
 250                         "path": "/nix/store/foo",
 251                         "references": [
 252                             "/nix/store/foo",
 253                             "/nix/store/bar"
 254                         ]
 255                     },
 256                 ]
 257             ),
 258         )
 259
 260 # Convert:
 261 #
 262 # [
 263 #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
 264 #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
 265 #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
 266 #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
 267 #  ]
 268 #
 269 # To:
 270 #    {
 271 #      /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
 272 #      /nix/store/bar: [ /nix/store/baz ],
 273 #      /nix/store/baz: [ /nix/store/tux ] },
 274 #      /nix/store/tux: [ ]
 275 #    }
 276 #
 277 # Note that it drops self-references to avoid loops.
 278 def make_lookup(closures):
 279     lookup = {}
 280
 281     for closure in closures:
 282         # paths often self-refer
 283         nonreferential_paths = [ref for ref in closure['references'] if ref != closure['path']]
 284         lookup[closure['path']] = nonreferential_paths
 285
 286     return lookup
 287
 288 class TestMakeLookup(unittest.TestCase):
 289     def test_returns_lookp(self):
 290         self.assertDictEqual(
 291             make_lookup([
 292                 {
 293                     "path": "/nix/store/foo",
 294                     "references": [
 295                         "/nix/store/foo",
 296                         "/nix/store/bar"
 297                     ]
 298                 },
 299                 {
 300                     "path": "/nix/store/bar",
 301                     "references": [
 302                         "/nix/store/bar",
 303                         "/nix/store/tux"
 304                     ]
 305                 },
 306                 {
 307                     "path": "/nix/store/hello",
 308                     "references": [
 309                     ]
 310                 }
 311             ]),
 312             {
 313                 "/nix/store/foo": [ "/nix/store/bar" ],
 314                 "/nix/store/bar": [ "/nix/store/tux" ],
 315                 "/nix/store/hello": [ ],
 316             }
 317         )
 318
 319 # Convert:
 320 #
 321 # /nix/store/foo with
 322 #  {
 323 #    /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
 324 #    /nix/store/bar: [ /nix/store/baz ],
 325 #    /nix/store/baz: [ /nix/store/tux ] },
 326 #    /nix/store/tux: [ ]
 327 #  }
 328 #
 329 # To:
 330 #
 331 # {
 332 #   /nix/store/bar: {
 333 #                    /nix/store/baz: {
 334 #                                     /nix/store/tux: {}
 335 #                    }
 336 #   },
 337 #   /nix/store/baz: {
 338 #                   /nix/store/tux: {}
 339 #   }
 340 # }
 341 subgraphs_cache = {}
 342 def make_graph_segment_from_root(root, lookup):
 343     global subgraphs_cache
 344     children = {}
 345     for ref in lookup[root]:
 346         # make_graph_segment_from_root is a pure function, and will
 347         # always return the same result based on a given input. Thus,
 348         # cache computation.
 349         #
 350         # Python's assignment will use a pointer, preventing memory
 351         # bloat for large graphs.
 352         if ref not in subgraphs_cache:
 353             debug("Subgraph Cache miss on {}".format(ref))
 354             subgraphs_cache[ref] = make_graph_segment_from_root(ref, lookup)
 355         else:
 356             debug("Subgraph Cache hit on {}".format(ref))
 357         children[ref] = subgraphs_cache[ref]
 358     return children
 359
 360 class TestMakeGraphSegmentFromRoot(unittest.TestCase):
 361     def test_returns_graph(self):
 362         self.assertDictEqual(
 363             make_graph_segment_from_root("/nix/store/foo", {
 364                 "/nix/store/foo": [ "/nix/store/bar" ],
 365                 "/nix/store/bar": [ "/nix/store/tux" ],
 366                 "/nix/store/tux": [ ],
 367                 "/nix/store/hello": [ ],
 368             }),
 369             {
 370                 "/nix/store/bar": {
 371                     "/nix/store/tux": {}
 372                 }
 373             }
 374         )
 375     def test_returns_graph_tiny(self):
 376         self.assertDictEqual(
 377             make_graph_segment_from_root("/nix/store/tux", {
 378                 "/nix/store/foo": [ "/nix/store/bar" ],
 379                 "/nix/store/bar": [ "/nix/store/tux" ],
 380                 "/nix/store/tux": [ ],
 381             }),
 382             {}
 383         )
 384
 385 # Convert a graph segment in to a popularity-counted dictionary:
 386 #
 387 # From:
 388 # {
 389 #    /nix/store/foo: {
 390 #                      /nix/store/bar: {
 391 #                                        /nix/store/baz: {
 392 #                                                           /nix/store/tux: {}
 393 #                                        }
 394 #                      }
 395 #                      /nix/store/baz: {
 396 #                                         /nix/store/tux: {}
 397 #                      }
 398 #    }
 399 # }
 400 #
 401 # to:
 402 # [
 403 #   /nix/store/foo: 1
 404 #   /nix/store/bar: 2
 405 #   /nix/store/baz: 4
 406 #   /nix/store/tux: 6
 407 # ]
 408 popularity_cache = {}
 409 def graph_popularity_contest(full_graph):
 410     global popularity_cache
 411     popularity = defaultdict(int)
 412     for path, subgraph in full_graph.items():
 413         popularity[path] += 1
 414         # graph_popularity_contest is a pure function, and will
 415         # always return the same result based on a given input. Thus,
 416         # cache computation.
 417         #
 418         # Python's assignment will use a pointer, preventing memory
 419         # bloat for large graphs.
 420         if path not in popularity_cache:
 421             debug("Popularity Cache miss on {}", path)
 422             popularity_cache[path] = graph_popularity_contest(subgraph)
 423         else:
 424             debug("Popularity Cache hit on {}", path)
 425
 426         subcontest = popularity_cache[path]
 427         for subpath, subpopularity in subcontest.items():
 428             debug("Calculating popularity for {}", subpath)
 429             popularity[subpath] += subpopularity + 1
 430
 431     return popularity
 432
 433 class TestGraphPopularityContest(unittest.TestCase):
 434     def test_counts_popularity(self):
 435         self.assertDictEqual(
 436             graph_popularity_contest({
 437                 "/nix/store/foo": {
 438                     "/nix/store/bar": {
 439                         "/nix/store/baz": {
 440                             "/nix/store/tux": {}
 441                         }
 442                     },
 443                     "/nix/store/baz": {
 444                         "/nix/store/tux": {}
 445                     }
 446                 }
 447             }),
 448             {
 449                    "/nix/store/foo": 1,
 450                    "/nix/store/bar": 2,
 451                    "/nix/store/baz": 4,
 452                    "/nix/store/tux": 6,
 453             }
 454         )
 455
 456 # Emit a list of packages by popularity, most first:
 457 #
 458 # From:
 459 # [
 460 #   /nix/store/foo: 1
 461 #   /nix/store/bar: 1
 462 #   /nix/store/baz: 2
 463 #   /nix/store/tux: 2
 464 # ]
 465 #
 466 # To:
 467 # [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
 468 def order_by_popularity(paths):
 469     paths_by_popularity = defaultdict(list)
 470     popularities = []
 471     for path, popularity in paths.items():
 472         popularities.append(popularity)
 473         paths_by_popularity[popularity].append(path)
 474
 475     popularities = list(set(popularities))
 476     popularities.sort()
 477
 478     flat_ordered = []
 479     for popularity in popularities:
 480         paths = paths_by_popularity[popularity]
 481         paths.sort(key=package_name)
 482
 483         flat_ordered.extend(reversed(paths))
 484     return list(reversed(flat_ordered))
 485
 486
 487 class TestOrderByPopularity(unittest.TestCase):
 488     def test_returns_in_order(self):
 489         self.assertEqual(
 490             order_by_popularity({
 491                    "/nix/store/foo": 1,
 492                    "/nix/store/bar": 1,
 493                    "/nix/store/baz": 2,
 494                    "/nix/store/tux": 2,
 495             }),
 496             [
 497                 "/nix/store/baz",
 498                 "/nix/store/tux",
 499                 "/nix/store/bar",
 500                 "/nix/store/foo"
 501             ]
 502         )
 503
 504 def package_name(path):
 505     parts = path.split('-')
 506     start = parts.pop(0)
 507     # don't throw away any data, so the order is always the same.
 508     # even in cases where only the hash at the start has changed.
 509     parts.append(start)
 510     return '-'.join(parts)
 511
 512 def main():
 513     filename = sys.argv[1]
 514     key = sys.argv[2]
 515
 516     debug("Loading from {}", filename)
 517     with open(filename) as f:
 518         data = json.load(f)
 519
 520     # Data comes in as:
 521     # [
 522     #    { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
 523     #    { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
 524     #    { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
 525     #    { path: /nix/store/tux, references: [ /nix/store/tux ] }
 526     #  ]
 527     #
 528     # and we want to get out a list of paths ordered by how universally,
 529     # important they are, ie: tux is referenced by every path, transitively
 530     # so it should be #1
 531     #
 532     # [
 533     #   /nix/store/tux,
 534     #   /nix/store/baz,
 535     #   /nix/store/bar,
 536     #   /nix/store/foo,
 537     # ]
 538     graph = data[key]
 539
 540     debug("Finding roots from {}", key)
 541     roots = find_roots(graph);
 542     debug("Making lookup for {}", key)
 543     lookup = make_lookup(graph)
 544
 545     full_graph = {}
 546     for root in roots:
 547         debug("Making full graph for {}", root)
 548         full_graph[root] = make_graph_segment_from_root(root, lookup)
 549
 550     debug("Running contest")
 551     contest = graph_popularity_contest(full_graph)
 552     debug("Ordering by popularity")
 553     ordered = order_by_popularity(contest)
 554     debug("Checking for missing paths")
 555     missing = []
 556     for path in all_paths(graph):
 557         if path not in ordered:
 558             missing.append(path)
 559
 560     ordered.extend(missing)
 561     print("\n".join(ordered))
 562
 563 if "--test" in sys.argv:
 564     # Don't pass --test otherwise unittest gets mad
 565     unittest.main(argv = [f for f in sys.argv if f != "--test" ])
 566 else:
 567     main()