1 # IMPORTANT: Making changes?
3 # Validate your changes with python3 ./closure-graph.py --test
6 # Using a simple algorithm, convert the references to a path in to a
7 # sorted list of dependent paths based on how often they're referenced
8 # and how deep in the tree they live. Equally-"popular" paths are then
11 # The existing writeClosure prints the paths in a simple ascii-based
12 # sorting of the paths.
14 # Sorting the paths by graph improves the chances that the difference
15 # between two builds appear near the end of the list, instead of near
16 # the beginning. This makes a difference for Nix builds which export a
17 # closure for another program to consume, if that program implements its
18 # own level of binary diffing.
20 # For an example, Docker Images. If each store path is a separate layer
21 # then Docker Images can be very efficiently transfered between systems,
22 # and we get very good cache reuse between images built with the same
23 # version of Nixpkgs. However, since Docker only reliably supports a
24 # small number of layers (42) it is important to pick the individual
25 # layers carefully. By storing very popular store paths in the first 40
26 # layers, we improve the chances that the next Docker image will share
27 # many of those layers.*
29 # Given the dependency tree:
37 # Nodes which have multiple references are duplicated:
47 # Each leaf node is now replaced by a counter defaulted to 1:
49 # A - B - C - D - (F:1)
57 # Then each leaf counter is merged with its parent node, replacing the
58 # parent node with a counter of 1, and each existing counter being
59 # incremented by 1. That is to say `- D - (F:1)` becomes `- (D:1, F:2)`:
61 # A - B - C - (D:1, F:2)
69 # Then each leaf counter is merged with its parent node again, merging
70 # any counters, then incrementing each:
72 # A - B - (C:1, D:2, E:2, F:5)
80 # A - (B:1, C:2, D:3, E:4, F:8)
86 # (A:1, B:2, C:3, D:4, E:5, F:9, G:2)
88 # and then paths have the following "popularity":
98 # and the popularity contest would result in the paths being printed as:
108 # * Note: People who have used a Dockerfile before assume Docker's
109 # Layers are inherently ordered. However, this is not true -- Docker
110 # layers are content-addressable and are not explicitly layered until
111 # they are composed in to an Image.
117 from pprint
import pprint
118 from collections
import defaultdict
121 def debug(msg
, *args
, **kwargs
):
125 msg
.format(*args
, **kwargs
)
131 # Find paths in the original dataset which are never referenced by
133 def find_roots(closures
):
136 for closure
in closures
:
137 path
= closure
['path']
138 if not any_refer_to(path
, closures
):
143 class TestFindRoots(unittest
.TestCase
):
144 def test_find_roots(self
):
145 self
.assertCountEqual(
148 "path": "/nix/store/foo",
155 "path": "/nix/store/bar",
162 "path": "/nix/store/hello",
167 ["/nix/store/foo", "/nix/store/hello"]
171 def any_refer_to(path
, closures
):
172 for closure
in closures
:
173 if path
!= closure
['path']:
174 if path
in closure
['references']:
178 class TestAnyReferTo(unittest
.TestCase
):
179 def test_has_references(self
):
185 "path": "/nix/store/foo",
193 def test_no_references(self
):
199 "path": "/nix/store/foo",
209 def all_paths(closures
):
211 for closure
in closures
:
212 paths
.append(closure
['path'])
213 paths
.extend(closure
['references'])
215 return list(set(paths
))
218 class TestAllPaths(unittest
.TestCase
):
219 def test_returns_all_paths(self
):
220 self
.assertCountEqual(
223 "path": "/nix/store/foo",
230 "path": "/nix/store/bar",
237 "path": "/nix/store/hello",
242 ["/nix/store/foo", "/nix/store/bar", "/nix/store/hello", "/nix/store/tux",]
244 def test_no_references(self
):
250 "path": "/nix/store/foo",
263 # { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
264 # { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
265 # { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
266 # { path: /nix/store/tux, references: [ /nix/store/tux ] }
271 # /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
272 # /nix/store/bar: [ /nix/store/baz ],
273 # /nix/store/baz: [ /nix/store/tux ] },
274 # /nix/store/tux: [ ]
277 # Note that it drops self-references to avoid loops.
278 def make_lookup(closures
):
281 for closure
in closures
:
282 # paths often self-refer
283 nonreferential_paths
= [ref
for ref
in closure
['references'] if ref
!= closure
['path']]
284 lookup
[closure
['path']] = nonreferential_paths
288 class TestMakeLookup(unittest
.TestCase
):
289 def test_returns_lookp(self
):
290 self
.assertDictEqual(
293 "path": "/nix/store/foo",
300 "path": "/nix/store/bar",
307 "path": "/nix/store/hello",
313 "/nix/store/foo": [ "/nix/store/bar" ],
314 "/nix/store/bar": [ "/nix/store/tux" ],
315 "/nix/store/hello": [ ],
321 # /nix/store/foo with
323 # /nix/store/foo: [ /nix/store/bar, /nix/store/baz ],
324 # /nix/store/bar: [ /nix/store/baz ],
325 # /nix/store/baz: [ /nix/store/tux ] },
326 # /nix/store/tux: [ ]
342 def make_graph_segment_from_root(root
, lookup
):
343 global subgraphs_cache
345 for ref
in lookup
[root
]:
346 # make_graph_segment_from_root is a pure function, and will
347 # always return the same result based on a given input. Thus,
350 # Python's assignment will use a pointer, preventing memory
351 # bloat for large graphs.
352 if ref
not in subgraphs_cache
:
353 debug("Subgraph Cache miss on {}".format(ref
))
354 subgraphs_cache
[ref
] = make_graph_segment_from_root(ref
, lookup
)
356 debug("Subgraph Cache hit on {}".format(ref
))
357 children
[ref
] = subgraphs_cache
[ref
]
360 class TestMakeGraphSegmentFromRoot(unittest
.TestCase
):
361 def test_returns_graph(self
):
362 self
.assertDictEqual(
363 make_graph_segment_from_root("/nix/store/foo", {
364 "/nix/store/foo": [ "/nix/store/bar" ],
365 "/nix/store/bar": [ "/nix/store/tux" ],
366 "/nix/store/tux": [ ],
367 "/nix/store/hello": [ ],
375 def test_returns_graph_tiny(self
):
376 self
.assertDictEqual(
377 make_graph_segment_from_root("/nix/store/tux", {
378 "/nix/store/foo": [ "/nix/store/bar" ],
379 "/nix/store/bar": [ "/nix/store/tux" ],
380 "/nix/store/tux": [ ],
385 # Convert a graph segment in to a popularity-counted dictionary:
408 popularity_cache
= {}
409 def graph_popularity_contest(full_graph
):
410 global popularity_cache
411 popularity
= defaultdict(int)
412 for path
, subgraph
in full_graph
.items():
413 popularity
[path
] += 1
414 # graph_popularity_contest is a pure function, and will
415 # always return the same result based on a given input. Thus,
418 # Python's assignment will use a pointer, preventing memory
419 # bloat for large graphs.
420 if path
not in popularity_cache
:
421 debug("Popularity Cache miss on {}", path
)
422 popularity_cache
[path
] = graph_popularity_contest(subgraph
)
424 debug("Popularity Cache hit on {}", path
)
426 subcontest
= popularity_cache
[path
]
427 for subpath
, subpopularity
in subcontest
.items():
428 debug("Calculating popularity for {}", subpath
)
429 popularity
[subpath
] += subpopularity
+ 1
433 class TestGraphPopularityContest(unittest
.TestCase
):
434 def test_counts_popularity(self
):
435 self
.assertDictEqual(
436 graph_popularity_contest({
456 # Emit a list of packages by popularity, most first:
467 # [ /nix/store/baz /nix/store/tux /nix/store/bar /nix/store/foo ]
468 def order_by_popularity(paths
):
469 paths_by_popularity
= defaultdict(list)
471 for path
, popularity
in paths
.items():
472 popularities
.append(popularity
)
473 paths_by_popularity
[popularity
].append(path
)
475 popularities
= list(set(popularities
))
479 for popularity
in popularities
:
480 paths
= paths_by_popularity
[popularity
]
481 paths
.sort(key
=package_name
)
483 flat_ordered
.extend(reversed(paths
))
484 return list(reversed(flat_ordered
))
487 class TestOrderByPopularity(unittest
.TestCase
):
488 def test_returns_in_order(self
):
490 order_by_popularity({
504 def package_name(path
):
505 parts
= path
.split('-')
507 # don't throw away any data, so the order is always the same.
508 # even in cases where only the hash at the start has changed.
510 return '-'.join(parts
)
513 filename
= sys
.argv
[1]
516 debug("Loading from {}", filename
)
517 with
open(filename
) as f
:
522 # { path: /nix/store/foo, references: [ /nix/store/foo, /nix/store/bar, /nix/store/baz ] },
523 # { path: /nix/store/bar, references: [ /nix/store/bar, /nix/store/baz ] },
524 # { path: /nix/store/baz, references: [ /nix/store/baz, /nix/store/tux ] },
525 # { path: /nix/store/tux, references: [ /nix/store/tux ] }
528 # and we want to get out a list of paths ordered by how universally,
529 # important they are, ie: tux is referenced by every path, transitively
540 debug("Finding roots from {}", key
)
541 roots
= find_roots(graph
);
542 debug("Making lookup for {}", key
)
543 lookup
= make_lookup(graph
)
547 debug("Making full graph for {}", root
)
548 full_graph
[root
] = make_graph_segment_from_root(root
, lookup
)
550 debug("Running contest")
551 contest
= graph_popularity_contest(full_graph
)
552 debug("Ordering by popularity")
553 ordered
= order_by_popularity(contest
)
554 debug("Checking for missing paths")
556 for path
in all_paths(graph
):
557 if path
not in ordered
:
560 ordered
.extend(missing
)
561 print("\n".join(ordered
))
563 if "--test" in sys
.argv
:
564 # Don't pass --test otherwise unittest gets mad
565 unittest
.main(argv
= [f
for f
in sys
.argv
if f
!= "--test" ])