1 """
2 This script generates a Docker image from a set of store paths. Uses
3 Docker Image Specification v1.2 as reference [1].
5 It expects a JSON file with the following properties and writes the
6 image as an uncompressed tarball to stdout:
8 * "architecture", "config", "os", "created", "repo_tag" correspond to
9 the fields with the same name on the image spec [2].
10 * "created" can be "now".
11 * "created" is also used as mtime for files added to the image.
12 * "uid", "gid", "uname", "gname" is the file ownership, for example,
13 0, 0, "root", "root".
14 * "store_layers" is a list of layers in ascending order, where each
15 layer is the list of store paths to include in that layer.
17 The main challenge for this script to create the final image in a
18 streaming fashion, without dumping any intermediate data to disk
19 for performance.
21 A docker image has each layer contents archived as separate tarballs,
22 and they later all get enveloped into a single big tarball in a
23 content addressed fashion. However, because how "tar" format works,
24 we have to know about the name (which includes the checksum in our
25 case) and the size of the tarball before we can start adding it to the
26 outer tarball. We achieve that by creating the layer tarballs twice;
27 on the first iteration we calculate the file size and the checksum,
28 and on the second one we actually stream the contents. 'add_layer_dir'
29 function does all this.
31 [1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
32 [2]: https://github.com/moby/moby/blob/4fb59c20a4fb54f944fe170d0ff1d00eb4a24d6f/image/spec/v1.2.md#image-json-field-descriptions
33 """ # noqa: E501

import argparse
import io
import os
import re
import sys
import json
import hashlib
import pathlib
import tarfile
import itertools
import threading
from datetime import datetime, timezone
from collections import namedtuple

def archive_paths_to(obj, paths, mtime, uid, gid, uname, gname):
    """
    Writes the given store paths as a tar file to the given stream.

    obj: Stream to write to. Should have a 'write' method.
    paths: List of store paths.
    """

    # gettarinfo makes the paths relative; this makes them
    # absolute again
    def append_root(ti):
        ti.name = "/" + ti.name
        return ti

    def apply_filters(ti):
        ti.mtime = mtime
        ti.uid = uid
        ti.gid = gid
        ti.uname = uname
        ti.gname = gname
        return ti

    def nix_root(ti):
        ti.mode = 0o0755  # rwxr-xr-x
        return ti

    def dir(path):
        ti = tarfile.TarInfo(path)
        ti.type = tarfile.DIRTYPE
        return ti

    with tarfile.open(fileobj=obj, mode="w|") as tar:
        # To be consistent with the docker utilities, we need to have
        # these directories first when building layer tarballs.
        tar.addfile(apply_filters(nix_root(dir("/nix"))))
        tar.addfile(apply_filters(nix_root(dir("/nix/store"))))

        for path in paths:
            path = pathlib.Path(path)
            if path.is_symlink():
                files = [path]
            else:
                files = itertools.chain([path], path.rglob("*"))

            for filename in sorted(files):
                ti = append_root(tar.gettarinfo(filename))

                # copy hardlinks as regular files
                if ti.islnk():
                    ti.type = tarfile.REGTYPE
                    ti.linkname = ""
                    ti.size = filename.stat().st_size

                ti = apply_filters(ti)
                if ti.isfile():
                    with open(filename, "rb") as f:
                        tar.addfile(ti, f)
                else:
                    tar.addfile(ti)
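
# Illustrative only: a hypothetical helper (not used by this script)
# showing how 'archive_paths_to' can be driven with any writable stream.
# Unlike the script itself, this buffers the whole tar in memory.
def _example_archive_to_buffer(paths):
    buf = io.BytesIO()
    archive_paths_to(
        buf, paths, mtime=0, uid=0, gid=0, uname="root", gname="root"
    )
    return buf.getvalue()  # raw, uncompressed tar bytes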

class ExtractChecksum:
    """
    A writable stream which only calculates the final file size and
    sha256sum, while discarding the actual contents.
    """

    def __init__(self):
        self._digest = hashlib.sha256()
        self._size = 0

    def write(self, data):
        self._digest.update(data)
        self._size += len(data)

    def extract(self):
        """
        Returns: Hex-encoded sha256sum and size as a tuple.
        """
        return (self._digest.hexdigest(), self._size)
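
# Illustrative only: the first pass of the two-pass scheme in miniature.
# Archiving into an ExtractChecksum measures the layer without buffering
# it; a second, identical archiving run then streams the real bytes.
def _example_measure_layer(paths):
    ec = ExtractChecksum()
    archive_paths_to(
        ec, paths, mtime=0, uid=0, gid=0, uname="root", gname="root"
    )
    return ec.extract()  # -> (hex sha256, size in bytes)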

FromImage = namedtuple("FromImage", ["tar", "manifest_json", "image_json"])
# Some metadata for a layer
LayerInfo = namedtuple("LayerInfo", ["size", "checksum", "path", "paths"])


def load_from_image(from_image_str):
    """
    Loads the given base image, if any.

    from_image_str: Path to the base image archive.

    Returns: A 'FromImage' object with references to the loaded base image,
             or 'None' if no base image was provided.
    """
    if from_image_str is None:
        return None

    base_tar = tarfile.open(from_image_str)

    manifest_json_tarinfo = base_tar.getmember("manifest.json")
    with base_tar.extractfile(manifest_json_tarinfo) as f:
        manifest_json = json.load(f)

    image_json_tarinfo = base_tar.getmember(manifest_json[0]["Config"])
    with base_tar.extractfile(image_json_tarinfo) as f:
        image_json = json.load(f)

    return FromImage(base_tar, manifest_json, image_json)
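
# For reference, 'manifest.json' inside a 'docker save' style archive is
# a JSON array with one entry per image (names below are illustrative):
#
# [
#   {
#     "Config": "<sha256>.json",
#     "RepoTags": ["hello:latest"],
#     "Layers": ["<sha256>/layer.tar"]
#   }
# ]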

def add_base_layers(tar, from_image):
    """
    Adds the layers from the given base image to the final image.

    tar: 'tarfile.TarFile' object for new layers to be added to.
    from_image: 'FromImage' object with references to the loaded base image.
    """
    if from_image is None:
        print("No 'fromImage' provided", file=sys.stderr)
        return []

    layers = from_image.manifest_json[0]["Layers"]
    checksums = from_image.image_json["rootfs"]["diff_ids"]
    layers_checksums = zip(layers, checksums)

    for num, (layer, checksum) in enumerate(layers_checksums, start=1):
        layer_tarinfo = from_image.tar.getmember(layer)
        checksum = re.sub(r"^sha256:", "", checksum)

        tar.addfile(layer_tarinfo, from_image.tar.extractfile(layer_tarinfo))
        path = layer_tarinfo.path
        size = layer_tarinfo.size

        print("Adding base layer", num, "from", path, file=sys.stderr)
        yield LayerInfo(size=size, checksum=checksum, path=path, paths=[path])

    from_image.tar.close()

def overlay_base_config(from_image, final_config):
    """
    Overlays the final image 'config' JSON on top of selected defaults
    from the base image 'config' JSON.

    from_image: 'FromImage' object with references to the loaded base image.
    final_config: 'dict' object of the final image 'config' JSON.
    """
    if from_image is None:
        return final_config

    base_config = from_image.image_json["config"]

    # Preserve environment from base image
    final_env = base_config.get("Env", []) + final_config.get("Env", [])
    if final_env:
        # Resolve duplicates (last one wins) and format back as list
        resolved_env = {entry.split("=", 1)[0]: entry for entry in final_env}
        final_config["Env"] = list(resolved_env.values())
    return final_config
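
# Illustrative example of the "last one wins" Env merge above:
#
#   base:  ["PATH=/usr/bin", "LANG=C.UTF-8"]
#   final: ["PATH=/custom/bin"]
#
# The dict comprehension keys entries by variable name, so the base PATH
# is overwritten while keeping its position, yielding:
#
#   ["PATH=/custom/bin", "LANG=C.UTF-8"]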

def add_layer_dir(tar, paths, store_dir, mtime, uid, gid, uname, gname):
    """
    Appends given store paths to a TarFile object as a new layer.

    tar: 'tarfile.TarFile' object for the new layer to be added to.
    paths: List of store paths.
    store_dir: the root directory of the nix store
    mtime: 'mtime' of the added files and the layer tarball.
           Should be an integer representing a POSIX time.

    Returns: A 'LayerInfo' object containing some metadata of
             the layer added.
    """

    invalid_paths = [i for i in paths if not i.startswith(store_dir)]
    assert (
        len(invalid_paths) == 0
    ), f"Expecting absolute paths from {store_dir}, but got: {invalid_paths}"

    # First, calculate the tarball checksum and the size.
    extract_checksum = ExtractChecksum()
    archive_paths_to(extract_checksum, paths, mtime, uid, gid, uname, gname)
    (checksum, size) = extract_checksum.extract()

    path = f"{checksum}/layer.tar"
    layer_tarinfo = tarfile.TarInfo(path)
    layer_tarinfo.size = size
    layer_tarinfo.mtime = mtime

    # Then actually stream the contents to the outer tarball.
    read_fd, write_fd = os.pipe()
    with open(read_fd, "rb") as read, open(write_fd, "wb") as write:

        def producer():
            archive_paths_to(write, paths, mtime, uid, gid, uname, gname)
            write.close()

        # Closing the write end of the pipe signals end-of-file on the
        # read end, so we don't need to wait until this thread finishes.
        #
        # Any exception from the thread will get printed by the default
        # exception handler, and the 'addfile' call will fail since it
        # won't be able to read the required amount of bytes.
        threading.Thread(target=producer).start()
        tar.addfile(layer_tarinfo, read)

    return LayerInfo(size=size, checksum=checksum, path=path, paths=paths)
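
# Illustrative only: the pipe-plus-producer-thread pattern used above,
# reduced to its essentials (both callables are hypothetical).
def _example_stream_through_pipe(produce_into, consume_from):
    read_fd, write_fd = os.pipe()
    with open(read_fd, "rb") as read, open(write_fd, "wb") as write:

        def producer():
            produce_into(write)  # e.g. archive_paths_to(write, ...)
            write.close()        # EOF lets the consumer finish

        threading.Thread(target=producer).start()
        consume_from(read)       # e.g. tar.addfile(layer_tarinfo, read)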

def add_customisation_layer(target_tar, customisation_layer, mtime):
    """
    Adds the customisation layer as a new layer. This layer is structured
    differently; the given store path has the 'layer.tar' and the
    corresponding sha256sum ready.

    target_tar: 'tarfile.TarFile' object for the new layer to be added to.
    customisation_layer: Path containing the layer archive.
    mtime: 'mtime' of the added layer tarball.
    """

    checksum_path = os.path.join(customisation_layer, "checksum")
    with open(checksum_path) as f:
        checksum = f.read().strip()
    assert len(checksum) == 64, f"Invalid sha256 at {checksum_path}."

    layer_path = os.path.join(customisation_layer, "layer.tar")

    path = f"{checksum}/layer.tar"
    tarinfo = target_tar.gettarinfo(layer_path)
    tarinfo.name = path
    tarinfo.mtime = mtime

    with open(layer_path, "rb") as f:
        target_tar.addfile(tarinfo, f)

    return LayerInfo(
        size=None, checksum=checksum, path=path, paths=[customisation_layer]
    )
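
# The customisation layer store path is expected to be laid out like
# this (the name is illustrative):
#
#   /nix/store/<hash>-customisation-layer/
#   ├── checksum    (64-char hex sha256 of layer.tar)
#   └── layer.tar   (pre-built layer tarball)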

def add_bytes(tar, path, content, mtime):
    """
    Adds a file to the tarball with given path and contents.

    tar: 'tarfile.TarFile' object.
    path: Path of the file as a string.
    content: Contents of the file.
    mtime: 'mtime' of the file. Should be an integer representing a POSIX time.
    """
    assert type(content) is bytes

    ti = tarfile.TarInfo(path)
    ti.size = len(content)
    ti.mtime = mtime
    tar.addfile(ti, io.BytesIO(content))


now = datetime.now(tz=timezone.utc)


def parse_time(s):
    if s == "now":
        return now
    return datetime.fromisoformat(s)

def main():
    arg_parser = argparse.ArgumentParser(
        description="""
This script generates a Docker image from a set of store paths. Uses
Docker Image Specification v1.2 as reference [1].

[1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
        """
    )
    arg_parser.add_argument(
        "conf",
        type=str,
        help="""
        JSON file with the following properties; the image is written as
        an uncompressed tarball to stdout:

        * "architecture", "config", "os", "created", "repo_tag" correspond
          to the fields with the same name on the image spec [2].
        * "created" can be "now".
        * "created" is also used as mtime for files added to the image.
        * "uid", "gid", "uname", "gname" are the file ownership, for
          example, 0, 0, "root", "root".
        * "store_layers" is a list of layers in ascending order, where each
          layer is the list of store paths to include in that layer.
        """,
    )
    arg_parser.add_argument(
        "--repo_tag", "-t", type=str,
        help="Override the RepoTags from the configuration"
    )
    args = arg_parser.parse_args()
    with open(args.conf, "r") as f:
        conf = json.load(f)

    created = parse_time(conf["created"])
    mtime = int(parse_time(conf["mtime"]).timestamp())
    uid = int(conf["uid"])
    gid = int(conf["gid"])
    uname = conf["uname"]
    gname = conf["gname"]
    store_dir = conf["store_dir"]

    from_image = load_from_image(conf["from_image"])

    with tarfile.open(mode="w|", fileobj=sys.stdout.buffer) as tar:
        layers = []
        layers.extend(add_base_layers(tar, from_image))

        start = len(layers) + 1
        for num, store_layer in enumerate(conf["store_layers"], start=start):
            print(
                "Creating layer",
                num,
                "from paths:",
                store_layer,
                file=sys.stderr,
            )
            info = add_layer_dir(
                tar, store_layer, store_dir, mtime, uid, gid, uname, gname
            )
            layers.append(info)

        print(
            "Creating layer",
            len(layers) + 1,
            "with customisation...",
            file=sys.stderr,
        )
        layers.append(
            add_customisation_layer(
                tar, conf["customisation_layer"], mtime=mtime
            )
        )

        print("Adding manifests...", file=sys.stderr)

        image_json = {
            "created": datetime.isoformat(created),
            "architecture": conf["architecture"],
            "os": "linux",
            "config": overlay_base_config(from_image, conf["config"]),
            "rootfs": {
                "diff_ids": [f"sha256:{layer.checksum}" for layer in layers],
                "type": "layers",
            },
            "history": [
                {
                    "created": datetime.isoformat(created),
                    "comment": f"store paths: {layer.paths}",
                }
                for layer in layers
            ],
        }
        image_json = json.dumps(image_json, indent=4).encode("utf-8")
        image_json_checksum = hashlib.sha256(image_json).hexdigest()
        image_json_path = f"{image_json_checksum}.json"
        add_bytes(tar, image_json_path, image_json, mtime=mtime)

        manifest_json = [
            {
                "Config": image_json_path,
                "RepoTags": [args.repo_tag or conf["repo_tag"]],
                "Layers": [layer.path for layer in layers],
            }
        ]
        manifest_json = json.dumps(manifest_json, indent=4).encode("utf-8")
        add_bytes(tar, "manifest.json", manifest_json, mtime=mtime)

        print("Done.", file=sys.stderr)


if __name__ == "__main__":
    main()
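
# Typical invocation, for illustration (in nixpkgs this script is driven
# by the streamLayeredImage builder, which generates the conf JSON):
#
#   python3 stream_layered_image.py conf.json > image.tar
#   python3 stream_layered_image.py conf.json | docker load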