2 This script generates a Docker image from a set of store paths. Uses
3 Docker Image Specification v1.2 as reference [1].
5 It expects a JSON file with the following properties and writes the
6 image as an uncompressed tarball to stdout:
8 * "architecture", "config", "os", "created", "repo_tag" correspond to
9 the fields with the same name on the image spec [2].
10 * "created" can be "now".
11 * "created" is also used as mtime for files added to the image.
12 * "uid", "gid", "uname", "gname" is the file ownership, for example,
14 * "store_layers" is a list of layers in ascending order, where each
15 layer is the list of store paths to include in that layer.
17 The main challenge for this script is to create the final image in a
18 streaming fashion, without dumping any intermediate data to disk
21 A docker image has each layer contents archived as separate tarballs,
22 and they later all get enveloped into a single big tarball in a
23 content addressed fashion. However, because of how the "tar" format works,
24 we have to know about the name (which includes the checksum in our
25 case) and the size of the tarball before we can start adding it to the
26 outer tarball. We achieve that by creating the layer tarballs twice;
27 on the first iteration we calculate the file size and the checksum,
28 and on the second one we actually stream the contents. 'add_layer_dir'
29 function does all this.
31 [1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
32 [2]: https://github.com/moby/moby/blob/4fb59c20a4fb54f944fe170d0ff1d00eb4a24d6f/image/spec/v1.2.md#image-json-field-descriptions
46 from datetime
import datetime
, timezone
47 from collections
import namedtuple
50 def archive_paths_to(obj
, paths
, mtime
, uid
, gid
, uname
, gname
):
52 Writes the given store paths as a tar file to the given stream.
54 obj: Stream to write to. Should have a 'write' method.
55 paths: List of store paths.
58 # gettarinfo makes the paths relative, this makes them
61 ti
.name
= "/" + ti
.name
64 def apply_filters(ti
):
73 ti
.mode
= 0o0755 # rwxr-xr-x
77 ti
= tarfile
.TarInfo(path
)
78 ti
.type = tarfile
.DIRTYPE
81 with tarfile
.open(fileobj
=obj
, mode
="w|") as tar
:
82 # To be consistent with the docker utilities, we need to have
83 # these directories first when building layer tarballs.
84 tar
.addfile(apply_filters(nix_root(dir("/nix"))))
85 tar
.addfile(apply_filters(nix_root(dir("/nix/store"))))
88 path
= pathlib
.Path(path
)
92 files
= itertools
.chain([path
], path
.rglob("*"))
94 for filename
in sorted(files
):
95 ti
= append_root(tar
.gettarinfo(filename
))
97 # copy hardlinks as regular files
99 ti
.type = tarfile
.REGTYPE
101 ti
.size
= filename
.stat().st_size
103 ti
= apply_filters(ti
)
105 with
open(filename
, "rb") as f
:
class ExtractChecksum:
    """
    A writable stream which only calculates the final file size and
    sha256sum, while discarding the actual contents.
    """

    def __init__(self):
        self._digest = hashlib.sha256()
        self._size = 0

    def write(self, data):
        """
        Feeds a chunk of the stream into the running checksum and size.

        data: Bytes-like chunk to account for.

        Returns: Number of bytes consumed, like a regular binary stream.
        """
        self._digest.update(data)
        consumed = len(data)
        self._size += consumed
        return consumed

    def extract(self):
        """
        Returns: Hex-encoded sha256sum and size as a tuple.
        """
        return (self._digest.hexdigest(), self._size)
# References to a loaded base image: the open archive ('tar') plus its
# decoded 'manifest.json' and image config JSON.
FromImage = namedtuple("FromImage", ["tar", "manifest_json", "image_json"])
# Some metadata for a layer
LayerInfo = namedtuple("LayerInfo", ["size", "checksum", "path", "paths"])
def load_from_image(from_image_str):
    """
    Loads the given base image, if any.

    from_image_str: Path to the base image archive, or 'None'.

    Returns: A 'FromImage' object with references to the loaded base image,
             or 'None' if no base image was provided. The archive inside
             the returned object is left open for the caller to read the
             layers from.
    """
    if from_image_str is None:
        return None

    base_tar = tarfile.open(from_image_str)
    try:
        manifest_json_tarinfo = base_tar.getmember("manifest.json")
        with base_tar.extractfile(manifest_json_tarinfo) as f:
            manifest_json = json.load(f)

        # The manifest names the image config file stored in the archive.
        image_json_tarinfo = base_tar.getmember(manifest_json[0]["Config"])
        with base_tar.extractfile(image_json_tarinfo) as f:
            image_json = json.load(f)
    except BaseException:
        # Do not leak the archive handle when the metadata is unreadable.
        base_tar.close()
        raise

    return FromImage(base_tar, manifest_json, image_json)
def add_base_layers(tar, from_image):
    """
    Adds the layers from the given base image to the final image.

    This is a generator: each base layer is copied when the corresponding
    item is requested, and the base image archive is closed when iteration
    ends, whether it finished normally or with an error.

    tar: 'tarfile.TarFile' object for new layers to be added to.
    from_image: 'FromImage' object with references to the loaded base image,
                or 'None' when there is no base image.

    Yields: A 'LayerInfo' object for every base layer, in manifest order.
    """
    if from_image is None:
        print("No 'fromImage' provided", file=sys.stderr)
        return

    try:
        layers = from_image.manifest_json[0]["Layers"]
        checksums = from_image.image_json["rootfs"]["diff_ids"]
        layers_checksums = zip(layers, checksums)

        for num, (layer, checksum) in enumerate(layers_checksums, start=1):
            layer_tarinfo = from_image.tar.getmember(layer)
            # 'diff_ids' entries carry a "sha256:" prefix; keep the hex only.
            checksum = re.sub(r"^sha256:", "", checksum)

            tar.addfile(
                layer_tarinfo, from_image.tar.extractfile(layer_tarinfo)
            )
            path = layer_tarinfo.path
            size = layer_tarinfo.size

            print("Adding base layer", num, "from", path, file=sys.stderr)
            yield LayerInfo(
                size=size, checksum=checksum, path=path, paths=[path]
            )
    finally:
        # Always release the base archive, even if copying a layer failed.
        from_image.tar.close()
def overlay_base_config(from_image, final_config):
    """
    Overlays the final image 'config' JSON on top of selected defaults from the
    base image 'config' JSON.

    from_image: 'FromImage' object with references to the loaded base image,
                or 'None' when there is no base image.
    final_config: 'dict' object of the final image 'config' JSON. It is
                  updated in place.

    Returns: 'final_config', with "Env" merged from the base image.
    """
    if from_image is None:
        return final_config

    # A JSON 'null' decodes to None, which can neither be indexed nor
    # concatenated; treat a null "config" or "Env" like an absent one.
    base_config = from_image.image_json.get("config") or {}
    base_env = base_config.get("Env") or []

    # Preserve environment from base image
    final_env = base_env + (final_config.get("Env") or [])
    if final_env:
        # Resolve duplicates (last one wins) and format back as list
        resolved_env = {entry.split("=", 1)[0]: entry for entry in final_env}
        final_config["Env"] = list(resolved_env.values())
    return final_config
def add_layer_dir(tar, paths, store_dir, mtime, uid, gid, uname, gname):
    """
    Appends given store paths to a TarFile object as a new layer.

    The layer tarball is generated twice: once into a checksum-only sink
    to learn its size and sha256, and once more to stream the actual
    bytes into the outer tarball.

    tar: 'tarfile.TarFile' object for the new layer to be added to.
    paths: List of store paths.
    store_dir: the root directory of the nix store
    mtime: 'mtime' of the added files and the layer tarball.
           Should be an integer representing a POSIX time.
    uid, gid, uname, gname: File ownership, handed to 'archive_paths_to'.

    Returns: A 'LayerInfo' object containing some metadata of
             the layer added.

    Raises: 'ValueError' if a path does not start with 'store_dir'.
    """
    invalid_paths = [i for i in paths if not i.startswith(store_dir)]
    if invalid_paths:
        # Raised explicitly so the check survives 'python -O'.
        raise ValueError(
            f"Expecting absolute paths from {store_dir}, "
            f"but got: {invalid_paths}"
        )

    # First, calculate the tarball checksum and the size.
    extract_checksum = ExtractChecksum()
    archive_paths_to(extract_checksum, paths, mtime, uid, gid, uname, gname)
    (checksum, size) = extract_checksum.extract()

    path = f"{checksum}/layer.tar"
    layer_tarinfo = tarfile.TarInfo(path)
    layer_tarinfo.size = size
    layer_tarinfo.mtime = mtime

    # Then actually stream the contents to the outer tarball.
    read_fd, write_fd = os.pipe()
    with open(read_fd, "rb") as read, open(write_fd, "wb") as write:

        def producer():
            try:
                archive_paths_to(
                    write, paths, mtime, uid, gid, uname, gname
                )
            finally:
                # Closing the write end makes the reader see end-of-file.
                # Doing it on failure too keeps 'addfile' below from
                # blocking forever on a pipe that will never be filled.
                write.close()

        # The reader stops at end-of-file, so we don't need to wait
        # until this thread is finished.
        #
        # Any exception from the thread will get printed by the default
        # exception handler, and the 'addfile' call will fail since it
        # won't be able to read required amount of bytes.
        threading.Thread(target=producer).start()
        tar.addfile(layer_tarinfo, read)

    return LayerInfo(size=size, checksum=checksum, path=path, paths=paths)
def add_customisation_layer(target_tar, customisation_layer, mtime):
    """
    Adds the customisation layer as a new layer. This layer is structured
    differently; given store path has the 'layer.tar' and corresponding
    'checksum' file already prepared.

    target_tar: 'tarfile.TarFile' object for the new layer to be added to.
    customisation_layer: Path containing the layer archive.
    mtime: 'mtime' of the added layer tarball.

    Returns: A 'LayerInfo' object for the added layer; its 'size' is 'None'.

    Raises: 'ValueError' if the checksum file does not hold 64 characters.
    """
    checksum_path = os.path.join(customisation_layer, "checksum")
    with open(checksum_path) as f:
        checksum = f.read().strip()
    if len(checksum) != 64:
        # Raised explicitly so the check survives 'python -O'.
        raise ValueError(f"Invalid sha256 at {checksum_path}.")

    layer_path = os.path.join(customisation_layer, "layer.tar")

    path = f"{checksum}/layer.tar"
    tarinfo = target_tar.gettarinfo(layer_path)
    # Store the layer under its content-addressed name, not its disk path.
    tarinfo.name = path
    tarinfo.mtime = mtime

    with open(layer_path, "rb") as f:
        target_tar.addfile(tarinfo, f)

    return LayerInfo(
        size=None, checksum=checksum, path=path, paths=[customisation_layer]
    )
def add_bytes(tar, path, content, mtime):
    """
    Adds a file to the tarball with given path and contents.

    tar: 'tarfile.TarFile' object.
    path: Path of the file as a string.
    content: Contents of the file, as 'bytes' or 'bytearray'.
    mtime: 'mtime' of the file. Should be an integer representing a POSIX time.

    Raises: 'TypeError' if 'content' is not 'bytes' or 'bytearray'.
    """
    if not isinstance(content, (bytes, bytearray)):
        # Raised explicitly so the check survives 'python -O'.
        raise TypeError(
            f"content must be bytes, got {type(content).__name__}"
        )

    ti = tarfile.TarInfo(path)
    ti.size = len(content)
    ti.mtime = mtime
    tar.addfile(ti, io.BytesIO(content))
# Captured once, so that every "now" in one run maps to the same instant.
now = datetime.now(tz=timezone.utc)


def parse_time(s):
    """
    Parses a timestamp from the configuration.

    s: Either the literal string "now" or an ISO 8601 timestamp.

    Returns: A 'datetime' object; for "now" it is the module-level 'now'.
    """
    if s == "now":
        return now
    # 'datetime.fromisoformat' only understands a trailing "Z" (UTC) from
    # Python 3.11 on; spell it as an explicit offset for older versions.
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    return datetime.fromisoformat(s)
320 arg_parser
= argparse
.ArgumentParser(
322 This script generates a Docker image from a set of store paths. Uses
323 Docker Image Specification v1.2 as reference [1].
325 [1]: https://github.com/moby/moby/blob/master/image/spec/v1.2.md
328 arg_parser
.add_argument(
332 JSON file with the following properties and writes the
333 image as an uncompressed tarball to stdout:
335 * "architecture", "config", "os", "created", "repo_tag" correspond to
336 the fields with the same name on the image spec [2].
337 * "created" can be "now".
338 * "created" is also used as mtime for files added to the image.
339 * "uid", "gid", "uname", "gname" is the file ownership, for example,
340 0, 0, "root", "root".
341 * "store_layers" is a list of layers in ascending order, where each
342 layer is the list of store paths to include in that layer.
345 arg_parser
.add_argument(
346 "--repo_tag", "-t", type=str,
347 help="Override the RepoTags from the configuration"
350 args
= arg_parser
.parse_args()
351 with
open(args
.conf
, "r") as f
:
354 created
= parse_time(conf
["created"])
355 mtime
= int(parse_time(conf
["mtime"]).timestamp())
356 uid
= int(conf
["uid"])
357 gid
= int(conf
["gid"])
358 uname
= conf
["uname"]
359 gname
= conf
["gname"]
360 store_dir
= conf
["store_dir"]
362 from_image
= load_from_image(conf
["from_image"])
364 with tarfile
.open(mode
="w|", fileobj
=sys
.stdout
.buffer) as tar
:
366 layers
.extend(add_base_layers(tar
, from_image
))
368 start
= len(layers
) + 1
369 for num
, store_layer
in enumerate(conf
["store_layers"], start
=start
):
377 info
= add_layer_dir(
378 tar
, store_layer
, store_dir
, mtime
, uid
, gid
, uname
, gname
385 "with customisation...",
389 add_customisation_layer(
390 tar
, conf
["customisation_layer"], mtime
=mtime
394 print("Adding manifests...", file=sys
.stderr
)
397 "created": datetime
.isoformat(created
),
398 "architecture": conf
["architecture"],
400 "config": overlay_base_config(from_image
, conf
["config"]),
402 "diff_ids": [f
"sha256:{layer.checksum}" for layer
in layers
],
407 "created": datetime
.isoformat(created
),
408 "comment": f
"store paths: {layer.paths}",
414 image_json
= json
.dumps(image_json
, indent
=4).encode("utf-8")
415 image_json_checksum
= hashlib
.sha256(image_json
).hexdigest()
416 image_json_path
= f
"{image_json_checksum}.json"
417 add_bytes(tar
, image_json_path
, image_json
, mtime
=mtime
)
421 "Config": image_json_path
,
422 "RepoTags": [args
.repo_tag
or conf
["repo_tag"]],
423 "Layers": [layer
.path
for layer
in layers
],
426 manifest_json
= json
.dumps(manifest_json
, indent
=4).encode("utf-8")
427 add_bytes(tar
, "manifest.json", manifest_json
, mtime
=mtime
)
429 print("Done.", file=sys
.stderr
)
432 if __name__
== "__main__":