1 { config, pkgs, lib, utils, ... }:
4 toplevelConfig = config;
6 inherit (utils.systemdUtils.lib) mkPathSafeName;
8 options.systemd.services = lib.mkOption {
9 type = types.attrsOf (types.submodule ({ name, config, ... }: {
10 options.confinement.enable = lib.mkOption {
14 If set, all the required runtime store paths for this service are
15 bind-mounted into a `tmpfs`-based
20 options.confinement.fullUnit = lib.mkOption {
24 Whether to include the full closure of the systemd unit file into the
25 chroot, instead of just the dependencies for the executables.
28 While it may be tempting to just enable this option to
29 make things work quickly, please be aware that this might add paths
30 to the closure of the chroot that you didn't anticipate. It's better
31 to use {option}`confinement.packages` to **explicitly** add additional store paths to the
37 options.confinement.packages = lib.mkOption {
38 type = types.listOf (types.either types.str types.package);
41 mkScOption = optName: "{option}`serviceConfig.${optName}`";
43 Additional packages or strings with context to add to the closure of
44 the chroot. By default, this includes all the packages from the
45 ${lib.concatMapStringsSep ", " mkScOption [
46 "ExecReload" "ExecStartPost" "ExecStartPre" "ExecStop"
48 ]} and ${mkScOption "ExecStart"} options. If you want to have all the
49 dependencies of this systemd unit, you can use
50 {option}`confinement.fullUnit`.
53 The store paths listed in {option}`path` are
54 **not** included in the closure as
55 well as paths from other options except those listed
61 options.confinement.binSh = lib.mkOption {
62 type = types.nullOr types.path;
63 default = toplevelConfig.environment.binsh;
64 defaultText = lib.literalExpression "config.environment.binsh";
65 example = lib.literalExpression ''"''${pkgs.dash}/bin/dash"'';
67 The program to make available as {file}`/bin/sh` inside
68 the chroot. If this is set to `null`, no
69 {file}`/bin/sh` is provided at all.
71 This is useful for some applications, which for example use the
72 {manpage}`system(3)` library function to execute commands.
76 options.confinement.mode = lib.mkOption {
77 type = types.enum [ "full-apivfs" "chroot-only" ];
78 default = "full-apivfs";
80 The value `full-apivfs` (the default) sets up
81 private {file}`/dev`, {file}`/proc`,
82 {file}`/sys`, {file}`/tmp` and {file}`/var/tmp` file systems
83 in a separate user name space.
85 If this is set to `chroot-only`, only the file
86 system name space is set up along with the call to
89 In all cases, unless `serviceConfig.PrivateTmp=true` is set,
90 both {file}`/tmp` and {file}`/var/tmp` paths are added to `InaccessiblePaths=`.
91 This is to overcome options like `DynamicUser=true`
92 implying `PrivateTmp=true` without allowing it to be turned off.
93 Beware however that giving processes the `CAP_SYS_ADMIN` and `@mount` privileges
94 can let them undo the effects of `InaccessiblePaths=`.
97 This doesn't cover network namespaces and is solely for
98 file system level isolation.
# Per-service configuration derived from the `confinement.*` options. Everything
# in the attribute set below is guarded by `lib.mkIf config.confinement.enable`,
# so it only takes effect for services that opt into confinement.
104 inherit (config.confinement) binSh fullUnit;
# Single value shared by all API-VFS related hardening options further down:
# true when `confinement.mode` is "full-apivfs", false otherwise. It is wrapped
# in `lib.mkDefault` so that an explicit per-service definition of any of those
# options takes priority over this value.
105 wantsAPIVFS = lib.mkDefault (config.confinement.mode == "full-apivfs");
106 in lib.mkIf config.confinement.enable {
108 ReadOnlyPaths = [ "+/" ];
# The runtime directory and the root directory both use the same path-safe
# form of the service name, i.e. "confinement/<name>" and
# "/run/confinement/<name>" respectively.
109 RuntimeDirectory = [ "confinement/${mkPathSafeName name}" ];
110 RootDirectory = "/run/confinement/${mkPathSafeName name}";
111 InaccessiblePaths = [
112 "-+/run/confinement/${mkPathSafeName name}"
114 PrivateMounts = lib.mkDefault true;
116 # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt
117 # to change some of these to default to true.
119 # If we run in chroot-only mode, having something like PrivateDevices
120 # set to true by default will mount /dev within the chroot, whereas
121 # with "chroot-only" it's expected that there are no /dev, /proc and
122 # /sys file systems available.
124 # However, if this suddenly becomes true, the attack surface will
125 # increase, so let's explicitly set these options to true/false
126 # depending on the mode.
127 MountAPIVFS = wantsAPIVFS;
128 PrivateDevices = wantsAPIVFS;
129 PrivateTmp = wantsAPIVFS;
130 PrivateUsers = wantsAPIVFS;
131 ProtectControlGroups = wantsAPIVFS;
132 ProtectKernelModules = wantsAPIVFS;
133 ProtectKernelTunables = wantsAPIVFS;
# Default contents of `confinement.packages`: either the values of the Exec*
# options of this service or, with `confinement.fullUnit`, the whole unit, plus
# the `/bin/sh` program when `confinement.binSh` is not null.
135 confinement.packages = let
137 "ExecReload" "ExecStart" "ExecStartPost" "ExecStartPre" "ExecStop"
# Collect the values of every Exec* option that is actually present in
# `serviceConfig`; `lib.flatten` normalises single values and (nested) lists
# into one flat list.
140 execPkgs = lib.concatMap (opt: let
141 isSet = config.serviceConfig ? ${opt};
142 in lib.flatten (lib.optional isSet config.serviceConfig.${opt})) execOpts;
# The unit definition as generated by the top-level systemd module, looked up
# under "<name>.service".
143 unitAttrs = toplevelConfig.systemd.units."${name}.service";
# A one-element list holding the unit serialised as a JSON string.
# NOTE(review): presumably used so that the string's context references every
# store path mentioned anywhere in the unit -- confirm.
144 allPkgs = lib.singleton (builtins.toJSON unitAttrs);
145 unitPkgs = if fullUnit then allPkgs else execPkgs;
146 in unitPkgs ++ lib.optional (binSh != null) binSh;
# Module assertions: for every entry of `config.systemd.services` that has
# `confinement.enable` set, reject the combination with
# `serviceConfig.RootDirectoryStartOnly`. Services without confinement
# contribute no assertions (`lib.optionals` yields an empty list for them).
151 config.assertions = lib.concatLists (lib.mapAttrsToList (name: cfg: let
# Common prefix of the assertion message, parameterised by the name of the
# offending `serviceConfig` option.
152 whatOpt = optName: "The 'serviceConfig' option '${optName}' for"
153 + " service '${name}' is enabled in conjunction with"
154 + " 'confinement.enable'";
155 in lib.optionals cfg.confinement.enable [
# `or false` makes an unset RootDirectoryStartOnly count as disabled, so the
# assertion only fails when the option is explicitly set to true.
156 { assertion = !cfg.serviceConfig.RootDirectoryStartOnly or false;
157 message = "${whatOpt "RootDirectoryStartOnly"}, but right now systemd"
158 + " doesn't support restricting bind-mounts to 'ExecStart'."
159 + " Please either define a separate service or find a way to run"
160 + " commands other than ExecStart within the chroot.";
162 ]) config.systemd.services);
164 config.systemd.packages = lib.concatLists (lib.mapAttrsToList (name: cfg: let
166 contents = lib.concatStringsSep "\n" cfg.confinement.packages;
167 in pkgs.writeText "${mkPathSafeName name}-string-contexts.txt" contents;
169 chrootPaths = pkgs.runCommand "${mkPathSafeName name}-chroot-paths" {
170 closureInfo = pkgs.closureInfo { inherit rootPaths; };
171 serviceName = "${name}.service";
172 excludedPath = rootPaths;
174 mkdir -p "$out/lib/systemd/system/$serviceName.d"
175 serviceFile="$out/lib/systemd/system/$serviceName.d/confinement.conf"
177 echo '[Service]' > "$serviceFile"
179 # /bin/sh is special here, because the option value could contain a
180 # symlink and we need to properly resolve it.
181 ${lib.optionalString (cfg.confinement.binSh != null) ''
182 binsh=${lib.escapeShellArg cfg.confinement.binSh}
183 realprog="$(readlink -e "$binsh")"
184 echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
187 # If DynamicUser= is enabled, PrivateTmp=true is implied (and cannot be turned off),
188 # so make /tmp and /var/tmp inaccessible unless PrivateTmp=true is explicitly set.
189 ${lib.optionalString (!cfg.serviceConfig.PrivateTmp) ''
190 echo "InaccessiblePaths=-+/tmp" >> "$serviceFile"
191 echo "InaccessiblePaths=-+/var/tmp" >> "$serviceFile"
194 while read storePath; do
195 if [ -L "$storePath" ]; then
196 # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,
197 # so let's just bind-mount the target to that location.
198 echo "BindReadOnlyPaths=$(readlink -e "$storePath"):$storePath"
199 elif [ "$storePath" != "$excludedPath" ]; then
200 echo "BindReadOnlyPaths=$storePath"
202 done < "$closureInfo/store-paths" >> "$serviceFile"
204 in lib.optional cfg.confinement.enable chrootPaths) config.systemd.services);