Containers: Use systemd-nspawn's --network-veth flag

Note that this causes the name of the host-side interface to change
from c-<name> to ve-<name>.
This commit is contained in:
Eelco Dolstra 2014-05-07 17:00:46 +02:00
parent 810680bcae
commit 6f7aaf10a5
5 changed files with 73 additions and 105 deletions

View File

@ -213,8 +213,8 @@ $ ping -c1 10.233.4.2
<para>Networking is implemented using a pair of virtual Ethernet <para>Networking is implemented using a pair of virtual Ethernet
devices. The network interface in the container is called devices. The network interface in the container is called
<literal>eth0</literal>, while the matching interface in the host is <literal>eth0</literal>, while the matching interface in the host is
called <literal>c-<replaceable>container-name</replaceable></literal> called <literal>ve-<replaceable>container-name</replaceable></literal>
(e.g., <literal>c-foo</literal>). The container has its own network (e.g., <literal>ve-foo</literal>). The container has its own network
namespace and the <literal>CAP_NET_ADMIN</literal> capability, so it namespace and the <literal>CAP_NET_ADMIN</literal> capability, so it
can perform arbitrary network configuration such as setting up can perform arbitrary network configuration such as setting up
firewall rules, without affecting or having access to the host's firewall rules, without affecting or having access to the host's
@ -228,11 +228,11 @@ on the host:
<programlisting> <programlisting>
networking.nat.enable = true; networking.nat.enable = true;
networking.nat.internalInterfaces = ["c-+"]; networking.nat.internalInterfaces = ["ve-+"];
networking.nat.externalInterface = "eth0"; networking.nat.externalInterface = "eth0";
</programlisting> </programlisting>
where <literal>eth0</literal> should be replaced with the desired where <literal>eth0</literal> should be replaced with the desired
external interface. Note that <literal>c-+</literal> is a wildcard external interface. Note that <literal>ve-+</literal> is a wildcard
that matches all container interfaces.</para> that matches all container interfaces.</para>
</section> </section>

View File

@ -4,6 +4,28 @@
<title>Release notes</title> <title>Release notes</title>
<!--==================================================================-->
<section xml:id="sec-release-14.10">
<title>Release 14.10 (“Caterpillar”, 2014/10/??)</title>
<para>When upgrading from a previous release, please be aware of the
following incompatible changes:
<itemizedlist>
<listitem><para>The host side of a container virtual Ethernet pair
is now called <literal>ve-<replaceable>container-name</replaceable></literal>
rather than <literal>c-<replaceable>container-name</replaceable></literal>.</para></listitem>
</itemizedlist>
</para>
</section>
<!--==================================================================--> <!--==================================================================-->
<section xml:id="sec-release-14.04"> <section xml:id="sec-release-14.04">

View File

@ -34,9 +34,8 @@ let
# Ignore peth* devices; on Xen, they're renamed physical # Ignore peth* devices; on Xen, they're renamed physical
# Ethernet cards used for bridging. Likewise for vif* and tap* # Ethernet cards used for bridging. Likewise for vif* and tap*
# (Xen) and virbr* and vnet* (libvirt) and c-* and ctmp-* (NixOS # (Xen) and virbr* and vnet* (libvirt).
# containers). denyinterfaces ${toString ignoredInterfaces} lo peth* vif* tap* tun* virbr* vnet* vboxnet*
denyinterfaces ${toString ignoredInterfaces} lo peth* vif* tap* tun* virbr* vnet* vboxnet* c-* ctmp-*
${config.networking.dhcpcd.extraConfig} ${config.networking.dhcpcd.extraConfig}
''; '';

View File

@ -4,16 +4,6 @@ with lib;
let let
# Helper binary that is invoked as "run-in-netns <namespace> <command...>"
# by the container start script.  It is compiled directly from
# ./run-in-netns.c with gcc during the build phase and placed in
# $out/bin; the unpack and install phases are replaced by the shell
# builtin "true" because there is no source archive to unpack and
# nothing further to install.
runInNetns = pkgs.stdenv.mkDerivation {
name = "run-in-netns";
unpackPhase = "true";
buildPhase = ''
mkdir -p $out/bin
gcc ${./run-in-netns.c} -o $out/bin/run-in-netns
'';
installPhase = "true";
};
nixos-container = pkgs.substituteAll { nixos-container = pkgs.substituteAll {
name = "nixos-container"; name = "nixos-container";
dir = "bin"; dir = "bin";
@ -23,6 +13,28 @@ let
inherit (pkgs) socat; inherit (pkgs) socat;
}; };
# The container's init script, a small wrapper around the regular
# NixOS stage-2 init script.
#
# It reads three environment variables: PRIVATE_NETWORK, HOST_ADDRESS
# and LOCAL_ADDRESS.  When PRIVATE_NETWORK is 1 it renames the
# interface "host0" to "eth0" (presumably the name systemd-nspawn's
# --network-veth gives the container side of the veth pair -- confirm
# against the systemd-nspawn documentation), brings it up, and then:
#   - if HOST_ADDRESS is non-empty, adds a route to it via eth0 and
#     makes it the default gateway;
#   - if LOCAL_ADDRESS is non-empty, assigns it to eth0.
# Finally it replaces itself (exec) with the program given as its
# first argument.  The script runs with "-e", so any failing "ip"
# command aborts it before the exec.
containerInit = pkgs.writeScript "container-init"
''
#! ${pkgs.stdenv.shell} -e
# Initialise the container side of the veth pair.
if [ "$PRIVATE_NETWORK" = 1 ]; then
ip link set host0 name eth0
ip link set dev eth0 up
if [ -n "$HOST_ADDRESS" ]; then
ip route add $HOST_ADDRESS dev eth0
ip route add default via $HOST_ADDRESS
fi
if [ -n "$LOCAL_ADDRESS" ]; then
ip addr add $LOCAL_ADDRESS dev eth0
fi
fi
exec "$1"
'';
system = config.nixpkgs.system; system = config.nixpkgs.system;
in in
@ -70,7 +82,7 @@ in
Whether to give the container its own private virtual Whether to give the container its own private virtual
Ethernet interface. The interface is called Ethernet interface. The interface is called
<literal>eth0</literal>, and is hooked up to the interface <literal>eth0</literal>, and is hooked up to the interface
<literal>c-<replaceable>container-name</replaceable></literal> <literal>ve-<replaceable>container-name</replaceable></literal>
on the host. If this option is not set, then the on the host. If this option is not set, then the
container shares the network interfaces of the host, container shares the network interfaces of the host,
and can bind to any port on any interface. and can bind to any port on any interface.
@ -176,39 +188,8 @@ in
"/nix/var/nix/profiles/per-container/$INSTANCE" \ "/nix/var/nix/profiles/per-container/$INSTANCE" \
"/nix/var/nix/gcroots/per-container/$INSTANCE" "/nix/var/nix/gcroots/per-container/$INSTANCE"
if [ -f "/etc/containers/$INSTANCE.conf" ]; then
. "/etc/containers/$INSTANCE.conf"
fi
# Cleanup from last time.
ifaceHost=c-$INSTANCE
ifaceCont=ctmp-$INSTANCE
ns=net-$INSTANCE
ip netns del $ns 2> /dev/null || true
ip link del $ifaceHost 2> /dev/null || true
ip link del $ifaceCont 2> /dev/null || true
if [ "$PRIVATE_NETWORK" = 1 ]; then if [ "$PRIVATE_NETWORK" = 1 ]; then
# Create a pair of virtual ethernet devices. On the host, extraFlags="--network-veth"
# we get c-<container-name, and on the guest, we get
# eth0.
ip link add $ifaceHost type veth peer name $ifaceCont
ip netns add $ns
ip link set $ifaceCont netns $ns
ip netns exec $ns ip link set $ifaceCont name eth0
ip netns exec $ns ip link set dev eth0 up
ip link set dev $ifaceHost up
if [ -n "$HOST_ADDRESS" ]; then
ip addr add $HOST_ADDRESS dev $ifaceHost
ip netns exec $ns ip route add $HOST_ADDRESS dev eth0
ip netns exec $ns ip route add default via $HOST_ADDRESS
fi
if [ -n "$LOCAL_ADDRESS" ]; then
ip netns exec $ns ip addr add $LOCAL_ADDRESS dev eth0
ip route add $LOCAL_ADDRESS dev $ifaceHost
fi
runInNetNs="${runInNetns}/bin/run-in-netns $ns"
extraFlags="--capability=CAP_NET_ADMIN"
fi fi
# If the host is 64-bit and the container is 32-bit, add a # If the host is 64-bit and the container is 32-bit, add a
@ -219,7 +200,7 @@ in
fi fi
''} ''}
exec $runInNetNs ${config.systemd.package}/bin/systemd-nspawn \ exec ${config.systemd.package}/bin/systemd-nspawn \
--keep-unit \ --keep-unit \
-M "$INSTANCE" -D "$root" $extraFlags \ -M "$INSTANCE" -D "$root" $extraFlags \
--bind-ro=/nix/store \ --bind-ro=/nix/store \
@ -227,7 +208,11 @@ in
--bind-ro=/nix/var/nix/daemon-socket \ --bind-ro=/nix/var/nix/daemon-socket \
--bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles" \ --bind="/nix/var/nix/profiles/per-container/$INSTANCE:/nix/var/nix/profiles" \
--bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots" \ --bind="/nix/var/nix/gcroots/per-container/$INSTANCE:/nix/var/nix/gcroots" \
"''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init" --setenv PRIVATE_NETWORK="$PRIVATE_NETWORK" \
--setenv HOST_ADDRESS="$HOST_ADDRESS" \
--setenv LOCAL_ADDRESS="$LOCAL_ADDRESS" \
--setenv PATH="$PATH" \
${containerInit} "''${SYSTEM_PATH:-/nix/var/nix/profiles/system}/init"
''; '';
postStart = postStart =
@ -237,6 +222,17 @@ in
# until the start timeout expires if systemd-nspawn exits. # until the start timeout expires if systemd-nspawn exits.
read x < $root/var/lib/startup-done read x < $root/var/lib/startup-done
rm -f $root/var/lib/startup-done rm -f $root/var/lib/startup-done
if [ "$PRIVATE_NETWORK" = 1 ]; then
ifaceHost=ve-$INSTANCE
ip link set dev $ifaceHost up
if [ -n "$HOST_ADDRESS" ]; then
ip addr add $HOST_ADDRESS dev $ifaceHost
fi
if [ -n "$LOCAL_ADDRESS" ]; then
ip route add $LOCAL_ADDRESS dev $ifaceHost
fi
fi
''; '';
preStop = preStop =
@ -251,14 +247,13 @@ in
'' ''
#! ${pkgs.stdenv.shell} -e #! ${pkgs.stdenv.shell} -e
SYSTEM_PATH=/nix/var/nix/profiles/system SYSTEM_PATH=/nix/var/nix/profiles/system
if [ -f "/etc/containers/$INSTANCE.conf" ]; then
. "/etc/containers/$INSTANCE.conf"
fi
echo $SYSTEM_PATH/bin/switch-to-configuration test | \ echo $SYSTEM_PATH/bin/switch-to-configuration test | \
${pkgs.socat}/bin/socat unix:$root/var/lib/run-command.socket - ${pkgs.socat}/bin/socat unix:$root/var/lib/run-command.socket -
''; '';
serviceConfig.SyslogIdentifier = "container %i"; serviceConfig.SyslogIdentifier = "container %i";
serviceConfig.EnvironmentFile = "-/etc/containers/%i.conf";
}; };
# Generate a configuration file in /etc/containers for each # Generate a configuration file in /etc/containers for each
@ -288,6 +283,8 @@ in
${cfg.localAddress} ${name}.containers ${cfg.localAddress} ${name}.containers
'') config.containers); '') config.containers);
networking.dhcpcd.denyInterfaces = [ "ve-*" ];
environment.systemPackages = [ nixos-container ]; environment.systemPackages = [ nixos-container ];
}; };

View File

@ -1,50 +0,0 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <fcntl.h>
#include <linux/limits.h>
/*
 * run-in-netns: join a named network namespace, then run a command in it.
 *
 * Usage: run-in-netns <namespace> <program> [args...]
 *
 *   argv[1]   name of the namespace; it is looked up as /run/netns/<name>.
 *   argv[2..] program to execute (argv[2] is used as the path) and its
 *             arguments.
 *
 * After entering the namespace, the /run/netns/<name> mount point is
 * lazily unmounted and the file is unlinked, so the name no longer refers
 * to the namespace once the command is running.
 *
 * Returns 1 (after printing a diagnostic to stderr) on any failure; on
 * success it does not return because the process image is replaced.
 */
int main(int argc, char * * argv)
{
    if (argc < 3) {
        fprintf(stderr, "%s: missing arguments\n", argv[0]);
        return 1;
    }

    /* Build the namespace path with a bounded write: argv[1] is supplied
       by the caller and may be arbitrarily long, so an unchecked sprintf
       could overflow nsPath. */
    char nsPath[PATH_MAX];
    int len = snprintf(nsPath, sizeof(nsPath), "/run/netns/%s", argv[1]);
    if (len < 0 || (size_t) len >= sizeof(nsPath)) {
        fprintf(stderr, "%s: network namespace name too long\n", argv[0]);
        return 1;
    }

    int fd = open(nsPath, O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "%s: opening network namespace: %s\n", argv[0], strerror(errno));
        return 1;
    }

    if (setns(fd, CLONE_NEWNET) == -1) {
        fprintf(stderr, "%s: setting network namespace: %s\n", argv[0], strerror(errno));
        close(fd);
        return 1;
    }

    /* The descriptor is only needed for setns(); close it so that it is
       not inherited by the program we exec below. */
    close(fd);

    /* Best effort: detach the mount at nsPath.  The result is
       deliberately not checked; a failure to remove the file itself is
       reported by the unlink() check that follows. */
    umount2(nsPath, MNT_DETACH);
    if (unlink(nsPath) == -1) {
        fprintf(stderr, "%s: unlinking network namespace: %s\n", argv[0], strerror(errno));
        return 1;
    }

    /* FIXME: Remount /sys so that /sys/class/net reflects the
       interfaces visible in the network namespace.  This requires
       bind-mounting /sys/fs/cgroups etc. */

    execv(argv[2], argv + 2);

    /* execv() only returns on failure. */
    fprintf(stderr, "%s: running command: %s\n", argv[0], strerror(errno));
    return 1;
}