Introduces a new security.forcePageTableIsolation option (default false on !hardened, true on hardened) that forces pti=on.
		
			
				
	
	
		
			138 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Nix
		
	
	
	
	
	
			
		
		
	
	
			138 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Nix
		
	
	
	
	
	
{ config, lib, ... }:
 | 
						|
 | 
						|
with lib;
 | 
						|
 | 
						|
{
 | 
						|
  meta = {
 | 
						|
    maintainers = [ maintainers.joachifm ];
 | 
						|
  };
 | 
						|
 | 
						|
  options = {
 | 
						|
    security.allowUserNamespaces = mkOption {
 | 
						|
      type = types.bool;
 | 
						|
      default = true;
 | 
						|
      description = ''
 | 
						|
        Whether to allow creation of user namespaces.
 | 
						|
 | 
						|
        The motivation for disabling user namespaces is the potential
 | 
						|
        presence of code paths where the kernel's permission checking
 | 
						|
        logic fails to account for namespacing, instead permitting a
 | 
						|
        namespaced process to act outside the namespace with the same
 | 
						|
        privileges as it would have inside it.  This is particularly
 | 
						|
        damaging in the common case of running as root within the namespace.
 | 
						|
 | 
						|
        When user namespace creation is disallowed, attempting to create a
 | 
						|
        user namespace fails with "no space left on device" (ENOSPC).
 | 
						|
        root may re-enable user namespace creation at runtime.
 | 
						|
      '';
 | 
						|
    };
 | 
						|
 | 
						|
    security.protectKernelImage = mkOption {
 | 
						|
      type = types.bool;
 | 
						|
      default = false;
 | 
						|
      description = ''
 | 
						|
        Whether to prevent replacing the running kernel image.
 | 
						|
      '';
 | 
						|
    };
 | 
						|
 | 
						|
    security.allowSimultaneousMultithreading = mkOption {
 | 
						|
      type = types.bool;
 | 
						|
      default = true;
 | 
						|
      description = ''
 | 
						|
        Whether to allow SMT/hyperthreading.  Disabling SMT means that only
 | 
						|
        physical CPU cores will be usable at runtime, potentially at
 | 
						|
        significant performance cost.
 | 
						|
 | 
						|
        The primary motivation for disabling SMT is to mitigate the risk of
 | 
						|
        leaking data between threads running on the same CPU core (due to
 | 
						|
        e.g., shared caches).  This attack vector is unproven.
 | 
						|
 | 
						|
        Disabling SMT is a supplement to the L1 data cache flushing mitigation
 | 
						|
        (see <xref linkend="opt-security.virtualisation.flushL1DataCache"/>)
 | 
						|
        versus malicious VM guests (SMT could "bring back" previously flushed
 | 
						|
        data).
 | 
						|
      '';
 | 
						|
    };
 | 
						|
 | 
						|
    security.forcePageTableIsolation = mkOption {
 | 
						|
      type = types.bool;
 | 
						|
      default = false;
 | 
						|
      description = ''
 | 
						|
        Whether to force-enable the Page Table Isolation (PTI) Linux kernel
 | 
						|
        feature even on CPU models that claim to be safe from Meltdown.
 | 
						|
 | 
						|
        This hardening feature is most beneficial to systems that run untrusted
 | 
						|
        workloads that rely on address space isolation for security.
 | 
						|
      '';
 | 
						|
    };
 | 
						|
 | 
						|
    security.virtualisation.flushL1DataCache = mkOption {
 | 
						|
      type = types.nullOr (types.enum [ "never" "cond" "always" ]);
 | 
						|
      default = null;
 | 
						|
      description = ''
 | 
						|
        Whether the hypervisor should flush the L1 data cache before
 | 
						|
        entering guests.
 | 
						|
        See also <xref linkend="opt-security.allowSimultaneousMultithreading"/>.
 | 
						|
 | 
						|
        <variablelist>
 | 
						|
          <varlistentry>
 | 
						|
            <term><literal>null</literal></term>
 | 
						|
            <listitem><para>uses the kernel default</para></listitem>
 | 
						|
          </varlistentry>
 | 
						|
          <varlistentry>
 | 
						|
            <term><literal>"never"</literal></term>
 | 
						|
            <listitem><para>disables L1 data cache flushing entirely.
 | 
						|
            May be appropriate if all guests are trusted.</para></listitem>
 | 
						|
          </varlistentry>
 | 
						|
          <varlistentry>
 | 
						|
            <term><literal>"cond"</literal></term>
 | 
						|
            <listitem><para>flushes L1 data cache only for pre-determined
 | 
						|
            code paths.  May leak information about the host address space
 | 
						|
            layout.</para></listitem>
 | 
						|
          </varlistentry>
 | 
						|
          <varlistentry>
 | 
						|
            <term><literal>"always"</literal></term>
 | 
						|
            <listitem><para>flushes L1 data cache every time the hypervisor
 | 
						|
            enters the guest.  May incur significant performance cost.
 | 
						|
            </para></listitem>
 | 
						|
          </varlistentry>
 | 
						|
        </variablelist>
 | 
						|
      '';
 | 
						|
    };
 | 
						|
  };
 | 
						|
 | 
						|
  config = mkMerge [
 | 
						|
    (mkIf (!config.security.allowUserNamespaces) {
 | 
						|
      # Setting the number of allowed user namespaces to 0 effectively disables
 | 
						|
      # the feature at runtime.  Note that root may raise the limit again
 | 
						|
      # at any time.
 | 
						|
      boot.kernel.sysctl."user.max_user_namespaces" = 0;
 | 
						|
 | 
						|
      assertions = [
 | 
						|
        { assertion = config.nix.useSandbox -> config.security.allowUserNamespaces;
 | 
						|
          message = "`nix.useSandbox = true` conflicts with `!security.allowUserNamespaces`.";
 | 
						|
        }
 | 
						|
      ];
 | 
						|
    })
 | 
						|
 | 
						|
    (mkIf config.security.protectKernelImage {
 | 
						|
      # Disable hibernation (allows replacing the running kernel)
 | 
						|
      boot.kernelParams = [ "nohibernate" ];
 | 
						|
      # Prevent replacing the running kernel image w/o reboot
 | 
						|
      boot.kernel.sysctl."kernel.kexec_load_disabled" = mkDefault true;
 | 
						|
    })
 | 
						|
 | 
						|
    (mkIf (!config.security.allowSimultaneousMultithreading) {
 | 
						|
      boot.kernelParams = [ "nosmt" ];
 | 
						|
    })
 | 
						|
 | 
						|
    (mkIf config.security.forcePageTableIsolation {
 | 
						|
      boot.kernelParams = [ "pti=on" ];
 | 
						|
    })
 | 
						|
 | 
						|
    (mkIf (config.security.virtualisation.flushL1DataCache != null) {
 | 
						|
      boot.kernelParams = [ "kvm-intel.vmentry_l1d_flush=${config.security.virtualisation.flushL1DataCache}" ];
 | 
						|
    })
 | 
						|
  ];
 | 
						|
}
 |