From 7df55477fd04b1443051f767fd77ea42cb50ed59 Mon Sep 17 00:00:00 2001
From: Charles Duffy <charles@dyfis.net>
Date: Sun, 14 Oct 2018 00:40:37 -0500
Subject: [PATCH 1/3] bees: init at 0.6.1

Introduce an extent-layer (as opposed to the existing file-level) deduplication
system for btrfs. This provides a means of finding similarities within
non-identical files, when they contain identical, aligned blocks.
---
 .../filesystems/bees/bees-service-wrapper     | 223 ++++++++++++++++++
 pkgs/tools/filesystems/bees/default.nix       |  69 ++++++
 pkgs/top-level/all-packages.nix               |   2 +
 3 files changed, 294 insertions(+)
 create mode 100755 pkgs/tools/filesystems/bees/bees-service-wrapper
 create mode 100644 pkgs/tools/filesystems/bees/default.nix

diff --git a/pkgs/tools/filesystems/bees/bees-service-wrapper b/pkgs/tools/filesystems/bees/bees-service-wrapper
new file mode 100755
index 00000000000..8ef46afc18f
--- /dev/null
+++ b/pkgs/tools/filesystems/bees/bees-service-wrapper
@@ -0,0 +1,223 @@
+#!@bash@/bin/bash
+PATH=@bash@/bin:@coreutils@/bin:@utillinux@/bin:@btrfsProgs@/bin:$PATH
+beesd_bin=@bees@/lib/bees/bees
+# PLEASE KEEP NIX-ISMS ABOVE THIS LINE TO EASE UPSTREAM MERGE
+#!/usr/bin/env bash
+
+shopt -s extglob
+
+# Upstream wrapper requires UUID to be used for configuration.
+
+# However, when declaratively describing a host, we may not know its UUID, and
+# shouldn't need to persist something that will differ between hosts built from
+# the same configuration template.
+
+# Thus, for using bees from NixOS, we have our own wrapper, which supports not
+# just UUID but any specification permitted by findmnt
+
+[[ $bees_debug ]] && { PS4=':${BASH_SOURCE##*/}:$LINENO+'; set -x; }
+
+usage() { # print the help text to stderr, then exit 1
+  cat >&2 <<EOF # unquoted heredoc: \$(( below is escaped so the examples are shown, not evaluated
+Usage: ${BASH_SOURCE##*/} run|cleanup config-name|fsSpec [idxSizeMB=...] [verbosity=...] [workDir=...] [-- daemon-options...]
+
+  fsSpec should be in a format recognized by findmnt. Alternately,
+  "config-name" may refer to a file that exists in ${bees_config_dir:-/etc/bees}
+  with a .conf extension; if that file does not specify UUID, findmnt will be
+  used in addition.
+
+  Note that while config files may presently use shell arithmetic, use of this
+  functionality is not encouraged going forward: Setting ''idxSizeMB=4096'' is
+  preferred over ''DB_SIZE=\$((1024*1024*1024*4))'' or ''DB_SIZE=\$(( AL16M * 256 ))'',
+  although both of these are presently supported.
+
+  If fsSpec contains a /, it is assumed to be a mount point to be looked up by
+  findmnt, not a config file name.
+
+  daemon-options are passed directly through to the daemon on startup, as
+  documented at https://github.com/Zygo/bees/blob/master/docs/options.md.
+EOF
+  exit 1
+}
+
+die() { printf '%s\n' "$*" >&2; exit 1; } # print message to stderr and exit 1; printf, so a message starting with -n/-e is not parsed as an echo option
+
+allConfigNames=( blockdev fsSpec home idxSize idxSizeMB mntDir runDir status verbosity workDir ) # setting names reported back from an evaluated config file, together with the altConfigNames keys
+
+# Alternate names accepted for configuration values, mapped to the name used by this wrapper; set_option always prepends "bees_" to the result
+declare -A altConfigNames=(
+  # from original bees wrapper
+  [BEESHOME]=home
+  [BEESSTATUS]=status
+  [MNT_DIR]=mntDir
+  [UUID]=uuid
+  [WORK_DIR]=runDir
+  [DB_SIZE]=idxSize
+)
+
+# Legacy bees config files can be arbitrary shell scripts, so evaluate $1 in a restricted bash and print each known, non-empty setting as a NUL-terminated name=value record
+sandboxedConfigFileEval() {
+  local bash_exe; bash_exe=$(type -P bash) || exit # absolute path, because PATH is replaced for the child below
+  PATH=/var/empty ENV='' BASH_ENV='' AL128K="$((128*1024))" AL16M="$((16*1024*1024))" "$bash_exe" -r ${bees_debug+-x} \
+    -c 'eval "$(</dev/stdin)" >&2; for var; do [[ ${!var} ]] && printf "%q=%s\\0" "$var" "${!var}"; done' \
+    _ "${!altConfigNames[@]}" "${allConfigNames[@]}" \
+    <"$1" # the "_" above fills $0; without it the first name would become $0 and be skipped by "for var"
+}
+
+readConfigFileIfExists() { # usage: readConfigFileIfExists path -- returns 1 unless path exists and is non-empty
+  local entry
+  [[ ! -s $1 ]] && return 1
+  while IFS= read -r -d '' entry; do
+    entry=${entry%%+([[:space:]])"#"*}  # cut everything from whitespace followed by "#"
+    [[ $entry ]] || continue
+    if [[ $entry != *=* ]]; then
+      printf 'WARNING: Config file line not recognized: %q\n' "$entry" >&2
+      continue
+    fi
+    set_option "$entry"
+  done < <(sandboxedConfigFileEval "$1")
+}
+
+set_option() { # usage: set_option key=value -- stores value in global bees_<key>, mapping legacy names via altConfigNames
+  local k=${1%%=*} v=${1#*=}
+  [[ $k =~ ^[a-zA-Z_][a-zA-Z0-9_]*$ ]] || { echo "WARNING: Ignoring option with invalid name: $k" >&2; return 1; } # printf -v would evaluate a name like 'x[$(cmd)]'
+  [[ ${altConfigNames[$k]} ]] && k=${altConfigNames[$k]}
+  printf -v "bees_$k" %s "$v"
+}
+
+uuid_re='^[[:xdigit:]]{8}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{4}-[[:xdigit:]]{12}$'
+
+# Shared code for setting configuration used by other operations.
+#
+# Reads from global associative array "opts" containing options passed in as
+# key=value pairs on the command line, looks for config-file overrides, and
+# sets individual global variables.
+_setup() {
+  declare fstype
+  bees_fsSpec=$1; shift
+
+  # Look for file-based configuration, additional to honoring configuration on the command line
+  bees_config_dir="${bees_config_dir:-/etc/bees}"
+  if [[ $bees_fsSpec =~ $uuid_re ]]; then
+    bees_uuid=$bees_fsSpec
+    # If our spec looks like a bare UUID, and no config file exists in the new
+    # format, fall back to legacy config file search mechanism (grep; ewww).
+    if ! readConfigFileIfExists "$bees_config_dir/UUID=$bees_fsSpec.conf"; then
+      # Legacy approach to finding a config file: Grep for a *.conf file
+      # containing the UUID within its text. Permitting spaces around the "="
+      # appears to be a bug, but is retained for compatibility with the
+      # original upstream script.
+      allConfFiles=( "$bees_config_dir"/*.conf )
+      if (( ${#allConfFiles[@]} )); then
+        # in read or readarray with -d '', the NUL terminating the empty string is used as delimiter character.
+        readarray -d '' -t matchingConfFiles < <(grep -E -l -Z "^[^#]*UUID[[:space:]]*=[[:space:]]*" "${allConfFiles[@]}")
+      else
+        matchingConfFiles=( )
+      fi
+      if (( ${#matchingConfFiles[@]} == 1 )); then
+        # Exactly one configuration file exists in our target directory with a reference to the UUID given.
+        bees_config_file=${matchingConfFiles[0]}
+        readConfigFileIfExists "$bees_config_file"
+        echo "NOTE: Please consider renaming $bees_config_file to $bees_config_dir/UUID=$bees_fsSpec" >&2
+        echo "      ...and passing UUID=$bees_fsSpec on startup." >&2
+      elif (( ${#matchingConfFiles[@]} > 1 )); then
+        # The legacy wrapper would silently use the first file and ignore
+        # others, but... no.
+        echo "ERROR: Passed a bare UUID, but multiple configuration files match it:" >&2
+        printf ' - %q\n' "${matchingConfFiles[@]}" >&2
+        die "Unable to continue."
+      fi
+    fi
+  else
+    # For a non-UUID fsSpec that is not a path, look only for a config file
+    # exactly matching its text.
+    #
+    # (Passing a mount point as a fsSpec is only supported with the new
+    # wrapper; all key=value pairs can be passed on the command line in this
+    # mode, so config file support is not needed).
+    [[ $bees_fsSpec = */* ]] || readConfigFileIfExists "$bees_config_dir/$bees_fsSpec.conf"
+  fi
+
+  [[ $bees_uuid ]] || {
+    # if bees_uuid is not in our .conf file, look it up with findmnt
+    read -r bees_uuid fstype < <(findmnt -n -o uuid,fstype "$bees_fsSpec") && [[ $fstype ]] || exit
+    [[ $fstype = btrfs ]] || die "Device type is $fstype, not btrfs"
+  }
+
+  [[ $bees_uuid = */* ]] || readConfigFileIfExists "$bees_config_dir/UUID=$bees_uuid.conf"
+
+  # Honor any values read from config files above; otherwise, set defaults.
+  bees_workDir="${bees_workDir:-.beeshome}"
+  bees_runDir="${bees_runDir:-/run/bees}"
+  bees_mntDir="${bees_mntDir:-$bees_runDir/mnt/$bees_uuid}"
+  bees_home="${bees_home:-$bees_mntDir/$bees_workDir}"
+  bees_status="${bees_status:-${bees_runDir}/$bees_uuid.status}"
+  bees_verbosity="${bees_verbosity:-6}"
+  bees_idxSizeMB="${bees_idxSizeMB:-1024}"
+  bees_idxSize=${bees_idxSize:-"$(( bees_idxSizeMB * 1024 * 1024 ))"}
+  bees_blockdev=${bees_blockdev:-"/dev/disk/by-uuid/$bees_uuid"}
+
+  [[ -b $bees_blockdev ]] || die "Block device $bees_blockdev missing"
+  (( bees_idxSize % (16 * 1024 * 1024) == 0 )) || die "DB size must be divisible by 16MB"
+}
+
+do_run() { # usage: do_run fsSpec [daemon-options...] -- mount the filesystem root, size the hash table file, then exec bees
+  local db old_db_size new_db_size
+
+  _setup "$1"; shift
+  mkdir -p -- "$bees_mntDir" || exit
+
+  # subvol id 5 is reserved for the root subvolume of a btrfs filesystem.
+  mountpoint -q "$bees_mntDir" || mount -osubvolid=5 -- "$bees_blockdev" "$bees_mntDir" || exit
+  if [[ -d $bees_home ]]; then
+    btrfs subvolume show "$bees_home" >/dev/null 2>&1 || die "$bees_home exists but is not a subvolume"
+  else
+    btrfs subvolume create "$bees_home" || exit
+    sync # workaround for Zygo/bees#93
+  fi
+  db=$bees_home/beeshash.dat
+  touch -- "$db" || exit
+
+  old_db_size=$(stat -c %s -- "$db") || exit
+  new_db_size=$bees_idxSize
+
+  if (( old_db_size != new_db_size )); then
+    rm -f -- "$bees_home"/beescrawl."$bees_uuid".dat # the beescrawl file is removed whenever the hash table size changes
+    truncate -s "$new_db_size" -- "$db" || exit
+  fi
+  chmod 700 -- "$bees_home"
+
+  # BEESSTATUS and BEESHOME are the only variables handled by the legacy
+  # wrapper for which getenv() is called in C code.
+  BEESSTATUS=$bees_status BEESHOME=$bees_home exec "${beesd_bin:-/lib/bees/bees}" \
+    --verbose "$bees_verbosity" \
+    "$@" "$bees_mntDir" || exit
+}
+
+do_cleanup() { # usage: do_cleanup fsSpec -- lazily unmount the bees mount point if it is currently mounted
+  _setup "$1"; shift
+  ! mountpoint -q "$bees_mntDir" || umount -l -- "$bees_mntDir" || exit # nothing mounted counts as success
+}
+
+(( $# < 2 )) && usage
+declare -f "do_$1" &>/dev/null || usage
+mode=$1; shift # selects the do_* function to call; currently "run" or "cleanup"
+
+args=( "$1" ); shift  # the config-name|fsSpec argument is always forwarded untouched
+
+# Remaining arguments: key=value pairs become settings, anything else is forwarded as-is,
+# and everything following a literal "--" is forwarded without further inspection.
+while (( $# )); do
+  case $1 in
+    *=*) set_option "$1" ;;
+    --)
+      shift
+      args+=( "$@" )
+      break ;;
+    *)
+      args+=( "$1" ) ;;
+  esac
+  shift
+done
+
+"do_$mode" "${args[@]}"
diff --git a/pkgs/tools/filesystems/bees/default.nix b/pkgs/tools/filesystems/bees/default.nix
new file mode 100644
index 00000000000..c43962cb075
--- /dev/null
+++ b/pkgs/tools/filesystems/bees/default.nix
@@ -0,0 +1,69 @@
+{ stdenv, runCommand, makeWrapper, fetchFromGitHub, bash, btrfs-progs, coreutils, pythonPackages, utillinux }:
+
+let
+
+  version = "0.6.1";
+  sha256 = "0h7idclmhyp14mq6786x7f2237vqpn70gyi88ik4g70xl84yfgyh";
+
+  bees = stdenv.mkDerivation rec {
+    name = "bees-${version}";
+    inherit version;
+
+    src = fetchFromGitHub {
+      owner = "Zygo";
+      repo = "bees";
+      rev = "v${version}";
+      inherit sha256;
+    };
+
+    buildInputs = [
+      btrfs-progs               # for btrfs/ioctl.h
+      utillinux                 # for uuid.h
+    ];
+
+    nativeBuildInputs = [
+      pythonPackages.markdown   # documentation build
+    ];
+
+    preBuild = ''
+      git() { if [[ $1 = describe ]]; then echo ${version}; else command git "$@"; fi; }
+      export -f git
+    '';
+
+    postBuild = ''
+      unset -f git
+    '';
+
+    buildFlags = [
+      "ETC_PREFIX=/var/run/bees/configs"
+    ];
+
+    makeFlags = [
+      "SHELL=bash"
+      "PREFIX=$(out)"
+      "ETC_PREFIX=$(out)/etc"
+      "BEES_VERSION=${version}"
+      "SYSTEMD_SYSTEM_UNIT_DIR=$(out)/etc/systemd/system"
+    ];
+
+    meta = with stdenv.lib; {
+      homepage = "https://github.com/Zygo/bees";
+      description = "Block-oriented BTRFS deduplication service";
+      license = licenses.gpl3;
+      platforms = platforms.linux;
+      maintainers = with maintainers; [ chaduffy ];
+      longDescription = "Best-Effort Extent-Same: bees finds not just identical files, but also identical extents within files that differ";
+    };
+  };
+
+in
+
+runCommand "bees-service-${version}" {
+  inherit bash bees coreutils utillinux;
+  btrfsProgs = btrfs-progs; # needs to be a valid shell variable name
+} ''
+  mkdir -p -- "$out/bin"
+  substituteAll ${./bees-service-wrapper} "$out"/bin/bees-service-wrapper
+  chmod +x "$out"/bin/bees-service-wrapper
+  ln -s ${bees}/bin/beesd "$out"/bin/beesd
+''
diff --git a/pkgs/top-level/all-packages.nix b/pkgs/top-level/all-packages.nix
index 09f566ccd7f..c7fe0a978c6 100644
--- a/pkgs/top-level/all-packages.nix
+++ b/pkgs/top-level/all-packages.nix
@@ -21893,6 +21893,8 @@ with pkgs;
 
   beep = callPackage ../misc/beep { };
 
+  bees = callPackage ../tools/filesystems/bees { };
+
   blackbird = callPackage ../misc/themes/blackbird { };
 
   bootil = callPackage ../development/libraries/bootil { };

From 86db2f394cdc8d96f84c50f92da0e5bb96843b52 Mon Sep 17 00:00:00 2001
From: Charles Duffy <charles@dyfis.net>
Date: Sun, 14 Oct 2018 10:58:56 -0500
Subject: [PATCH 2/3] nixos/modules: services.bees init

---
 nixos/modules/module-list.nix        |   1 +
 nixos/modules/services/misc/bees.nix | 123 +++++++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 nixos/modules/services/misc/bees.nix

diff --git a/nixos/modules/module-list.nix b/nixos/modules/module-list.nix
index ae5084ca2a2..5074976fafa 100644
--- a/nixos/modules/module-list.nix
+++ b/nixos/modules/module-list.nix
@@ -336,6 +336,7 @@
   ./services/misc/apache-kafka.nix
   ./services/misc/autofs.nix
   ./services/misc/autorandr.nix
+  ./services/misc/bees.nix
   ./services/misc/bepasty.nix
   ./services/misc/canto-daemon.nix
   ./services/misc/calibre-server.nix
diff --git a/nixos/modules/services/misc/bees.nix b/nixos/modules/services/misc/bees.nix
new file mode 100644
index 00000000000..b0ed2d5c286
--- /dev/null
+++ b/nixos/modules/services/misc/bees.nix
@@ -0,0 +1,123 @@
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+
+  cfg = config.services.beesd;
+
+  logLevels = { emerg = 0; alert = 1; crit = 2; err = 3; warning = 4; notice = 5; info = 6; debug = 7; };
+
+  fsOptions = with types; {
+    options.spec = mkOption {
+      type = str;
+      description = ''
+        Description of how to identify the filesystem to be deduplicated by this
+        instance of bees. Note that deduplication crosses subvolumes; one must
+        not configure multiple instances for subvolumes of the same filesystem
+        (or block devices which are part of the same filesystem), but only for
+        completely independent btrfs filesystems.
+        </para>
+        <para>
+        This must be in a format usable by findmnt; that could be a key=value
+        pair, or a bare path to a mount point.
+      '';
+      example = "LABEL=MyBulkDataDrive";
+    };
+    options.hashTableSizeMB = mkOption {
+      type = types.addCheck types.int (n: mod n 16 == 0);
+      default = 1024; # 1GB; default from upstream beesd script
+      description = ''
+        Hash table size in MB; must be a multiple of 16.
+        </para>
+        <para>
+        A larger ratio of index size to storage size means smaller blocks of
+        duplicate content are recognized.
+        </para>
+        <para>
+        If you have 1TB of data, a 4GB hash table (which is to say, a value of
+        4096) will permit 4KB extents (the smallest possible size) to be
+        recognized, whereas a value of 1024 -- creating a 1GB hash table --
+        will recognize only aligned duplicate blocks of 16KB.
+      '';
+    };
+    options.verbosity = mkOption {
+      type = types.enum (attrNames logLevels ++ attrValues logLevels);
+      apply = v: if isString v then logLevels.${v} else v;
+      default = "info";
+      description = "Log verbosity (syslog keyword/level).";
+    };
+    options.workDir = mkOption {
+      type = str;
+      default = ".beeshome";
+      description = ''
+        Name (relative to the root of the filesystem) of the subvolume where
+        the hash table will be stored.
+      '';
+    };
+    options.extraOptions = mkOption {
+      type = listOf str;
+      default = [];
+      description = ''
+        Extra command-line options passed to the daemon. See upstream bees documentation.
+      '';
+      example = literalExample ''
+        [ "--thread-count" "4" ]
+      '';
+    };
+  };
+
+in {
+
+  options.services.beesd = {
+    filesystems = mkOption {
+      type = with types; attrsOf (submodule fsOptions);
+      description = "BTRFS filesystems to run block-level deduplication on.";
+      default = { };
+      example = literalExample ''
+        {
+          root = {
+            spec = "LABEL=root";
+            hashTableSizeMB = 2048;
+            verbosity = "crit";
+            extraOptions = [ "--loadavg-target" "5.0" ];
+          };
+        }
+      '';
+    };
+  };
+  config = {
+    systemd.services = mapAttrs' (name: fs: nameValuePair "beesd@${name}" {
+      description = "Block-level BTRFS deduplication for %i";
+      after = [ "sysinit.target" ];
+
+      serviceConfig = let
+        configOpts = [
+          fs.spec
+          "verbosity=${toString fs.verbosity}"
+          "idxSizeMB=${toString fs.hashTableSizeMB}"
+          "workDir=${fs.workDir}"
+        ];
+        configOptsStr = escapeShellArgs configOpts;
+      in {
+        # Values from https://github.com/Zygo/bees/blob/v0.6.1/scripts/beesd%40.service.in
+        ExecStart = "${pkgs.bees}/bin/bees-service-wrapper run ${configOptsStr} -- --no-timestamps ${escapeShellArgs fs.extraOptions}";
+        ExecStopPost = "${pkgs.bees}/bin/bees-service-wrapper cleanup ${configOptsStr}";
+        CPUAccounting = true;
+        CPUWeight = 12;
+        IOSchedulingClass = "idle";
+        IOSchedulingPriority = 7;
+        IOWeight = 10;
+        KillMode = "control-group";
+        KillSignal = "SIGTERM";
+        MemoryAccounting = true;
+        Nice = 19;
+        Restart = "on-abnormal";
+        StartupCPUWeight = 25;
+        StartupIOWeight = 25;
+        SyslogIdentifier = "bees"; # would otherwise be "bees-service-wrapper"
+      };
+      wantedBy = ["multi-user.target"];
+    }) cfg.filesystems;
+  };
+}

From f50bfe267a312515d88e86c12ae002c4feefcc1f Mon Sep 17 00:00:00 2001
From: Charles Duffy <charles@dyfis.net>
Date: Mon, 26 Nov 2018 13:47:58 -0600
Subject: [PATCH 3/3] nixos.tests.bees: init

---
 nixos/tests/bees.nix | 55 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 nixos/tests/bees.nix

diff --git a/nixos/tests/bees.nix b/nixos/tests/bees.nix
new file mode 100644
index 00000000000..6f68c2f834f
--- /dev/null
+++ b/nixos/tests/bees.nix
@@ -0,0 +1,55 @@
+import ./make-test.nix ({ lib, ... }:
+{
+  name = "bees";
+
+  machine = { config, pkgs, ... }: {
+    boot.initrd.postDeviceCommands = ''
+      ${pkgs.btrfs-progs}/bin/mkfs.btrfs -f -L aux1 /dev/vdb
+      ${pkgs.btrfs-progs}/bin/mkfs.btrfs -f -L aux2 /dev/vdc
+    '';
+    virtualisation.emptyDiskImages = [ 4096 4096 ];
+    fileSystems = lib.mkVMOverride {
+      "/aux1" = { # filesystem configured to be deduplicated
+        device = "/dev/disk/by-label/aux1";
+        fsType = "btrfs";
+      };
+      "/aux2" = { # filesystem not configured to be deduplicated
+        device = "/dev/disk/by-label/aux2";
+        fsType = "btrfs";
+      };
+    };
+    services.beesd.filesystems = {
+      aux1 = {
+        spec = "LABEL=aux1";
+        hashTableSizeMB = 16;
+        verbosity = "debug";
+      };
+    };
+  };
+
+  testScript =
+  let
+    withRetry = content: maxTests: sleepTime: ''
+      max_tests=${lib.escapeShellArg maxTests}; sleep_time=${lib.escapeShellArg sleepTime}; for ((i=0; i<max_tests; i++)); do ${content} && exit 0; sleep "$sleep_time"; done; exit 1;
+    '';
+    someContentIsShared = loc: ''[[ $(btrfs fi du -s --raw ${lib.escapeShellArg loc}/dedup-me-{1,2} | awk 'BEGIN { count=0; } NR>1 && $3 == 0 { count++ } END { print count }') -eq 0 ]]'';
+  in ''
+    # shut down the instance started by systemd at boot, so we can test our test procedure
+    $machine->succeed("systemctl stop beesd\@aux1.service");
+
+    $machine->succeed("dd if=/dev/urandom of=/aux1/dedup-me-1 bs=1M count=8");
+    $machine->succeed("cp --reflink=never /aux1/dedup-me-1 /aux1/dedup-me-2");
+    $machine->succeed("cp --reflink=never /aux1/* /aux2/");
+    $machine->succeed("sync");
+    $machine->fail(q(${someContentIsShared "/aux1"}));
+    $machine->fail(q(${someContentIsShared "/aux2"}));
+    $machine->succeed("systemctl start beesd\@aux1.service");
+
+    # assert that "Set Shared" column is nonzero
+    $machine->succeed(q(${withRetry (someContentIsShared "/aux1") 20 2}));
+    $machine->fail(q(${someContentIsShared "/aux2"}));
+
+    # assert that 16MB hash table size requested was honored
+    $machine->succeed(q([[ $(stat -c %s /aux1/.beeshome/beeshash.dat) = $(( 16 * 1024 * 1024)) ]]))
+  '';
+})