diff --git a/nixos/modules/misc/ids.nix b/nixos/modules/misc/ids.nix index c368cd91186..d9ba2efa0c8 100644 --- a/nixos/modules/misc/ids.nix +++ b/nixos/modules/misc/ids.nix @@ -337,6 +337,7 @@ alerta = 310; minetest = 311; rss2email = 312; + cockroachdb = 313; # When adding a uid, make sure it doesn't match an existing gid. And don't use uids above 399! @@ -634,6 +635,7 @@ alerta = 310; minetest = 311; rss2email = 312; + cockroachdb = 313; # When adding a gid, make sure it doesn't match an existing # uid. Users and groups with the same name should have equal diff --git a/nixos/modules/module-list.nix b/nixos/modules/module-list.nix index 75e8446523f..7a6fbab7c36 100644 --- a/nixos/modules/module-list.nix +++ b/nixos/modules/module-list.nix @@ -212,6 +212,7 @@ ./services/databases/aerospike.nix ./services/databases/cassandra.nix ./services/databases/clickhouse.nix + ./services/databases/cockroachdb.nix ./services/databases/couchdb.nix ./services/databases/firebird.nix ./services/databases/foundationdb.nix diff --git a/nixos/modules/services/databases/cockroachdb.nix b/nixos/modules/services/databases/cockroachdb.nix new file mode 100644 index 00000000000..1bc20a25804 --- /dev/null +++ b/nixos/modules/services/databases/cockroachdb.nix @@ -0,0 +1,221 @@ +{ config, lib, pkgs, ... }: + +with lib; + +let + cfg = config.services.cockroachdb; + crdb = cfg.package; + + escape = builtins.replaceStrings ["%"] ["%%"]; + ifNotNull = v: s: optionalString (!isNull v) s; + + startupCommand = lib.concatStringsSep " " + [ # Basic startup + "${crdb}/bin/cockroach start" + "--logtostderr" + "--store=${cfg.dataDir}" + (ifNotNull cfg.locality "--locality='${cfg.locality}'") + + # WebUI settings + "--http-addr='${cfg.http.address}:${toString cfg.http.port}'" + + # Cluster listen address + "--listen-addr='${cfg.listen.address}:${toString cfg.listen.port}'" + + # Cluster configuration + (ifNotNull cfg.join "--join=${cfg.join}") + + # Cache and memory settings. Must be escaped. + "--cache='${escape cfg.cache}'" + "--max-sql-memory='${escape cfg.maxSqlMemory}'" + + # Certificate/security settings. + (if cfg.insecure then "--insecure" else "--certs-dir=${cfg.certsDir}") + ]; + + addressOption = descr: defaultPort: { + address = mkOption { + type = types.str; + default = "localhost"; + description = "Address to bind to for ${descr}"; + }; + + port = mkOption { + type = types.int; + default = defaultPort; + description = "Port to bind to for ${descr}"; + }; + }; +in + +{ + options = { + services.cockroachdb = { + enable = mkEnableOption "CockroachDB Server"; + + listen = addressOption "intra-cluster communication" 26257; + + http = addressOption "http-based Admin UI" 8080; + + locality = mkOption { + type = types.nullOr types.str; + default = null; + description = '' + An ordered, comma-separated list of key-value pairs that describe the + topography of the machine. Topography might include country, + datacenter or rack designations. Data is automatically replicated to + maximize diversities of each tier. The order of tiers is used to + determine the priority of the diversity, so the more inclusive + localities like country should come before less inclusive localities + like datacenter. The tiers and order must be the same on all nodes. + Including more tiers is better than including fewer. For example: + + country=us,region=us-west,datacenter=us-west-1b,rack=12 + country=ca,region=ca-east,datacenter=ca-east-2,rack=4 + + planet=earth,province=manitoba,colo=secondary,power=3 + ''; + }; + + join = mkOption { + type = types.nullOr types.str; + default = null; + description = "The addresses for connecting the node to a cluster."; + }; + + dataDir = mkOption { + type = types.path; + default = "/var/lib/cockroachdb"; + description = "Location where CockroachDB stores its table files"; + }; + + insecure = mkOption { + type = types.bool; + default = false; + description = "Run in insecure mode."; + }; + + certsDir = mkOption { + type = types.nullOr types.path; + default = null; + description = "The path to the certificate directory."; + }; + + user = mkOption { + type = types.str; + default = "cockroachdb"; + description = "User account under which CockroachDB runs"; + }; + + group = mkOption { + type = types.str; + default = "cockroachdb"; + description = "User account under which CockroachDB runs"; + }; + + openPorts = mkOption { + type = types.bool; + default = false; + description = "Open firewall ports for cluster communication by default"; + }; + + cache = mkOption { + type = types.str; + default = "25%"; + description = '' + The total size for caches. + + This can be a percentage, expressed with a fraction sign or as a + decimal-point number, or any bytes-based unit. For example, "25%", + "0.25" both represent 25% of the available system memory. The values + "1000000000" and "1GB" both represent 1 gigabyte of memory. + ''; + }; + + maxSqlMemory = mkOption { + type = types.str; + default = "25%"; + description = '' + The maximum in-memory storage capacity available to store temporary + data for SQL queries. + + This can be a percentage, expressed with a fraction sign or as a + decimal-point number, or any bytes-based unit. For example, "25%", + "0.25" both represent 25% of the available system memory. The values + "1000000000" and "1GB" both represent 1 gigabyte of memory. + ''; + }; + + package = mkOption { + type = types.package; + default = pkgs.cockroachdb; + description = '' + The CockroachDB derivation to use for running the service. + + This would primarily be useful to enable Enterprise Edition features + in your own custom CockroachDB build (Nixpkgs CockroachDB binaries + only contain open source features and open source code). + ''; + }; + }; + }; + + config = mkIf config.services.cockroachdb.enable { + assertions = [ + { assertion = !cfg.insecure -> !(isNull cfg.certsDir); + message = "CockroachDB must have a set of SSL certificates (.certsDir), or run in Insecure Mode (.insecure = true)"; + } + ]; + + environment.systemPackages = [ crdb ]; + + users.users = optionalAttrs (cfg.user == "cockroachdb") (singleton + { name = "cockroachdb"; + description = "CockroachDB Server User"; + uid = config.ids.uids.cockroachdb; + group = cfg.group; + }); + + users.groups = optionalAttrs (cfg.group == "cockroachdb") (singleton + { name = "cockroachdb"; + gid = config.ids.gids.cockroachdb; + }); + + networking.firewall.allowedTCPPorts = lib.optionals cfg.openPorts + [ cfg.http.port cfg.listen.port ]; + + systemd.services.cockroachdb = + { description = "CockroachDB Server"; + documentation = [ "man:cockroach(1)" "https://www.cockroachlabs.com" ]; + + after = [ "network.target" "time-sync.target" ]; + requires = [ "time-sync.target" ]; + wantedBy = [ "multi-user.target" ]; + + unitConfig.RequiresMountsFor = "${cfg.dataDir}"; + + preStart = '' + if ! test -e ${cfg.dataDir}; then + mkdir -m 0700 -p ${cfg.dataDir} + chown -R ${cfg.user} ${cfg.dataDir} + fi + ''; + + serviceConfig = + { ExecStart = startupCommand; + Type = "notify"; + User = cfg.user; + PermissionsStartOnly = true; + + Restart = "always"; + TimeoutStopSec="60"; + RestartSec="10"; + StandardOutput="syslog"; + StandardError="syslog"; + SyslogIdentifier="cockroach"; + }; + }; + }; + + meta.maintainers = with lib.maintainers; [ thoughtpolice ]; +} diff --git a/nixos/tests/cockroachdb.nix b/nixos/tests/cockroachdb.nix new file mode 100644 index 00000000000..56c624d8cf2 --- /dev/null +++ b/nixos/tests/cockroachdb.nix @@ -0,0 +1,126 @@ +# This performs a full 'end-to-end' test of a multi-node CockroachDB cluster +# using the built-in 'cockroach workload' command, to simulate a semi-realistic +# test load. It generally takes anywhere from 3-5 minutes to run and 1-2GB of +# RAM (though each of 3 workers gets 1GB allocated) +# +# CockroachDB requires synchronized system clocks within a small error window +# (~500ms by default) on each node in order to maintain a multi-node cluster. +# Cluster joins that are outside this window will fail, and nodes that skew +# outside the window after joining will promptly get kicked out. +# +# To accomodate this, we use QEMU/virtio infrastructure and load the 'ptp_kvm' +# driver inside a guest. This driver allows the host machine to pass its clock +# through to the guest as a hardware clock that appears as a Precision Time +# Protocol (PTP) Clock device, generally /dev/ptp0. PTP devices can be measured +# and used as hardware reference clocks (similar to an on-board GPS clock) by +# NTP software. In our case, we use Chrony to synchronize to the reference +# clock. +# +# This test is currently NOT enabled as a continuously-checked NixOS test. +# Ideally, this test would be run by Hydra and Borg on all relevant changes, +# except: +# +# - Not every build machine is compatible with the ptp_kvm driver. +# Virtualized EC2 instances, for example, do not support loading the ptp_kvm +# driver into guests. However, bare metal builders (e.g. Packet) do seem to +# work just fine. In practice, this means x86_64-linux builds would fail +# randomly, depending on which build machine got the job. (This is probably +# worth some investigation; I imagine it's based on ptp_kvm's usage of paravirt +# support which may not be available in 'nested' environments.) +# +# - ptp_kvm is not supported on aarch64, otherwise it seems likely Cockroach +# could be tested there, as well. This seems to be due to the usage of +# the TSC in ptp_kvm, which isn't supported (easily) on AArch64. (And: +# testing stuff, not just making sure it builds, is important to ensure +# aarch64 support remains viable.) +# +# For future developers who are reading this message, are daring and would want +# to fix this, some options are: +# +# - Just test a single node cluster instead (boring and less thorough). +# - Move all CI to bare metal packet builders, and we can at least do x86_64-linux. +# - Get virtualized clocking working in aarch64, somehow. +# - Add a 4th node that acts as an NTP service and uses no PTP clocks for +# references, at the client level. This bloats the node and memory +# requirements, but would probably allow both aarch64/x86_64 to work. +# + +let + + # Creates a node. If 'joinNode' parameter, a string containing an IP address, + # is non-null, then the CockroachDB server will attempt to join/connect to + # the cluster node specified at that address. + makeNode = locality: myAddr: joinNode: + { nodes, pkgs, lib, config, ... }: + + { + # Bank/TPC-C benchmarks take some memory to complete + virtualisation.memorySize = 1024; + + # Install the KVM PTP "Virtualized Clock" driver. This allows a /dev/ptp0 + # device to appear as a reference clock, synchronized to the host clock. + # Because CockroachDB *requires* a time-synchronization mechanism for + # the system time in a cluster scenario, this is necessary to work. + boot.kernelModules = [ "ptp_kvm" ]; + + # Enable and configure Chrony, using the given virtualized clock passed + # through by KVM. + services.chrony.enable = true; + services.chrony.servers = lib.mkForce [ ]; + services.chrony.extraConfig = '' + refclock PHC /dev/ptp0 poll 2 prefer require refid KVM + makestep 0.1 3 + ''; + + # Enable CockroachDB. In order to ensure that Chrony has performed its + # first synchronization at boot-time (which may take ~10 seconds) before + # starting CockroachDB, we block the ExecStartPre directive using the + # 'waitsync' command. This ensures Cockroach doesn't have its system time + # leap forward out of nowhere during startup/execution. + # + # Note that the default threshold for NTP-based skew in CockroachDB is + # ~500ms by default, so making sure it's started *after* accurate time + # synchronization is extremely important. + services.cockroachdb.enable = true; + services.cockroachdb.insecure = true; + services.cockroachdb.openPorts = true; + services.cockroachdb.locality = locality; + services.cockroachdb.listen.address = myAddr; + services.cockroachdb.join = lib.mkIf (joinNode != null) joinNode; + + # Hold startup until Chrony has performed its first measurement (which + # will probably result in a full timeskip, thanks to makestep) + systemd.services.cockroachdb.preStart = '' + ${pkgs.chrony}/bin/chronyc waitsync + ''; + }; + +in import ./make-test.nix ({ pkgs, ...} : { + name = "cockroachdb"; + meta.maintainers = with pkgs.stdenv.lib.maintainers; + [ thoughtpolice ]; + + nodes = rec { + node1 = makeNode "country=us,region=east,dc=1" "192.168.1.1" null; + node2 = makeNode "country=us,region=west,dc=2b" "192.168.1.2" "192.168.1.1"; + node3 = makeNode "country=eu,region=west,dc=2" "192.168.1.3" "192.168.1.1"; + }; + + # NOTE: All the nodes must start in order and you must NOT use startAll, because + # there's otherwise no way to guarantee that node1 will start before the others try + # to join it. + testScript = '' + $node1->start; + $node1->waitForUnit("cockroachdb"); + + $node2->start; + $node2->waitForUnit("cockroachdb"); + + $node3->start; + $node3->waitForUnit("cockroachdb"); + + $node1->mustSucceed("cockroach sql --host=192.168.1.1 --insecure -e 'SHOW ALL CLUSTER SETTINGS' 2>&1"); + $node1->mustSucceed("cockroach workload init bank 'postgresql://root\@192.168.1.1:26257?sslmode=disable'"); + $node1->mustSucceed("cockroach workload run bank --duration=1m 'postgresql://root\@192.168.1.1:26257?sslmode=disable'"); + ''; +})