From 212f2dea6336e8baa35614fa9df4ef8ba41b207a Mon Sep 17 00:00:00 2001 From: Josh Black Date: Mon, 24 Jul 2023 16:05:53 -0700 Subject: [PATCH 1/4] Don't allow setting dead server last contact threshold to less than 1 minute --- vault/external_tests/raft/raft_autopilot_test.go | 8 ++++++++ vault/logical_system_raft.go | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/vault/external_tests/raft/raft_autopilot_test.go b/vault/external_tests/raft/raft_autopilot_test.go index 5b324876b85e3..d50d015fa3156 100644 --- a/vault/external_tests/raft/raft_autopilot_test.go +++ b/vault/external_tests/raft/raft_autopilot_test.go @@ -194,6 +194,14 @@ func TestRaft_Autopilot_Configuration(t *testing.T) { writeConfigFunc(writableConfig, true) configCheckFunc(config) + // Check dead server last contact threshold minimum + writableConfig = map[string]interface{}{ + "cleanup_dead_servers": true, + "dead_server_last_contact_threshold": "5s", + } + writeConfigFunc(writableConfig, true) + configCheckFunc(config) + // Ensure that the configuration stays across reboots leaderCore := cluster.Cores[0] testhelpers.EnsureCoreSealed(t, cluster.Cores[0]) diff --git a/vault/logical_system_raft.go b/vault/logical_system_raft.go index ca475eddcc77c..483c24fc5b8b3 100644 --- a/vault/logical_system_raft.go +++ b/vault/logical_system_raft.go @@ -533,6 +533,10 @@ func (b *SystemBackend) handleStorageRaftAutopilotConfigUpdate() framework.Opera return logical.ErrorResponse(fmt.Sprintf("min_quorum must be set when cleanup_dead_servers is set and it should at least be 3; cleanup_dead_servers: %#v, min_quorum: %#v", effectiveConf.CleanupDeadServers, effectiveConf.MinQuorum)), logical.ErrInvalidRequest } + if effectiveConf.CleanupDeadServers && effectiveConf.DeadServerLastContactThreshold.Seconds() < 60 { + return logical.ErrorResponse(fmt.Sprintf("dead_server_last_contact_threshold should not be set to less than 1m; received: %v", deadServerLastContactThreshold)), logical.ErrInvalidRequest + } + // Persist only the user supplied fields if persist { entry, err := logical.StorageEntryJSON(raftAutopilotConfigurationStoragePath, config) From c1099e5375a5db119866927dae887361b2019e9e Mon Sep 17 00:00:00 2001 From: Josh Black Date: Tue, 25 Jul 2023 09:01:49 -0700 Subject: [PATCH 2/4] add changelog --- changelog/22040.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changelog/22040.txt diff --git a/changelog/22040.txt b/changelog/22040.txt new file mode 100644 index 0000000000000..e96a428b95af8 --- /dev/null +++ b/changelog/22040.txt @@ -0,0 +1,3 @@ +```release-note:improvement +storage/raft: Cap the minimum dead_server_last_contact_threshold to 1m. +``` From 1527861a3e119b83bdf8f481fb5e84219b538dd7 Mon Sep 17 00:00:00 2001 From: Josh Black Date: Tue, 25 Jul 2023 09:02:17 -0700 Subject: [PATCH 3/4] document the minimum dead server last contact threshold --- website/content/api-docs/system/storage/raftautopilot.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/content/api-docs/system/storage/raftautopilot.mdx b/website/content/api-docs/system/storage/raftautopilot.mdx index 18eff4c8ded47..6c8e7c9d5fecb 100644 --- a/website/content/api-docs/system/storage/raftautopilot.mdx +++ b/website/content/api-docs/system/storage/raftautopilot.mdx @@ -210,7 +210,8 @@ This endpoint is used to modify the configuration of the autopilot subsystem of - `dead_server_last_contact_threshold` `(string: "24h")` - Limit on the amount of time a server can go without leader contact before being considered failed. This - takes effect only when `cleanup_dead_servers` is `true`. + takes effect only when `cleanup_dead_servers` is `true`. This can not be set to a value + smaller than 1m. - `max_trailing_logs` `(int: 1000)` - Amount of entries in the Raft Log that a server can be behind before being considered unhealthy. From cc2c78ff83524b37b298dd90bbb8b5210ace991b Mon Sep 17 00:00:00 2001 From: Josh Black Date: Tue, 25 Jul 2023 13:29:59 -0700 Subject: [PATCH 4/4] adjust test with new value --- vault/external_tests/raft/raft_autopilot_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vault/external_tests/raft/raft_autopilot_test.go b/vault/external_tests/raft/raft_autopilot_test.go index d50d015fa3156..3be64b676b511 100644 --- a/vault/external_tests/raft/raft_autopilot_test.go +++ b/vault/external_tests/raft/raft_autopilot_test.go @@ -458,7 +458,7 @@ func TestRaft_Autopilot_DeadServerCleanup(t *testing.T) { // Ensure Autopilot has the aggressive settings config.CleanupDeadServers = true config.ServerStabilizationTime = 5 * time.Second - config.DeadServerLastContactThreshold = 10 * time.Second + config.DeadServerLastContactThreshold = 1 * time.Minute config.MaxTrailingLogs = 10 config.LastContactThreshold = 10 * time.Second config.MinQuorum = 3