Skip to content

Commit

Permalink
Don't allow setting dead server last contact threshold to less than 1…
Browse files Browse the repository at this point in the history
… minute (#22040)

* Don't allow setting dead server last contact threshold to less than 1 minute

* add changelog

* document the minimum dead server last contact threshold
  • Loading branch information
raskchanky committed Jul 25, 2023
1 parent 64b50ad commit d407078
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 2 deletions.
3 changes: 3 additions & 0 deletions changelog/22040.txt
@@ -0,0 +1,3 @@
```release-note:improvement
storage/raft: Reject dead_server_last_contact_threshold values below 1m when cleanup_dead_servers is enabled.
```
10 changes: 9 additions & 1 deletion vault/external_tests/raft/raft_autopilot_test.go
Expand Up @@ -194,6 +194,14 @@ func TestRaft_Autopilot_Configuration(t *testing.T) {
writeConfigFunc(writableConfig, true)
configCheckFunc(config)

// Check dead server last contact threshold minimum
writableConfig = map[string]interface{}{
"cleanup_dead_servers": true,
"dead_server_last_contact_threshold": "5s",
}
writeConfigFunc(writableConfig, true)
configCheckFunc(config)

// Ensure that the configuration stays across reboots
leaderCore := cluster.Cores[0]
testhelpers.EnsureCoreSealed(t, cluster.Cores[0])
Expand Down Expand Up @@ -450,7 +458,7 @@ func TestRaft_Autopilot_DeadServerCleanup(t *testing.T) {
// Ensure Autopilot has the aggressive settings
config.CleanupDeadServers = true
config.ServerStabilizationTime = 5 * time.Second
config.DeadServerLastContactThreshold = 10 * time.Second
config.DeadServerLastContactThreshold = 1 * time.Minute
config.MaxTrailingLogs = 10
config.LastContactThreshold = 10 * time.Second
config.MinQuorum = 3
Expand Down
4 changes: 4 additions & 0 deletions vault/logical_system_raft.go
Expand Up @@ -533,6 +533,10 @@ func (b *SystemBackend) handleStorageRaftAutopilotConfigUpdate() framework.Opera
return logical.ErrorResponse(fmt.Sprintf("min_quorum must be set when cleanup_dead_servers is set and it should at least be 3; cleanup_dead_servers: %#v, min_quorum: %#v", effectiveConf.CleanupDeadServers, effectiveConf.MinQuorum)), logical.ErrInvalidRequest
}

if effectiveConf.CleanupDeadServers && effectiveConf.DeadServerLastContactThreshold.Seconds() < 60 {
return logical.ErrorResponse(fmt.Sprintf("dead_server_last_contact_threshold should not be set to less than 1m; received: %v", deadServerLastContactThreshold)), logical.ErrInvalidRequest
}

// Persist only the user supplied fields
if persist {
entry, err := logical.StorageEntryJSON(raftAutopilotConfigurationStoragePath, config)
Expand Down
3 changes: 2 additions & 1 deletion website/content/api-docs/system/storage/raftautopilot.mdx
Expand Up @@ -210,7 +210,8 @@ This endpoint is used to modify the configuration of the autopilot subsystem of

- `dead_server_last_contact_threshold` `(string: "24h")` - Limit on the amount of time
a server can go without leader contact before being considered failed. This
takes effect only when `cleanup_dead_servers` is `true`.
takes effect only when `cleanup_dead_servers` is `true`. This cannot be set to a value
smaller than 1m.

- `max_trailing_logs` `(int: 1000)` - Amount of entries in the Raft Log that a server
can be behind before being considered unhealthy.
Expand Down

0 comments on commit d407078

Please sign in to comment.