Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distribution metric to monitor the start of get-entries requests #1364

Merged
merged 1 commit into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

* Add build tags for AIX operating system

### Monitoring
* Add a distribution metric to monitor the start of get-entries requests

### Misc

* Return HTTP 504 instead of HTTP 408 upon timeout or cancellation of a backend connection context by @robstradling in https://github.com/google/certificate-transparency-go/pull/1313
Expand Down
49 changes: 35 additions & 14 deletions trillian/ctfe/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ import (
)

var (
alignGetEntries = flag.Bool("align_getentries", true, "Enable get-entries request alignment")
alignGetEntries = flag.Bool("align_getentries", true, "Enable get-entries request alignment")
getEntriesMetrics = flag.Bool("getentries_metrics", false, "Export get-entries distribution metrics")
)

const (
Expand Down Expand Up @@ -106,19 +107,20 @@ const (
var (
// Metrics are all per-log (label "logid"), but may also be
// per-entrypoint (label "ep") or per-return-code (label "rc").
once sync.Once
knownLogs monitoring.Gauge // logid => value (always 1.0)
isMirrorLog monitoring.Gauge // logid => value (either 0.0 or 1.0)
maxMergeDelay monitoring.Gauge // logid => value
expMergeDelay monitoring.Gauge // logid => value
lastSCTTimestamp monitoring.Gauge // logid => value
lastSTHTimestamp monitoring.Gauge // logid => value
lastSTHTreeSize monitoring.Gauge // logid => value
frozenSTHTimestamp monitoring.Gauge // logid => value
reqsCounter monitoring.Counter // logid, ep => value
rspsCounter monitoring.Counter // logid, ep, rc => value
rspLatency monitoring.Histogram // logid, ep, rc => value
alignedGetEntries monitoring.Counter // logid, aligned => count
once sync.Once
knownLogs monitoring.Gauge // logid => value (always 1.0)
isMirrorLog monitoring.Gauge // logid => value (either 0.0 or 1.0)
maxMergeDelay monitoring.Gauge // logid => value
expMergeDelay monitoring.Gauge // logid => value
lastSCTTimestamp monitoring.Gauge // logid => value
lastSTHTimestamp monitoring.Gauge // logid => value
lastSTHTreeSize monitoring.Gauge // logid => value
frozenSTHTimestamp monitoring.Gauge // logid => value
reqsCounter monitoring.Counter // logid, ep => value
rspsCounter monitoring.Counter // logid, ep, rc => value
rspLatency monitoring.Histogram // logid, ep, rc => value
alignedGetEntries monitoring.Counter // logid, aligned => count
getEntriesStartPercentiles monitoring.Histogram // logid => percentile
)

// setupMetrics initializes all the exported metrics.
Expand All @@ -135,6 +137,12 @@ func setupMetrics(mf monitoring.MetricFactory) {
rspsCounter = mf.NewCounter("http_rsps", "Number of responses", "logid", "ep", "rc")
rspLatency = mf.NewHistogram("http_latency", "Latency of responses in seconds", "logid", "ep", "rc")
alignedGetEntries = mf.NewCounter("aligned_get_entries", "Number of get-entries requests which were aligned to size limit boundaries", "logid", "aligned")
getEntriesStartPercentiles = mf.NewHistogramWithBuckets(
"get_leaves_start_percentiles",
"Start index of GetLeavesByRange request using percentage of current log size at the time",
monitoring.PercentileBuckets(5),
"logid",
)
}

// Entrypoints is a list of entrypoint names as exposed in statistics/logging.
Expand Down Expand Up @@ -750,6 +758,10 @@ func getEntries(ctx context.Context, li *logInfo, w http.ResponseWriter, r *http
// explicitly here.
return http.StatusBadRequest, fmt.Errorf("need tree size: %d to get leaves but only got: %d", start+1, currentRoot.TreeSize)
}
if *getEntriesMetrics {
label := strconv.FormatInt(req.LogId, 10)
recordStartPercent(start, currentRoot.TreeSize, label)
}
// Do some sanity checks on the result.
if len(rsp.Leaves) > int(count) {
return http.StatusInternalServerError, fmt.Errorf("backend returned too many leaves: %d vs [%d,%d]", len(rsp.Leaves), start, end)
Expand Down Expand Up @@ -1122,3 +1134,12 @@ func (li *logInfo) toHTTPStatus(err error) int {
return http.StatusInternalServerError
}
}

// recordStartPercent works out what percentage of the current log size an index corresponds to,
// and records this to the getEntriesStartPercentiles histogram.
func recordStartPercent(leafIndex int64, treeSize uint64, labelVals ...string) {
if treeSize > 0 {
percent := float64(leafIndex) / float64(treeSize) * 100.0
getEntriesStartPercentiles.Observe(percent, labelVals...)
}
}