Export key rocksdb metrics via nodectrl /metrics endpoint
This exports key tickers, histograms, and properties from RocksDB through the /metrics endpoint. The exported metrics are reported in Prometheus exposition format.
It also adds another HTTP endpoint, `/rocksdb-stats`, that returns the raw RocksDB statistics output (useful in performance investigations).
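For example, the raw statistics dump can be fetched the same way the test plan below scrapes /metrics (a sketch; assumes the same nodectrl port as in the test plan, output elided):

```
> http localhost:5122/rocksdb-stats
```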

Note that RocksDB metric reporting is only triggered when the scraping endpoint is queried.


This requires changes in rust-rocksdb to support pulling individual tickers and histograms out of RocksDB; those changes are proposed in rust-rocksdb/rust-rocksdb#853.
Additionally, the rust-rocksdb interface doesn't make it easy to get memory usage information when using an optimistic transactional database (which we do); another change, rust-rocksdb/rust-rocksdb#854, adds access to that information.
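For reference, here is a minimal sketch of the fine-grained statistics access this relies on (based on the handler changes below; `get_ticker_count`, `get_histogram_data`, and the `HistogramData` accessors are the names proposed in those PRs, so treat the exact signatures as assumptions):

```rust
use rocksdb::statistics::{Histogram, Ticker};
use rocksdb::Options;

// Sketch only: reads one ticker and one histogram off the `Options` that had
// statistics enabled, mirroring what the /metrics handler below does per metric.
fn sample_rocksdb_stats(options: &Options) {
    // Tickers are monotonic counters.
    let bytes_read = options.get_ticker_count(Ticker::BytesRead);

    // Histograms come back pre-aggregated (quantiles, sum, count).
    let get_latency = options.get_histogram_data(Histogram::DbGet);

    println!(
        "bytes_read={} db_get_count={} db_get_sum_micros={}",
        bytes_read,
        get_latency.count(),
        get_latency.sum()
    );
}
```

Memory usage for the optimistic transaction DB goes through `rocksdb::perf::MemoryUsageBuilder` from #854; see `get_memory_usage_stats` in `handler.rs` below.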

Both changes are merged into the `next` branch of the restatedev fork at https://github.com/restatedev/rust-rocksdb/tree/next. I updated Cargo to use that branch until the changes are merged upstream.

Test Plan:

```
> http localhost:5122/metrics | head -n 10



# TYPE invoker_invocation_task_started_total counter
invoker_invocation_task_started_total{rpc_service="CheckoutProcess"} 12

# TYPE invoker_invocation_task_failed_total counter
invoker_invocation_task_failed_total{rpc_service="CheckoutProcess",transient="true"} 12

# TYPE invoker_inflight_invocations_total gauge
invoker_inflight_invocations_total{rpc_service="CheckoutProcess"} 0

# TYPE rocksdb_memtable_miss_total counter
rocksdb_memtable_miss_total 2066

# TYPE rocksdb_bytes_read_total counter
rocksdb_bytes_read_total 1736

# TYPE rocksdb_bytes_written_total counter
rocksdb_bytes_written_total 0

# TYPE rocksdb_db_get_seconds summary
rocksdb_db_get_seconds{quantile="0.5"} 0.000006297482837528604
rocksdb_db_get_seconds{quantile="0.95"} 0.000019823636363636356
rocksdb_db_get_seconds{quantile="0.99"} 0.00012004999999999938
rocksdb_db_get_seconds{quantile="1.0"} 0.000667
rocksdb_db_get_seconds_sum 0.022774
rocksdb_db_get_seconds_count 2066

# TYPE rocksdb_db_write_seconds summary
rocksdb_db_write_seconds{quantile="0.5"} 0
rocksdb_db_write_seconds{quantile="0.95"} 0
rocksdb_db_write_seconds{quantile="0.99"} 0
rocksdb_db_write_seconds{quantile="1.0"} 0
rocksdb_db_write_seconds_sum 0
rocksdb_db_write_seconds_count 0

# TYPE rocksdb_db_seek_seconds summary
rocksdb_db_seek_seconds{quantile="0.5"} 0.000007813397129186603
rocksdb_db_seek_seconds{quantile="0.95"} 0.00002298974358974355
rocksdb_db_seek_seconds{quantile="0.99"} 0.000049989444444444286
rocksdb_db_seek_seconds{quantile="1.0"} 0.000573
rocksdb_db_seek_seconds_sum 0.032024
rocksdb_db_seek_seconds_count 3107

# TYPE rocksdb_db_multiget_seconds summary
rocksdb_db_multiget_seconds{quantile="0.5"} 0
rocksdb_db_multiget_seconds{quantile="0.95"} 0
rocksdb_db_multiget_seconds{quantile="0.99"} 0
rocksdb_db_multiget_seconds{quantile="1.0"} 0
rocksdb_db_multiget_seconds_sum 0
rocksdb_db_multiget_seconds_count 0

# TYPE rocksdb_bytes_per_write_bytes summary
rocksdb_bytes_per_write_bytes{quantile="0.5"} 0
rocksdb_bytes_per_write_bytes{quantile="0.95"} 0
rocksdb_bytes_per_write_bytes{quantile="0.99"} 0
rocksdb_bytes_per_write_bytes{quantile="1.0"} 0
rocksdb_bytes_per_write_bytes_sum 0
rocksdb_bytes_per_write_bytes_count 0

# TYPE rocksdb_bytes_per_read_bytes summary
rocksdb_bytes_per_read_bytes{quantile="0.5"} 0.5048899755501223
rocksdb_bytes_per_read_bytes{quantile="0.95"} 0.9592909535452323
rocksdb_bytes_per_read_bytes{quantile="0.99"} 0.999682151589242
rocksdb_bytes_per_read_bytes{quantile="1.0"} 106
rocksdb_bytes_per_read_bytes_sum 1736
rocksdb_bytes_per_read_bytes_count 2065

# TYPE rocksdb_num_immutable_mem_table_count gauge
rocksdb_num_immutable_mem_table_count 0

# TYPE rocksdb_mem_table_flush_pending_count gauge
rocksdb_mem_table_flush_pending_count 0

# TYPE rocksdb_compaction_pending_count gauge
rocksdb_compaction_pending_count 0

# TYPE rocksdb_background_errors_count gauge
rocksdb_background_errors_count 0

# TYPE rocksdb_cur_size_active_mem_table_bytes gauge
rocksdb_cur_size_active_mem_table_bytes 2048

# TYPE rocksdb_cur_size_all_mem_tables_bytes gauge
rocksdb_cur_size_all_mem_tables_bytes 2048

# TYPE rocksdb_size_all_mem_tables_bytes gauge
rocksdb_size_all_mem_tables_bytes 2048

# TYPE rocksdb_num_entries_active_mem_table_count gauge
rocksdb_num_entries_active_mem_table_count 0

# TYPE rocksdb_num_entries_imm_mem_tables_count gauge
rocksdb_num_entries_imm_mem_tables_count 0

# TYPE rocksdb_num_deletes_active_mem_table_count gauge
rocksdb_num_deletes_active_mem_table_count 0

# TYPE rocksdb_num_deletes_imm_mem_tables_count gauge
rocksdb_num_deletes_imm_mem_tables_count 0

# TYPE rocksdb_estimate_num_keys_count gauge
rocksdb_estimate_num_keys_count 0

# TYPE rocksdb_estimate_table_readers_mem_bytes gauge
rocksdb_estimate_table_readers_mem_bytes 0

# TYPE rocksdb_num_live_versions_count gauge
rocksdb_num_live_versions_count 1

# TYPE rocksdb_estimate_live_data_size_bytes gauge
rocksdb_estimate_live_data_size_bytes 0

# TYPE rocksdb_min_log_number_to_keep_count gauge
rocksdb_min_log_number_to_keep_count 240

# TYPE rocksdb_live_sst_files_size_bytes gauge
rocksdb_live_sst_files_size_bytes 0

# TYPE rocksdb_estimate_pending_compaction_bytes_bytes gauge
rocksdb_estimate_pending_compaction_bytes_bytes 0

# TYPE rocksdb_num_running_compactions_count gauge
rocksdb_num_running_compactions_count 0

# TYPE rocksdb_actual_delayed_write_rate_count gauge
rocksdb_actual_delayed_write_rate_count 0

# TYPE rocksdb_block_cache_capacity_bytes gauge
rocksdb_block_cache_capacity_bytes 33554432

# TYPE rocksdb_block_cache_usage_bytes gauge
rocksdb_block_cache_usage_bytes 87

# TYPE rocksdb_block_cache_pinned_usage_bytes gauge
rocksdb_block_cache_pinned_usage_bytes 87

# TYPE rocksdb_num_files_at_level0_count gauge
rocksdb_num_files_at_level0_count 0

# TYPE rocksdb_num_files_at_level1_count gauge
rocksdb_num_files_at_level1_count 0

# TYPE rocksdb_num_files_at_level2_count gauge
rocksdb_num_files_at_level2_count 0

# TYPE rocksdb_memory_approximate_cache_bytes gauge
rocksdb_memory_approximate_cache_bytes 87

# TYPE rocksdb_memory_approx_memtable_bytes gauge
rocksdb_memory_approx_memtable_bytes 18432

# TYPE rocksdb_memory_approx_memtable_unflushed_bytes gauge
rocksdb_memory_approx_memtable_unflushed_bytes 18432

# TYPE rocksdb_memory_approx_memtable_readers_bytes gauge
rocksdb_memory_approx_memtable_readers_bytes 8863

```
AhmedSoliman committed Jan 17, 2024
1 parent 6d2469c commit 4713886
Showing 12 changed files with 611 additions and 95 deletions.
181 changes: 110 additions & 71 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Cargo.toml
@@ -106,8 +106,8 @@ prost-build = "0.12.1"
prost-reflect = "0.12.0"
prost-types = "0.12.1"
rand = "0.8.5"
-# we need to use RocksDB >= 8.9.1 because it fixes https://github.com/facebook/rocksdb/pull/11680
-rocksdb = { git = "https://github.com/rust-rocksdb/rust-rocksdb.git", rev = "66f04df013b6e6bd42b5a8c353406e09a7c7da2a" }
+# we need to use RocksDB with fine-grained statistics access. We use this branch until proposed changes are merged upstream
+rocksdb = { git = "https://github.com/restatedev/rust-rocksdb.git", branch = "next"}
rustls = "0.21.6"
schemars = { version = "0.8", features = ["bytes"] }
serde = { version = "1.0", features = ["derive"] }
4 changes: 3 additions & 1 deletion crates/node-ctrl/Cargo.toml
@@ -14,9 +14,10 @@ options_schema = ["dep:schemars"]
[dependencies]
restate-errors = { workspace = true }
restate-node-ctrl-proto = { workspace = true }
restate-storage-rocksdb = { workspace = true }

axum = { workspace = true }
async-trait = { workspace = true }
axum = { workspace = true }
codederror = { workspace = true }
derive_builder = { workspace = true }
drain = { workspace = true }
@@ -28,6 +29,7 @@ metrics = { workspace = true }
metrics-exporter-prometheus = { version = "0.13", default-features = false, features = ["async-runtime"] }
metrics-tracing-context = { version = "0.15.0" }
metrics-util = { version = "0.16.0" }
rocksdb = { workspace = true }
schemars = { workspace = true, optional = true }
serde = { workspace = true }
serde_json = { workspace = true }
243 changes: 232 additions & 11 deletions crates/node-ctrl/src/handler.rs
@@ -8,29 +8,227 @@
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0.

use std::fmt::Write;
use std::iter::once;

use axum::extract::State;
use axum::response::IntoResponse;
use metrics_exporter_prometheus::formatting;
use restate_storage_rocksdb::{TableKind, DB};
use rocksdb::statistics::{Histogram, Ticker};
use rocksdb::AsColumnFamilyRef;
use tonic::{Request, Response, Status};

use restate_node_ctrl_proto::proto::node_ctrl_server::NodeCtrl;
use restate_node_ctrl_proto::proto::{IdentResponse, NodeStatus};

use crate::prometheus_helpers::{
format_rocksdb_histogram_for_prometheus, format_rocksdb_property_for_prometheus,
format_rocksdb_stat_ticker_for_prometheus, MetricUnit,
};
use crate::state::HandlerState;

static ROCKSDB_TICKERS: &[Ticker] = &[
Ticker::BlockCacheDataBytesInsert,
Ticker::BlockCacheDataHit,
Ticker::BlockCacheDataMiss,
Ticker::BloomFilterUseful,
Ticker::BytesRead,
Ticker::BytesWritten,
Ticker::CompactReadBytes,
Ticker::CompactWriteBytes,
Ticker::FlushWriteBytes,
Ticker::MemtableHit,
Ticker::MemtableMiss,
Ticker::NoIteratorCreated,
Ticker::NoIteratorDeleted,
Ticker::NumberKeysRead,
Ticker::NumberKeysUpdated,
Ticker::NumberKeysWritten,
Ticker::StallMicros,
Ticker::WalFileBytes,
Ticker::WalFileSynced,
Ticker::WriteWithWal,
];

static ROCKSDB_HISTOGRAMS: &[(Histogram, &str, MetricUnit)] = &[
(Histogram::DbGet, "rocksdb.db.get", MetricUnit::Micros),
(
Histogram::DbMultiget,
"rocksdb.db.multiget",
MetricUnit::Micros,
),
(Histogram::DbWrite, "rocksdb.db.write", MetricUnit::Micros),
(Histogram::DbSeek, "rocksdb.db.seek", MetricUnit::Micros),
(Histogram::FlushTime, "rocksdb.db.flush", MetricUnit::Micros),
(
Histogram::WalFileSyncMicros,
"rocksdb.wal.file.sync",
MetricUnit::Micros,
),
(
Histogram::CompactionTime,
"rocksdb.compaction.times",
MetricUnit::Micros,
),
(
Histogram::SstBatchSize,
Histogram::SstBatchSize.name(),
MetricUnit::Bytes,
),
(
Histogram::BytesPerWrite,
Histogram::BytesPerWrite.name(),
MetricUnit::Bytes,
),
(
Histogram::BytesPerRead,
Histogram::BytesPerRead.name(),
MetricUnit::Bytes,
),
(
Histogram::BytesPerMultiget,
Histogram::BytesPerMultiget.name(),
MetricUnit::Bytes,
),
];

// Per column-family properties
static ROCKSDB_PROPERTIES: &[(&str, MetricUnit)] = &[
("rocksdb.num-immutable-mem-table", MetricUnit::Count),
("rocksdb.mem-table-flush-pending", MetricUnit::Count),
("rocksdb.compaction-pending", MetricUnit::Count),
("rocksdb.background-errors", MetricUnit::Count),
("rocksdb.cur-size-active-mem-table", MetricUnit::Bytes),
("rocksdb.cur-size-all-mem-tables", MetricUnit::Bytes),
("rocksdb.size-all-mem-tables", MetricUnit::Bytes),
("rocksdb.num-entries-active-mem-table", MetricUnit::Count),
("rocksdb.num-entries-imm-mem-tables", MetricUnit::Count),
("rocksdb.num-deletes-active-mem-table", MetricUnit::Count),
("rocksdb.num-deletes-imm-mem-tables", MetricUnit::Count),
("rocksdb.estimate-num-keys", MetricUnit::Count),
("rocksdb.estimate-table-readers-mem", MetricUnit::Bytes),
("rocksdb.num-live-versions", MetricUnit::Count),
("rocksdb.estimate-live-data-size", MetricUnit::Bytes),
("rocksdb.min-log-number-to-keep", MetricUnit::Count),
("rocksdb.live-sst-files-size", MetricUnit::Bytes),
(
"rocksdb.estimate-pending-compaction-bytes",
MetricUnit::Bytes,
),
("rocksdb.num-running-compactions", MetricUnit::Count),
("rocksdb.actual-delayed-write-rate", MetricUnit::Count),
("rocksdb.block-cache-capacity", MetricUnit::Bytes),
("rocksdb.block-cache-usage", MetricUnit::Bytes),
("rocksdb.block-cache-pinned-usage", MetricUnit::Bytes),
("rocksdb.num-files-at-level0", MetricUnit::Count),
("rocksdb.num-files-at-level1", MetricUnit::Count),
// Add more as needed.
("rocksdb.num-files-at-level2", MetricUnit::Count),
];

// -- Direct HTTP Handlers --
-pub async fn render_metrics(State(state): State<HandlerState>) -> (http::StatusCode, String) {
+pub async fn render_metrics(State(state): State<HandlerState>) -> String {
+let mut out = String::new();

// Response content type is plain/text and that's expected.
if let Some(prometheus_handle) = state.prometheus_handle {
-(http::StatusCode::OK, prometheus_handle.render())
-} else {
-// We want to fail scraping to prevent silent failures.
-(
-// We respond with 422 since this is technically not a server error.
-// We indicate that that the request is valid but cannot process this
-// request due to semantic errors (i.e. not enabled in this case).
-http::StatusCode::UNPROCESSABLE_ENTITY,
-"Prometheus metric collection is not enabled.".to_string(),
-)
+// Internal system metrics
+let _ = write!(&mut out, "{}", prometheus_handle.render());
}

// Load metrics from rocksdb (if the node runs rocksdb, and rocksdb
// stat collection is enabled)
let Some(db) = state.rocksdb_storage else {
return out;
};

let raw_db = db.inner();
let options = db.options();
// Tickers (Counters)
for ticker in ROCKSDB_TICKERS {
format_rocksdb_stat_ticker_for_prometheus(&mut out, &options, *ticker);
}
// Histograms
for (histogram, name, unit) in ROCKSDB_HISTOGRAMS {
format_rocksdb_histogram_for_prometheus(
&mut out,
name,
options.get_histogram_data(*histogram),
*unit,
);
}

// Properties (Gauges)
// For properties, we need to get them for each column family.
let all_cfs = TableKind::all()
.map(TableKind::cf_name)
// Include the default column family
.chain(once("default"));

for cf in all_cfs {
let Some(cf_handle) = raw_db.cf_handle(cf) else {
continue;
};
let sanitized_cf_name = formatting::sanitize_label_value(cf);
let labels = [format!("cf=\"{}\"", sanitized_cf_name)];
for (property, unit) in ROCKSDB_PROPERTIES {
format_rocksdb_property_for_prometheus(
&mut out,
&labels,
*unit,
property,
get_property(&raw_db, &cf_handle, property),
);
}
}

// Memory Usage Stats (Gauges)
let cache = db.cache();
let memory_usage =
get_memory_usage_stats(Some(&[&raw_db]), cache).expect("get_memory_usage_stats");

format_rocksdb_property_for_prometheus(
&mut out,
&[],
MetricUnit::Bytes,
"rocksdb.memory.approximate-cache",
memory_usage.approximate_cache_total(),
);

format_rocksdb_property_for_prometheus(
&mut out,
&[],
MetricUnit::Bytes,
"rocksdb.memory.approx-memtable",
memory_usage.approximate_mem_table_total(),
);

format_rocksdb_property_for_prometheus(
&mut out,
&[],
MetricUnit::Bytes,
"rocksdb.memory.approx-memtable-unflushed",
memory_usage.approximate_mem_table_unflushed(),
);

format_rocksdb_property_for_prometheus(
&mut out,
&[],
MetricUnit::Bytes,
"rocksdb.memory.approx-memtable-readers",
memory_usage.approximate_mem_table_readers_total(),
);
out
}

pub async fn rocksdb_stats(State(state): State<HandlerState>) -> impl IntoResponse {
let Some(db) = state.rocksdb_storage else {
return String::new();
};

let options = db.options();
options.get_statistics().unwrap_or_default()
}

// -- GRPC Service Handlers --
@@ -54,3 +252,26 @@ impl NodeCtrl for Handler {
}));
}
}

// -- Local Helpers
#[inline]
fn get_property(db: &DB, cf_handle: &impl AsColumnFamilyRef, name: &str) -> u64 {
db.property_int_value_cf(cf_handle, name)
.unwrap_or_default()
.unwrap_or_default()
}

fn get_memory_usage_stats(
dbs: Option<&[&DB]>,
cache: Option<rocksdb::Cache>,
) -> Result<rocksdb::perf::MemoryUsage, rocksdb::Error> {
let mut builder = rocksdb::perf::MemoryUsageBuilder::new()?;
if let Some(dbs_) = dbs {
dbs_.iter().for_each(|db| builder.add_db(db));
}
if let Some(cache) = cache {
builder.add_cache(&cache);
}

builder.build()
}
1 change: 1 addition & 0 deletions crates/node-ctrl/src/lib.rs
@@ -12,6 +12,7 @@ mod handler;
mod metrics;
mod multiplex;
mod options;
mod prometheus_helpers;
pub mod service;
mod state;

5 changes: 3 additions & 2 deletions crates/node-ctrl/src/options.rs
@@ -10,6 +10,7 @@

use std::net::SocketAddr;

use restate_storage_rocksdb::RocksDBStorage;
use serde_with::serde_as;

use crate::service::NodeCtrlService;
@@ -47,7 +48,7 @@ impl Default for Options {
}

impl Options {
-pub fn build(self) -> NodeCtrlService {
-NodeCtrlService::new(self)
+pub fn build(self, rocksdb_storage: Option<RocksDBStorage>) -> NodeCtrlService {
+NodeCtrlService::new(self, rocksdb_storage)
}
}
