From 1e99e8b9dc48b23fca3b1801c66f30cf17766825 Mon Sep 17 00:00:00 2001 From: Andrew Fields Date: Wed, 5 Jul 2023 06:25:19 -0500 Subject: [PATCH] Add Prometheus endpoint (#3456) Add a server for serving Prometheus metrics. Include a configuration block in the config file. Provide HTTP metrics on the API, along with process-level metrics and DB pool metrics. --- Cargo.lock | 50 +++++++++++ Cargo.toml | 5 +- config/defaults.hjson | 4 + crates/utils/src/settings/structs.rs | 17 ++++ docker/Dockerfile | 7 +- docker/docker-compose.yml | 5 ++ src/lib.rs | 30 ++++++- src/prometheus_metrics.rs | 120 +++++++++++++++++++++++++++ 8 files changed, 232 insertions(+), 6 deletions(-) create mode 100644 src/prometheus_metrics.rs diff --git a/Cargo.lock b/Cargo.lock index 81a6ffb6b9..4ef78c1000 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -317,6 +317,18 @@ dependencies = [ "syn 1.0.103", ] +[[package]] +name = "actix-web-prom" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9df3127d20a5d01c9fc9aceb969a38d31a6767e1b48a54d55a8f56c769a84923" +dependencies = [ + "actix-web", + "futures-core", + "pin-project-lite", + "prometheus", +] + [[package]] name = "addr2line" version = "0.19.0" @@ -2765,6 +2777,7 @@ dependencies = [ "activitypub_federation", "actix-cors", "actix-web", + "actix-web-prom", "chrono", "clokwerk", "console-subscriber", @@ -2782,6 +2795,7 @@ dependencies = [ "opentelemetry 0.17.0", "opentelemetry-otlp 0.10.0", "pict-rs", + "prometheus", "reqwest", "reqwest-middleware", "reqwest-tracing", @@ -4052,6 +4066,36 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1de8dacb0873f77e6aefc6d71e044761fcc68060290f5b1089fcdf84626bb69" +dependencies = [ + "bitflags 1.3.2", + "byteorder", + "hex", + "lazy_static", + "rustix", +] + +[[package]] +name = "prometheus" +version = "0.13.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "449811d15fbdf5ceb5c1144416066429cf82316e2ec8ce0c1f6f8a02e7bbcf8c" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "libc", + "memchr", + "parking_lot 0.12.1", + "procfs", + "protobuf", + "thiserror", +] + [[package]] name = "prost" version = "0.9.0" @@ -4138,6 +4182,12 @@ dependencies = [ "prost 0.11.0", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + [[package]] name = "psm" version = "0.1.21" diff --git a/Cargo.toml b/Cargo.toml index 311f5d7df4..72e30e38fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ lto = "thin" embed-pictrs = ["pict-rs"] console = ["console-subscriber", "opentelemetry", "opentelemetry-otlp", "tracing-opentelemetry", "reqwest-tracing/opentelemetry_0_16"] json-log = ["tracing-subscriber/json"] +prometheus-metrics = ["prometheus", "actix-web-prom"] default = [] [workspace] @@ -143,4 +144,6 @@ rustls = { workspace = true } futures-util = { workspace = true } tokio-postgres = { workspace = true } tokio-postgres-rustls = { workspace = true } -chrono = { workspace = true } \ No newline at end of file +chrono = { workspace = true } +prometheus = { version = "0.13.3", features = ["process"], optional = true } +actix-web-prom = { version = "0.6.0", optional = true } \ No newline at end of file diff --git a/config/defaults.hjson b/config/defaults.hjson index 6032f8fc9a..1e5597a117 100644 --- a/config/defaults.hjson +++ b/config/defaults.hjson @@ -80,4 +80,8 @@ worker_count: 0 # The number of activitypub federation retry workers that can be in-flight concurrently retry_count: 0 + prometheus: { + bind: "127.0.0.1" + port: 10002 + } } diff --git a/crates/utils/src/settings/structs.rs b/crates/utils/src/settings/structs.rs index 5d0e642f6a..16a27b93a6 100644 --- a/crates/utils/src/settings/structs.rs +++ 
b/crates/utils/src/settings/structs.rs @@ -45,6 +45,10 @@ pub struct Settings { /// The number of activitypub federation retry workers that can be in-flight concurrently #[default(0)] pub retry_count: usize, + // Prometheus configuration. + #[default(None)] + #[doku(example = "Some(Default::default())")] + pub prometheus: Option<PrometheusConfig>, } #[derive(Debug, Deserialize, Serialize, Clone, SmartDefault, Document)] @@ -157,3 +161,16 @@ pub struct SetupConfig { #[default(None)] pub admin_email: Option<String>, } + +#[derive(Debug, Deserialize, Serialize, Clone, SmartDefault, Document)] +#[serde(deny_unknown_fields)] +pub struct PrometheusConfig { + // Address that the Prometheus metrics will be served on. + #[default(Some(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1))))] + #[doku(example = "127.0.0.1")] + pub bind: Option<IpAddr>, + // Port that the Prometheus metrics will be served on. + #[default(Some(10002))] + #[doku(example = "10002")] + pub port: Option<i32>, +} diff --git a/docker/Dockerfile b/docker/Dockerfile index 0105275154..e81d9d0c27 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,6 +2,9 @@ FROM clux/muslrust:1.70.0 as builder WORKDIR /app ARG CARGO_BUILD_TARGET=x86_64-unknown-linux-musl +# comma-separated list of features to enable +ARG CARGO_BUILD_FEATURES=default + # This can be set to release using --build-arg ARG RUST_RELEASE_MODE="debug" @@ -13,7 +16,7 @@ COPY . . 
RUN --mount=type=cache,target=/app/target \ if [ "$RUST_RELEASE_MODE" = "debug" ] ; then \ echo "pub const VERSION: &str = \"$(git describe --tag)\";" > "crates/utils/src/version.rs" \ - && cargo build --target ${CARGO_BUILD_TARGET} \ + && cargo build --target ${CARGO_BUILD_TARGET} --features ${CARGO_BUILD_FEATURES} \ && cp ./target/$CARGO_BUILD_TARGET/$RUST_RELEASE_MODE/lemmy_server /app/lemmy_server; \ fi @@ -21,7 +24,7 @@ RUN --mount=type=cache,target=/app/target \ RUN \ if [ "$RUST_RELEASE_MODE" = "release" ] ; then \ echo "pub const VERSION: &str = \"$(git describe --tag)\";" > "crates/utils/src/version.rs" \ - && cargo build --target ${CARGO_BUILD_TARGET} --release \ + && cargo build --target ${CARGO_BUILD_TARGET} --features ${CARGO_BUILD_FEATURES} --release \ && cp ./target/$CARGO_BUILD_TARGET/$RUST_RELEASE_MODE/lemmy_server /app/lemmy_server; \ fi diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index f9522e9065..ab4f0ca485 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -32,12 +32,17 @@ services: dockerfile: docker/Dockerfile # args: # RUST_RELEASE_MODE: release + # CARGO_BUILD_FEATURES: default # this hostname is used in nginx reverse proxy and also for lemmy ui to connect to the backend, do not change hostname: lemmy restart: always environment: - RUST_LOG="warn,lemmy_server=debug,lemmy_api=debug,lemmy_api_common=debug,lemmy_api_crud=debug,lemmy_apub=debug,lemmy_db_schema=debug,lemmy_db_views=debug,lemmy_db_views_actor=debug,lemmy_db_views_moderator=debug,lemmy_routes=debug,lemmy_utils=debug,lemmy_websocket=debug" - RUST_BACKTRACE=full + ports: + # prometheus metrics available at the path /metrics on port 10002 by default + # enable prometheus metrics by setting the CARGO_BUILD_FEATURES build arg above to "prometheus-metrics" + - "10002:10002" volumes: - ./lemmy.hjson:/config/config.hjson:Z depends_on: diff --git a/src/lib.rs b/src/lib.rs index f84842fe07..ce62d0d311 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ 
-1,5 +1,7 @@ pub mod api_routes_http; pub mod code_migrations; +#[cfg(feature = "prometheus-metrics")] +pub mod prometheus_metrics; pub mod root_span_builder; pub mod scheduled_tasks; #[cfg(feature = "console")] @@ -35,6 +37,12 @@ use tracing_error::ErrorLayer; use tracing_log::LogTracer; use tracing_subscriber::{filter::Targets, layer::SubscriberExt, Layer, Registry}; use url::Url; +#[cfg(feature = "prometheus-metrics")] +use { + actix_web_prom::PrometheusMetricsBuilder, + prometheus::default_registry, + prometheus_metrics::serve_prometheus, +}; /// Max timeout for http requests pub(crate) const REQWEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -119,6 +127,9 @@ pub async fn start_lemmy_server() -> Result<(), LemmyError> { }); } + #[cfg(feature = "prometheus-metrics")] + serve_prometheus(settings.prometheus.as_ref(), context.clone()); + let settings_bind = settings.clone(); let federation_config = FederationConfig::builder() @@ -134,6 +145,14 @@ pub async fn start_lemmy_server() -> Result<(), LemmyError> { .build() .await?; + // this must come before the HttpServer creation + // creates a middleware that populates http metrics for each path, method, and status code + #[cfg(feature = "prometheus-metrics")] + let prom_api_metrics = PrometheusMetricsBuilder::new("lemmy_api") + .registry(default_registry().clone()) + .build() + .unwrap(); + // Create Http server with websocket support HttpServer::new(move || { let cors_config = if cfg!(debug_assertions) { @@ -145,7 +164,7 @@ pub async fn start_lemmy_server() -> Result<(), LemmyError> { .allowed_origin(&settings.get_protocol_and_hostname()) }; - App::new() + let app = App::new() .wrap(middleware::Logger::new( // This is the default log format save for the usage of %{r}a over %a to guarantee to record the client's (forwarded) IP and not the last peer address, since the latter is frequently just a reverse proxy "%{r}a '%r' %s %b '%{Referer}i' '%{User-Agent}i' %T", @@ -155,8 +174,13 @@ pub async fn 
start_lemmy_server() -> Result<(), LemmyError> { .wrap(TracingLogger::<QuieterRootSpanBuilder>::new()) .app_data(Data::new(context.clone())) .app_data(Data::new(rate_limit_cell.clone())) - .wrap(FederationMiddleware::new(federation_config.clone())) - // The routes + .wrap(FederationMiddleware::new(federation_config.clone())); + + #[cfg(feature = "prometheus-metrics")] + let app = app.wrap(prom_api_metrics.clone()); + + // The routes + app .configure(|cfg| api_routes_http::config(cfg, rate_limit_cell)) .configure(|cfg| { if federation_enabled { diff --git a/src/prometheus_metrics.rs b/src/prometheus_metrics.rs new file mode 100644 index 0000000000..1ff47a54ba --- /dev/null +++ b/src/prometheus_metrics.rs @@ -0,0 +1,120 @@ +use actix_web::{rt::System, web, App, HttpResponse, HttpServer, Responder}; +use lemmy_api_common::context::LemmyContext; +use lemmy_utils::settings::structs::PrometheusConfig; +use prometheus::{default_registry, Encoder, Gauge, Opts, TextEncoder}; +use std::{ + net::{IpAddr, Ipv4Addr}, + sync::Arc, + thread, +}; + +struct PromContext { + lemmy: LemmyContext, + db_pool_metrics: DbPoolMetrics, +} + +struct DbPoolMetrics { + max_size: Gauge, + size: Gauge, + available: Gauge, +} + +static DEFAULT_BIND: IpAddr = IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)); +static DEFAULT_PORT: i32 = 10002; + +pub fn serve_prometheus(config: Option<&PrometheusConfig>, lemmy_context: LemmyContext) { + let context = Arc::new(PromContext { + lemmy: lemmy_context, + db_pool_metrics: create_db_pool_metrics(), + }); + + let (bind, port) = match config { + Some(config) => ( + config.bind.unwrap_or(DEFAULT_BIND), + config.port.unwrap_or(DEFAULT_PORT), + ), + None => (DEFAULT_BIND, DEFAULT_PORT), + }; + + // spawn thread that blocks on handling requests + // only mapping /metrics to a handler + thread::spawn(move || { + let sys = System::new(); + sys.block_on(async { + let server = HttpServer::new(move || { + App::new() + .app_data(web::Data::new(Arc::clone(&context))) + .route("/metrics", 
web::get().to(metrics)) + }) + .bind((bind, port as u16)) + .expect(&format!("Cannot bind to {}:{}", bind, port)) + .run(); + + if let Err(err) = server.await { + eprintln!("Prometheus server error: {}", err); + } + }) + }); +} + +// handler for the /metrics path +async fn metrics(context: web::Data<Arc<PromContext>>) -> impl Responder { + // collect metrics + collect_db_pool_metrics(&context).await; + + let mut buffer = Vec::new(); + let encoder = TextEncoder::new(); + + // gather metrics from registry and encode in prometheus format + let metric_families = prometheus::gather(); + encoder.encode(&metric_families, &mut buffer).unwrap(); + let output = String::from_utf8(buffer).unwrap(); + + HttpResponse::Ok().body(output) +} + +// create lemmy_db_pool_* metrics and register them with the default registry +fn create_db_pool_metrics() -> DbPoolMetrics { + let metrics = DbPoolMetrics { + max_size: Gauge::with_opts(Opts::new( + "lemmy_db_pool_max_connections", + "Maximum number of connections in the pool", + )) + .unwrap(), + size: Gauge::with_opts(Opts::new( + "lemmy_db_pool_connections", + "Current number of connections in the pool", + )) + .unwrap(), + available: Gauge::with_opts(Opts::new( + "lemmy_db_pool_available_connections", + "Number of available connections in the pool", + )) + .unwrap(), + }; + + default_registry() + .register(Box::new(metrics.max_size.clone())) + .unwrap(); + default_registry() + .register(Box::new(metrics.size.clone())) + .unwrap(); + default_registry() + .register(Box::new(metrics.available.clone())) + .unwrap(); + + return metrics; +} + +async fn collect_db_pool_metrics(context: &PromContext) { + let pool_status = context.lemmy.pool().status(); + context + .db_pool_metrics + .max_size + .set(pool_status.max_size as f64); + context.db_pool_metrics.size.set(pool_status.size as f64); + context + .db_pool_metrics + .available + .set(pool_status.available as f64); +}