diff --git a/contrib/docker/docker-compose.prometheus.yml b/contrib/docker/docker-compose.prometheus.yml new file mode 100644 index 0000000000..28c1c69766 --- /dev/null +++ b/contrib/docker/docker-compose.prometheus.yml @@ -0,0 +1,11 @@ +services: + prometheus: + image: prom/prometheus:latest + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--web.enable-lifecycle' + ports: + - "9090:9090" + network_mode: host \ No newline at end of file diff --git a/contrib/docker/prometheus.yml b/contrib/docker/prometheus.yml new file mode 100644 index 0000000000..fc74e0080b --- /dev/null +++ b/contrib/docker/prometheus.yml @@ -0,0 +1,16 @@ +global: + scrape_interval: 15s + +scrape_configs: + - job_name: 'emissary-health' + metrics_path: /api/health + params: + format: [ "prometheus" ] + static_configs: + - targets: [ 'localhost:8001' ] + - job_name: 'emissary-metrics' + metrics_path: /api/metrics + params: + format: ["prometheus"] + static_configs: + - targets: ['localhost:8001'] diff --git a/pom.xml b/pom.xml index 27aec5d7b6..e17dc14038 100644 --- a/pom.xml +++ b/pom.xml @@ -83,8 +83,10 @@ 7.2 5.17.0 4.1.125.Final + 1.53.0 1.3.0 4.7.4 + 0.16.0 2.26.0 3.25.5 1.7.0 @@ -229,6 +231,41 @@ resilience4j-retry ${dep.resilience4j.version} + + io.opentelemetry + opentelemetry-api + ${dep.opentelemetry.version} + + + io.opentelemetry + opentelemetry-exporter-prometheus + ${dep.opentelemetry.version}-alpha + + + io.opentelemetry + opentelemetry-sdk + ${dep.opentelemetry.version} + + + io.opentelemetry + opentelemetry-sdk-metrics + ${dep.opentelemetry.version} + + + io.prometheus + simpleclient + ${dep.prometheus.version} + + + io.prometheus + simpleclient_common + ${dep.prometheus.version} + + + io.prometheus + simpleclient_dropwizard + ${dep.prometheus.version} + jakarta.annotation jakarta.annotation-api @@ -403,6 +440,13 @@ pom import + + io.opentelemetry + opentelemetry-bom + ${dep.opentelemetry.version} + pom + import + org.eclipse.jetty jetty-bom @@ -521,6 +565,21 @@ io.grpc grpc-stub + + io.prometheus + simpleclient + 0.16.0 + + + io.prometheus + simpleclient_common + 0.16.0 + + + io.prometheus + simpleclient_dropwizard + 0.16.0 + jakarta.annotation jakarta.annotation-api diff --git a/src/main/java/emissary/core/MetricsManager.java b/src/main/java/emissary/core/MetricsManager.java index 415b9ff92f..289a3174d2 100644 --- a/src/main/java/emissary/core/MetricsManager.java +++ b/src/main/java/emissary/core/MetricsManager.java @@ -16,6 +16,8 @@ import com.codahale.metrics.health.HealthCheckRegistry; import com.codahale.metrics.health.jvm.ThreadDeadlockHealthCheck; import com.codahale.metrics.jmx.JmxReporter; +import com.codahale.metrics.jvm.BufferPoolMetricSet; +import com.codahale.metrics.jvm.ClassLoadingGaugeSet; import com.codahale.metrics.jvm.FileDescriptorRatioGauge; import com.codahale.metrics.jvm.GarbageCollectorMetricSet; import com.codahale.metrics.jvm.MemoryUsageGaugeSet; @@ -24,12 +26,15 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.management.ManagementFactory; import java.net.InetSocketAddress; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.TimeUnit; +import static java.lang.management.ManagementFactory.getPlatformMBeanServer; + /** * Manages the interactions with CodaHale's Metrics package, including configuration */ @@ -127,6 +132,32 @@ protected void initMetrics() { this.metrics.registerAll(new GarbageCollectorMetricSet()); this.metrics.registerAll(new ThreadStatesGaugeSet()); this.metrics.register("file.descriptor.info", new FileDescriptorRatioGauge()); + + // Add additional JVM metrics for comprehensive monitoring if enabled + if (this.conf.findBooleanEntry("JVM_ADDITIONAL_METRICS_ENABLED", false)) { + logger.debug("Additional JVM Metrics are enabled"); + this.metrics.registerAll(new ClassLoadingGaugeSet()); + this.metrics.registerAll(new BufferPoolMetricSet(getPlatformMBeanServer())); + + // Add JVM uptime metric + this.metrics.register("jvm.uptime", new Gauge() { + @Override + public Long getValue() { + return ManagementFactory.getRuntimeMXBean().getUptime(); + } + }); + + // Add available processors metric + this.metrics.register("jvm.processors", new Gauge() { + @Override + public Integer getValue() { + return Runtime.getRuntime().availableProcessors(); + } + }); + } else { + logger.debug("Additional JVM Metrics are disabled"); + } + } else { logger.debug("JVM Metrics are disabled"); } diff --git a/src/main/java/emissary/grpc/retry/RetryHandler.java b/src/main/java/emissary/grpc/retry/RetryHandler.java index 33e71acfcf..8996f49b47 100644 --- a/src/main/java/emissary/grpc/retry/RetryHandler.java +++ b/src/main/java/emissary/grpc/retry/RetryHandler.java @@ -1,9 +1,13 @@ package emissary.grpc.retry; import emissary.config.Configurator; +import emissary.core.MetricsManager; +import emissary.core.NamespaceException; import emissary.grpc.exceptions.PoolException; import emissary.grpc.exceptions.ServiceNotAvailableException; +import com.codahale.metrics.Counter; +import com.google.common.base.VerifyException; import io.github.resilience4j.core.IntervalFunction; import io.github.resilience4j.retry.Retry; import io.github.resilience4j.retry.RetryConfig; @@ -13,6 +17,7 @@ import org.slf4j.LoggerFactory; import org.slf4j.event.Level; +import java.util.Locale; import java.util.function.Supplier; /** @@ -52,6 +57,7 @@ public final class RetryHandler { private final int maxAttempts; private final int numFailsBeforeWarn; private final Retry retry; + private final Counter grpcRetryCounter; /** * Constructs the retry policy for a given gRPC service. @@ -66,6 +72,14 @@ public RetryHandler(Configurator configG, String retryName) { maxAttempts = configG.findIntEntry(GRPC_RETRY_MAX_ATTEMPTS, 4); numFailsBeforeWarn = configG.findIntEntry(GRPC_RETRY_NUM_FAILS_BEFORE_WARN, 3); + // Initialize Dropwizard counter for gRPC retries + try { + grpcRetryCounter = MetricsManager.lookup().getMetricRegistry() + .counter("grpc.retries." + sanitizeMetricName(retryName)); + } catch (NamespaceException e) { + throw new VerifyException("Failed to initialize gRPC retry counter", e); + } + retry = Retry.of(internalName, RetryConfig.custom() .maxAttempts(maxAttempts) .intervalFunction(IntervalFunction.ofExponentialBackoff( @@ -79,7 +93,17 @@ public RetryHandler(Configurator configG, String retryName) { .onError(this::logMessageOnError); } + /** + * Sanitizes metric names to be compatible with Prometheus naming conventions + */ + private static String sanitizeMetricName(String name) { + return name.replaceAll("[^a-zA-Z0-9_]", "_").toLowerCase(Locale.ROOT); + } + private void logMessageOnRetry(RetryOnRetryEvent event) { + // Increment the Dropwizard counter for each retry attempt + grpcRetryCounter.inc(); + int attemptNumber = event.getNumberOfRetryAttempts(); Level level = attemptNumber <= numFailsBeforeWarn ? Level.INFO : Level.WARN; logger.atLevel(level).log("{} failed gRPC connection attempt #{} with event error: {}", internalName, attemptNumber, event); diff --git a/src/main/java/emissary/server/EmissaryServer.java b/src/main/java/emissary/server/EmissaryServer.java index 60c9de9fd7..e04ae517f0 100644 --- a/src/main/java/emissary/server/EmissaryServer.java +++ b/src/main/java/emissary/server/EmissaryServer.java @@ -627,7 +627,18 @@ private static ConstraintSecurityHandler buildSecurityHandler() { health.setPathSpec("/api/health"); health.setConstraint(noAuthConstraint); - handler.setConstraintMappings(new ConstraintMapping[] {mapping, health}); + // TODO: figure out how to allow Digest Authenticator to work with Prometheus + + ConstraintMapping metrics = new ConstraintMapping(); + metrics.setPathSpec("/api/metrics"); + metrics.setConstraint(noAuthConstraint); + + ConstraintMapping metricsPrometheus = new ConstraintMapping(); + metricsPrometheus.setPathSpec("/api/metrics?format=prometheus"); + metricsPrometheus.setConstraint(noAuthConstraint); + + handler.setConstraintMappings( + new ConstraintMapping[] {mapping, health, metrics, metricsPrometheus}); handler.setAuthenticator(new DigestAuthenticator()); return handler; } diff --git a/src/main/java/emissary/server/api/HealthCheckAction.java b/src/main/java/emissary/server/api/HealthCheckAction.java index b78fd80852..7949319149 100644 --- a/src/main/java/emissary/server/api/HealthCheckAction.java +++ b/src/main/java/emissary/server/api/HealthCheckAction.java @@ -3,14 +3,18 @@ import emissary.core.MetricsManager; import emissary.core.NamespaceException; +import com.codahale.metrics.health.HealthCheck; import jakarta.ws.rs.GET; import jakarta.ws.rs.Path; import jakarta.ws.rs.Produces; +import jakarta.ws.rs.QueryParam; import jakarta.ws.rs.core.MediaType; import jakarta.ws.rs.core.Response; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Map; + @Path("") // context is /api, set in EmissaryServer public class HealthCheckAction { @@ -20,13 +24,56 @@ public class HealthCheckAction { @GET @Path("/" + HEALTH) - @Produces(MediaType.APPLICATION_JSON) - public Response clusterAgents() { + @Produces({MediaType.APPLICATION_JSON, MediaType.TEXT_PLAIN}) + public Response healthCheck(@QueryParam("format") String format) { try { - return Response.ok().entity(MetricsManager.lookup().getHealthCheckRegistry().runHealthChecks()).build(); + Map results = MetricsManager.lookup().getHealthCheckRegistry().runHealthChecks(); + + if ("prometheus".equals(format)) { + return generatePrometheusResponse(results); + } else { + // Default to JSON format + return generateJsonResponse(results); + } } catch (NamespaceException ex) { logger.warn("Could not lookup MetricsManager", ex); - return Response.serverError().entity("Could not lookup MetricsManager").build(); + + if ("prometheus".equals(format)) { + return generatePrometheusErrorResponse(); + } else { + return generateJsonErrorResponse(); + } + } + } + + protected Response generateJsonResponse(Map results) { + return Response.ok().entity(results).build(); + } + + protected Response generateJsonErrorResponse() { + return Response.serverError().entity("Could not lookup MetricsManager").build(); + } + + private static Response generatePrometheusResponse(Map results) { + boolean allHealthy = results.values().stream().allMatch(HealthCheck.Result::isHealthy); + + StringBuilder metricsOutput = new StringBuilder(); + metricsOutput.append("# HELP emissary_health_status Health status of emissary service (1=healthy, 0=unhealthy)\n"); + metricsOutput.append("# TYPE emissary_health_status gauge\n"); + metricsOutput.append("emissary_health_status ").append(allHealthy ? "1" : "0").append("\n"); + + if (allHealthy) { + return Response.ok(metricsOutput.toString()).build(); + } else { + return Response.status(Response.Status.SERVICE_UNAVAILABLE).entity(metricsOutput.toString()).build(); } } + + private static Response generatePrometheusErrorResponse() { + StringBuilder errorMetrics = new StringBuilder(); + errorMetrics.append("# HELP emissary_health_status Health status of emissary service (1=healthy, 0=unhealthy)\n"); + errorMetrics.append("# TYPE emissary_health_status gauge\n"); + errorMetrics.append("emissary_health_status 0\n"); + return Response.status(Response.Status.SERVICE_UNAVAILABLE).entity(errorMetrics.toString()).build(); + } } diff --git a/src/main/java/emissary/server/api/MetricsAction.java b/src/main/java/emissary/server/api/MetricsAction.java index 9b92706b94..6d67a90a86 100644 --- a/src/main/java/emissary/server/api/MetricsAction.java +++ b/src/main/java/emissary/server/api/MetricsAction.java @@ -3,14 +3,20 @@ import emissary.core.MetricsManager; import emissary.core.NamespaceException; +import io.prometheus.client.CollectorRegistry; +import io.prometheus.client.dropwizard.DropwizardExports; +import io.prometheus.client.exporter.common.TextFormat; import jakarta.ws.rs.GET; import jakarta.ws.rs.Path; -import jakarta.ws.rs.Produces; +import jakarta.ws.rs.QueryParam; import jakarta.ws.rs.core.MediaType; import jakarta.ws.rs.core.Response; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.io.StringWriter; + @Path("") // context is /api, set in EmissaryServer public class MetricsAction { @@ -18,14 +24,33 @@ public class MetricsAction { @GET @Path("/metrics") - @Produces(MediaType.APPLICATION_JSON) - public Response clusterAgents() { + public Response metrics(@QueryParam("format") String format) { try { - - return Response.ok().entity(MetricsManager.lookup().getMetricRegistry()).build(); - } catch (NamespaceException ex) { - logger.warn("Could not lookup MetricsManager", ex); - return Response.serverError().entity("Could not lookup MetricsManager").build(); + if ("prometheus".equals(format)) { + return prometheusMetrics(); + } else { + return jsonMetrics(); + } + } catch (Exception e) { + logger.warn("Could not retrieve metrics", e); + return Response.serverError().entity("Could not retrieve metrics: " + e.getMessage()).build(); } } + + private static Response jsonMetrics() throws NamespaceException { + return Response.ok() + .entity(MetricsManager.lookup().getMetricRegistry()) + .type(MediaType.APPLICATION_JSON) + .build(); + } + + private static Response prometheusMetrics() throws NamespaceException, IOException { + CollectorRegistry registry = new CollectorRegistry(); + registry.register(new DropwizardExports(MetricsManager.lookup().getMetricRegistry())); + StringWriter writer = new StringWriter(); + TextFormat.writeFormat(TextFormat.CONTENT_TYPE_004, writer, registry.metricFamilySamples()); + return Response.ok(writer.toString()) + .type(TextFormat.CONTENT_TYPE_004) + .build(); + } } diff --git a/src/main/resources/emissary/core/MetricsManager.cfg b/src/main/resources/emissary/core/MetricsManager.cfg index fab7af2c88..ff8f2bf05c 100644 --- a/src/main/resources/emissary/core/MetricsManager.cfg +++ b/src/main/resources/emissary/core/MetricsManager.cfg @@ -1,4 +1,6 @@ # Placeholder for now +JVM_METRICS_ENABLED = true +JVM_ADDITIONAL_METRICS_ENABLED = true JMX_METRICS_ENABLED = true SLF4J_METRICS_ENABLED = false GRAPHITE_METRICS_ENABLED = false @@ -9,5 +11,3 @@ GRAPHITE_METRICS_ENABLED = false # If not specified or commented out, there are no limits #MAX_FILE_COUNT_BEFORE_UNHEALTHY = 2000 #MAX_AGGREGATE_FIZE_SIZE_BEFORE_UNHEALTHY_BYTES = 1000000000 - - diff --git a/src/test/java/emissary/server/api/EmissaryApiTest.java b/src/test/java/emissary/server/api/EmissaryApiTest.java index 05e009e848..b8f2ac273c 100644 --- a/src/test/java/emissary/server/api/EmissaryApiTest.java +++ b/src/test/java/emissary/server/api/EmissaryApiTest.java @@ -293,7 +293,7 @@ void metrics() { when(manager.getMetricRegistry()).thenReturn(registry); Namespace.bind("MetricsManager", manager); - try (Response response = metrics.clusterAgents()) { + try (Response response = metrics.metrics(null)) { assertEquals(Response.Status.OK.getStatusCode(), response.getStatus()); Map metricsMap = ((MetricRegistry) response.getEntity()).getMetrics(); assertEquals(100, ((Counter) metricsMap.get("testing")).getCount()); @@ -314,7 +314,7 @@ void healthcheck() { when(registry.runHealthChecks()).thenReturn(results); Namespace.bind("MetricsManager", manager); - try (Response response = health.clusterAgents()) { + try (Response response = health.generateJsonResponse(results)) { assertEquals(Response.Status.OK.getStatusCode(), response.getStatus()); assertTrue(response.getEntity().toString().contains("isHealthy=true")); assertTrue(response.getEntity().toString().contains("message=Okay"));