diff --git a/contrib/docker/docker-compose.prometheus.yml b/contrib/docker/docker-compose.prometheus.yml
new file mode 100644
index 0000000000..28c1c69766
--- /dev/null
+++ b/contrib/docker/docker-compose.prometheus.yml
@@ -0,0 +1,11 @@
+services:
+ prometheus:
+ image: prom/prometheus:latest
+ volumes:
+ - ./prometheus.yml:/etc/prometheus/prometheus.yml
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--web.enable-lifecycle'
+ ports:
+ - "9090:9090"
+ network_mode: host
\ No newline at end of file
diff --git a/contrib/docker/prometheus.yml b/contrib/docker/prometheus.yml
new file mode 100644
index 0000000000..fc74e0080b
--- /dev/null
+++ b/contrib/docker/prometheus.yml
@@ -0,0 +1,16 @@
+global:
+ scrape_interval: 15s
+
+scrape_configs:
+ - job_name: 'emissary-health'
+ metrics_path: /api/health
+ params:
+ format: [ "prometheus" ]
+ static_configs:
+ - targets: [ 'localhost:8001' ]
+ - job_name: 'emissary-metrics'
+ metrics_path: /api/metrics
+ params:
+ format: ["prometheus"]
+ static_configs:
+ - targets: ['localhost:8001']
diff --git a/pom.xml b/pom.xml
index 27aec5d7b6..e17dc14038 100644
--- a/pom.xml
+++ b/pom.xml
@@ -83,8 +83,10 @@
7.2
5.17.0
4.1.125.Final
+ 1.53.0
1.3.0
4.7.4
+ 0.16.0
2.26.0
3.25.5
1.7.0
@@ -229,6 +231,41 @@
resilience4j-retry
${dep.resilience4j.version}
+
+ io.opentelemetry
+ opentelemetry-api
+ ${dep.opentelemetry.version}
+
+
+ io.opentelemetry
+ opentelemetry-exporter-prometheus
+ ${dep.opentelemetry.version}-alpha
+
+
+ io.opentelemetry
+ opentelemetry-sdk
+ ${dep.opentelemetry.version}
+
+
+ io.opentelemetry
+ opentelemetry-sdk-metrics
+ ${dep.opentelemetry.version}
+
+
+ io.prometheus
+ simpleclient
+ ${dep.prometheus.version}
+
+
+ io.prometheus
+ simpleclient_common
+ ${dep.prometheus.version}
+
+
+ io.prometheus
+ simpleclient_dropwizard
+ ${dep.prometheus.version}
+
jakarta.annotation
jakarta.annotation-api
@@ -403,6 +440,13 @@
pom
import
+
+ io.opentelemetry
+ opentelemetry-bom
+ ${dep.opentelemetry.version}
+ pom
+ import
+
org.eclipse.jetty
jetty-bom
@@ -521,6 +565,21 @@
io.grpc
grpc-stub
+
+ io.prometheus
+ simpleclient
+ 0.16.0
+
+
+ io.prometheus
+ simpleclient_common
+ 0.16.0
+
+
+ io.prometheus
+ simpleclient_dropwizard
+ 0.16.0
+
jakarta.annotation
jakarta.annotation-api
diff --git a/src/main/java/emissary/core/MetricsManager.java b/src/main/java/emissary/core/MetricsManager.java
index 415b9ff92f..289a3174d2 100644
--- a/src/main/java/emissary/core/MetricsManager.java
+++ b/src/main/java/emissary/core/MetricsManager.java
@@ -16,6 +16,8 @@
import com.codahale.metrics.health.HealthCheckRegistry;
import com.codahale.metrics.health.jvm.ThreadDeadlockHealthCheck;
import com.codahale.metrics.jmx.JmxReporter;
+import com.codahale.metrics.jvm.BufferPoolMetricSet;
+import com.codahale.metrics.jvm.ClassLoadingGaugeSet;
import com.codahale.metrics.jvm.FileDescriptorRatioGauge;
import com.codahale.metrics.jvm.GarbageCollectorMetricSet;
import com.codahale.metrics.jvm.MemoryUsageGaugeSet;
@@ -24,12 +26,15 @@
import org.slf4j.LoggerFactory;
import java.io.IOException;
+import java.lang.management.ManagementFactory;
import java.net.InetSocketAddress;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
+import static java.lang.management.ManagementFactory.getPlatformMBeanServer;
+
/**
* Manages the interactions with CodaHale's Metrics package, including configuration
*/
@@ -127,6 +132,32 @@ protected void initMetrics() {
this.metrics.registerAll(new GarbageCollectorMetricSet());
this.metrics.registerAll(new ThreadStatesGaugeSet());
this.metrics.register("file.descriptor.info", new FileDescriptorRatioGauge());
+
+ // Add additional JVM metrics for comprehensive monitoring if enabled
+ if (this.conf.findBooleanEntry("JVM_ADDITIONAL_METRICS_ENABLED", false)) {
+ logger.debug("Additional JVM Metrics are enabled");
+ this.metrics.registerAll(new ClassLoadingGaugeSet());
+ this.metrics.registerAll(new BufferPoolMetricSet(getPlatformMBeanServer()));
+
+ // Add JVM uptime metric
+ this.metrics.register("jvm.uptime", new Gauge() {
+ @Override
+ public Long getValue() {
+ return ManagementFactory.getRuntimeMXBean().getUptime();
+ }
+ });
+
+ // Add available processors metric
+ this.metrics.register("jvm.processors", new Gauge() {
+ @Override
+ public Integer getValue() {
+ return Runtime.getRuntime().availableProcessors();
+ }
+ });
+ } else {
+ logger.debug("Additional JVM Metrics are disabled");
+ }
+
} else {
logger.debug("JVM Metrics are disabled");
}
diff --git a/src/main/java/emissary/grpc/retry/RetryHandler.java b/src/main/java/emissary/grpc/retry/RetryHandler.java
index 33e71acfcf..8996f49b47 100644
--- a/src/main/java/emissary/grpc/retry/RetryHandler.java
+++ b/src/main/java/emissary/grpc/retry/RetryHandler.java
@@ -1,9 +1,13 @@
package emissary.grpc.retry;
import emissary.config.Configurator;
+import emissary.core.MetricsManager;
+import emissary.core.NamespaceException;
import emissary.grpc.exceptions.PoolException;
import emissary.grpc.exceptions.ServiceNotAvailableException;
+import com.codahale.metrics.Counter;
+import com.google.common.base.VerifyException;
import io.github.resilience4j.core.IntervalFunction;
import io.github.resilience4j.retry.Retry;
import io.github.resilience4j.retry.RetryConfig;
@@ -13,6 +17,7 @@
import org.slf4j.LoggerFactory;
import org.slf4j.event.Level;
+import java.util.Locale;
import java.util.function.Supplier;
/**
@@ -52,6 +57,7 @@ public final class RetryHandler {
private final int maxAttempts;
private final int numFailsBeforeWarn;
private final Retry retry;
+ private final Counter grpcRetryCounter;
/**
* Constructs the retry policy for a given gRPC service.
@@ -66,6 +72,14 @@ public RetryHandler(Configurator configG, String retryName) {
maxAttempts = configG.findIntEntry(GRPC_RETRY_MAX_ATTEMPTS, 4);
numFailsBeforeWarn = configG.findIntEntry(GRPC_RETRY_NUM_FAILS_BEFORE_WARN, 3);
+ // Initialize Dropwizard counter for gRPC retries
+ try {
+ grpcRetryCounter = MetricsManager.lookup().getMetricRegistry()
+ .counter("grpc.retries." + sanitizeMetricName(retryName));
+ } catch (NamespaceException e) {
+ throw new VerifyException("Failed to initialize gRPC retry counter", e);
+ }
+
retry = Retry.of(internalName, RetryConfig.custom()
.maxAttempts(maxAttempts)
.intervalFunction(IntervalFunction.ofExponentialBackoff(
@@ -79,7 +93,17 @@ public RetryHandler(Configurator configG, String retryName) {
.onError(this::logMessageOnError);
}
+ /**
+ * Sanitizes metric names to be compatible with Prometheus naming conventions
+ */
+ private static String sanitizeMetricName(String name) {
+ return name.replaceAll("[^a-zA-Z0-9_]", "_").toLowerCase(Locale.ROOT);
+ }
+
private void logMessageOnRetry(RetryOnRetryEvent event) {
+ // Increment the Dropwizard counter for each retry attempt
+ grpcRetryCounter.inc();
+
int attemptNumber = event.getNumberOfRetryAttempts();
Level level = attemptNumber <= numFailsBeforeWarn ? Level.INFO : Level.WARN;
logger.atLevel(level).log("{} failed gRPC connection attempt #{} with event error: {}", internalName, attemptNumber, event);
diff --git a/src/main/java/emissary/server/EmissaryServer.java b/src/main/java/emissary/server/EmissaryServer.java
index 60c9de9fd7..e04ae517f0 100644
--- a/src/main/java/emissary/server/EmissaryServer.java
+++ b/src/main/java/emissary/server/EmissaryServer.java
@@ -627,7 +627,18 @@ private static ConstraintSecurityHandler buildSecurityHandler() {
health.setPathSpec("/api/health");
health.setConstraint(noAuthConstraint);
- handler.setConstraintMappings(new ConstraintMapping[] {mapping, health});
+ // TODO: figure out how to allow Digest Authenticator to work with Prometheus
+
+ ConstraintMapping metrics = new ConstraintMapping();
+ metrics.setPathSpec("/api/metrics");
+ metrics.setConstraint(noAuthConstraint);
+
+ ConstraintMapping metricsPrometheus = new ConstraintMapping();
+ metricsPrometheus.setPathSpec("/api/metrics?format=prometheus");
+ metricsPrometheus.setConstraint(noAuthConstraint);
+
+ handler.setConstraintMappings(
+ new ConstraintMapping[] {mapping, health, metrics, metricsPrometheus});
handler.setAuthenticator(new DigestAuthenticator());
return handler;
}
diff --git a/src/main/java/emissary/server/api/HealthCheckAction.java b/src/main/java/emissary/server/api/HealthCheckAction.java
index b78fd80852..7949319149 100644
--- a/src/main/java/emissary/server/api/HealthCheckAction.java
+++ b/src/main/java/emissary/server/api/HealthCheckAction.java
@@ -3,14 +3,18 @@
import emissary.core.MetricsManager;
import emissary.core.NamespaceException;
+import com.codahale.metrics.health.HealthCheck;
import jakarta.ws.rs.GET;
import jakarta.ws.rs.Path;
import jakarta.ws.rs.Produces;
+import jakarta.ws.rs.QueryParam;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.util.Map;
+
@Path("")
// context is /api, set in EmissaryServer
public class HealthCheckAction {
@@ -20,13 +24,56 @@ public class HealthCheckAction {
@GET
@Path("/" + HEALTH)
- @Produces(MediaType.APPLICATION_JSON)
- public Response clusterAgents() {
+ @Produces({MediaType.APPLICATION_JSON, MediaType.TEXT_PLAIN})
+ public Response healthCheck(@QueryParam("format") String format) {
try {
- return Response.ok().entity(MetricsManager.lookup().getHealthCheckRegistry().runHealthChecks()).build();
+ Map results = MetricsManager.lookup().getHealthCheckRegistry().runHealthChecks();
+
+ if ("prometheus".equals(format)) {
+ return generatePrometheusResponse(results);
+ } else {
+ // Default to JSON format
+ return generateJsonResponse(results);
+ }
} catch (NamespaceException ex) {
logger.warn("Could not lookup MetricsManager", ex);
- return Response.serverError().entity("Could not lookup MetricsManager").build();
+
+ if ("prometheus".equals(format)) {
+ return generatePrometheusErrorResponse();
+ } else {
+ return generateJsonErrorResponse();
+ }
+ }
+ }
+
+ protected Response generateJsonResponse(Map results) {
+ return Response.ok().entity(results).build();
+ }
+
+ protected Response generateJsonErrorResponse() {
+ return Response.serverError().entity("Could not lookup MetricsManager").build();
+ }
+
+ private static Response generatePrometheusResponse(Map results) {
+ boolean allHealthy = results.values().stream().allMatch(HealthCheck.Result::isHealthy);
+
+ StringBuilder metricsOutput = new StringBuilder();
+ metricsOutput.append("# HELP emissary_health_status Health status of emissary service (1=healthy, 0=unhealthy)\n");
+ metricsOutput.append("# TYPE emissary_health_status gauge\n");
+ metricsOutput.append("emissary_health_status ").append(allHealthy ? "1" : "0").append("\n");
+
+ if (allHealthy) {
+ return Response.ok(metricsOutput.toString()).build();
+ } else {
+ return Response.status(Response.Status.SERVICE_UNAVAILABLE).entity(metricsOutput.toString()).build();
}
}
+
+ private static Response generatePrometheusErrorResponse() {
+ StringBuilder errorMetrics = new StringBuilder();
+ errorMetrics.append("# HELP emissary_health_status Health status of emissary service (1=healthy, 0=unhealthy)\n");
+ errorMetrics.append("# TYPE emissary_health_status gauge\n");
+ errorMetrics.append("emissary_health_status 0\n");
+ return Response.status(Response.Status.SERVICE_UNAVAILABLE).entity(errorMetrics.toString()).build();
+ }
}
diff --git a/src/main/java/emissary/server/api/MetricsAction.java b/src/main/java/emissary/server/api/MetricsAction.java
index 9b92706b94..6d67a90a86 100644
--- a/src/main/java/emissary/server/api/MetricsAction.java
+++ b/src/main/java/emissary/server/api/MetricsAction.java
@@ -3,14 +3,20 @@
import emissary.core.MetricsManager;
import emissary.core.NamespaceException;
+import io.prometheus.client.CollectorRegistry;
+import io.prometheus.client.dropwizard.DropwizardExports;
+import io.prometheus.client.exporter.common.TextFormat;
import jakarta.ws.rs.GET;
import jakarta.ws.rs.Path;
-import jakarta.ws.rs.Produces;
+import jakarta.ws.rs.QueryParam;
import jakarta.ws.rs.core.MediaType;
import jakarta.ws.rs.core.Response;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.io.StringWriter;
+
@Path("")
// context is /api, set in EmissaryServer
public class MetricsAction {
@@ -18,14 +24,33 @@ public class MetricsAction {
@GET
@Path("/metrics")
- @Produces(MediaType.APPLICATION_JSON)
- public Response clusterAgents() {
+ public Response metrics(@QueryParam("format") String format) {
try {
-
- return Response.ok().entity(MetricsManager.lookup().getMetricRegistry()).build();
- } catch (NamespaceException ex) {
- logger.warn("Could not lookup MetricsManager", ex);
- return Response.serverError().entity("Could not lookup MetricsManager").build();
+ if ("prometheus".equals(format)) {
+ return prometheusMetrics();
+ } else {
+ return jsonMetrics();
+ }
+ } catch (Exception e) {
+ logger.warn("Could not retrieve metrics", e);
+ return Response.serverError().entity("Could not retrieve metrics: " + e.getMessage()).build();
}
}
+
+ private static Response jsonMetrics() throws NamespaceException {
+ return Response.ok()
+ .entity(MetricsManager.lookup().getMetricRegistry())
+ .type(MediaType.APPLICATION_JSON)
+ .build();
+ }
+
+ private static Response prometheusMetrics() throws NamespaceException, IOException {
+ CollectorRegistry registry = new CollectorRegistry();
+ registry.register(new DropwizardExports(MetricsManager.lookup().getMetricRegistry()));
+ StringWriter writer = new StringWriter();
+ TextFormat.writeFormat(TextFormat.CONTENT_TYPE_004, writer, registry.metricFamilySamples());
+ return Response.ok(writer.toString())
+ .type(TextFormat.CONTENT_TYPE_004)
+ .build();
+ }
}
diff --git a/src/main/resources/emissary/core/MetricsManager.cfg b/src/main/resources/emissary/core/MetricsManager.cfg
index fab7af2c88..ff8f2bf05c 100644
--- a/src/main/resources/emissary/core/MetricsManager.cfg
+++ b/src/main/resources/emissary/core/MetricsManager.cfg
@@ -1,4 +1,6 @@
# Placeholder for now
+JVM_METRICS_ENABLED = true
+JVM_ADDITIONAL_METRICS_ENABLED = true
JMX_METRICS_ENABLED = true
SLF4J_METRICS_ENABLED = false
GRAPHITE_METRICS_ENABLED = false
@@ -9,5 +11,3 @@ GRAPHITE_METRICS_ENABLED = false
# If not specified or commented out, there are no limits
#MAX_FILE_COUNT_BEFORE_UNHEALTHY = 2000
#MAX_AGGREGATE_FIZE_SIZE_BEFORE_UNHEALTHY_BYTES = 1000000000
-
-
diff --git a/src/test/java/emissary/server/api/EmissaryApiTest.java b/src/test/java/emissary/server/api/EmissaryApiTest.java
index 05e009e848..b8f2ac273c 100644
--- a/src/test/java/emissary/server/api/EmissaryApiTest.java
+++ b/src/test/java/emissary/server/api/EmissaryApiTest.java
@@ -293,7 +293,7 @@ void metrics() {
when(manager.getMetricRegistry()).thenReturn(registry);
Namespace.bind("MetricsManager", manager);
- try (Response response = metrics.clusterAgents()) {
+ try (Response response = metrics.metrics(null)) {
assertEquals(Response.Status.OK.getStatusCode(), response.getStatus());
Map metricsMap = ((MetricRegistry) response.getEntity()).getMetrics();
assertEquals(100, ((Counter) metricsMap.get("testing")).getCount());
@@ -314,7 +314,7 @@ void healthcheck() {
when(registry.runHealthChecks()).thenReturn(results);
Namespace.bind("MetricsManager", manager);
- try (Response response = health.clusterAgents()) {
+ try (Response response = health.generateJsonResponse(results)) {
assertEquals(Response.Status.OK.getStatusCode(), response.getStatus());
assertTrue(response.getEntity().toString().contains("isHealthy=true"));
assertTrue(response.getEntity().toString().contains("message=Okay"));