elastic · muthu-mps · Nov 7, 2024 · Nov 7, 2024 · Nov 7, 2024 · Nov 7, 2024
@@ -0,0 +1,3 @@
+dependencies:
+  ecs:
+    reference: [email protected]
@@ -6,12 +6,54 @@ Vertex AI is a platform that enables the training and deployment of machine lear
 
 The integration with Google Cloud Platform (GCP) Vertex AI allows you to gather metrics such as token usage, latency, overall invocations, and error rates for deployed models. Additionally, it tracks resource utilization metrics for the model replicas as well as [prediction metrics](https://cloud.google.com/vertex-ai/docs/predictions/overview) of endpoints.
 
-## Configuration
+## Data streams
 
-For fetching the metrics, users need to enter the project_id and the credentials file/json.
-
-## Metrics
+### Metrics
 
 The GCP Vertex AI integration includes **Vertex AI Model Garden Publisher Model** metrics under the publisher category and **Vertex AI Endpoint** metrics under the prediction category.
 
+#### Requirements
+
+You need Elasticsearch for storing and searching your data and Kibana for visualizing and managing it.
+You can use our hosted Elasticsearch Service on Elastic Cloud, which is recommended, or self-manage the Elastic Stack on your own hardware.
+
+Before using any GCP integration you will need:
+
+* **GCP Credentials** to connect with your GCP account.
+* **GCP Permissions** to make sure the service account you're using to connect has permission to share the relevant data.
+
+#### Roles & Permissions
+
+There isn't a single, specific role required to view metrics for Vertex AI. Access depends on how the models are deployed and the permissions granted to your Google Cloud project and user account. 
+
+In general, you need a role that includes the following permissions:
+
+- **monitoring.metricDescriptors.list:** Allows you to list available metric descriptors.
+- **monitoring.timeSeries.list:** Allows you to list time series data for the metrics.
+
+These permissions are included in many roles, but here are some of the most common ones:
+
+- **roles/monitoring.viewer:** This role provides read-only access to Cloud Monitoring metrics.
+- **roles/aiplatform.user:** This role grants broader access to Vertex AI, including model viewing and potentially metric access.
+- **More granular roles:** For fine-grained control (recommended for security best practices), consider using a custom role built with the specific permissions needed. This would only include the necessary permissions to view model metrics, rather than broader access to all Vertex AI or Cloud Monitoring resources. This requires expertise in IAM (Identity and Access Management).
+- **Predefined roles with broader access:** These roles provide extensive permissions within the Google Cloud project, giving access to metrics but granting much broader abilities than necessary for just viewing metrics. These are generally too permissive unless necessary for other tasks. Examples are `roles/aiplatform.user` or `roles/editor`.
+
+#### Configuration
+
+To fetch the metrics, enter the `project_id` and either the path to a credentials file or the credentials JSON.
+
+Refer to [Google Cloud Platform configuration](https://www.elastic.co/docs/current/integrations/gcp#configure-the-integration-settings) for more information about the configuration.
+
+#### Troubleshooting
+
+Refer to [Google Cloud Platform troubleshooting](https://www.elastic.co/docs/current/integrations/gcp#metrics-collection-configuration:~:text=to%20collect%20metrics.-,Troubleshooting,-If%20you%20don%27t) for more information about troubleshooting the issue.
+
+#### Metrics reference
+
+{{event "metrics"}}
+
+**ECS Field Reference**
+
+Please refer to the following [document](https://www.elastic.co/guide/en/ecs/current/ecs-field-reference.html) for detailed information on ECS fields.
+
 {{fields "metrics"}}
@@ -1,11 +1,16 @@
 # newer versions go on top
+- version: "0.1.0"
+  changes:
+    - description: Update documentation with roles and permissions.
+      type: enhancement
+      link: https://github.com/elastic/integrations/pull/11659
 - version: "0.0.2"
   changes:
-    - description:  Enhancements to dashboards, configuration, documentation
+    - description: Enhancements to dashboards, configuration, documentation.
       type: enhancement
       link: https://github.com/elastic/integrations/pull/11373
 - version: "0.0.1"
   changes:
-    - description: Initial draft of the GCP Vertex AI package
+    - description: Initial draft of the GCP Vertex AI package.
       type: enhancement
       link: https://github.com/elastic/integrations/pull/11225
@@ -13,7 +13,7 @@
         - name: model_invocation_count
           type: long
           metric_type: gauge
-          description: Number of model invocations (prediction requests). 
+          description: Number of model invocations (prediction requests).
         - name: character_count
           type: long
           metric_type: gauge
@@ -75,6 +75,70 @@
         - name: prediction_latencies
           type: histogram
           description: Online prediction latency of the deployed model.
-- name: gcp.labels.resource.location
-  type: keyword
-  description: Location of the resource
+## Label Fields
+- name: gcp.labels
+  description: Google Vertex AI dimension fields.
+  type: group
+  fields:
+    - name: resource
+      description: Resource fields.
+      type: group
+      fields:
+        - name: location
+          type: keyword
+          description: The region in which the service is running.
+        - name: model_user_id
+          type: keyword
+          description: The resource ID of the PublisherModel.
+        - name: model_version_id
+          type: keyword
+          description: The version ID of the PublisherModel.
+        - name: publisher
+          type: keyword
+          description: The publisher of the model.
+        - name: endpoint_id
+          type: keyword
+          description: The ID of the Endpoint.
+        - name: resource_container
+          type: keyword
+          description: The identifier of the GCP Project owning the Endpoint.
+    - name: metrics
+      description: Metrics fields.
+      type: group
+      fields:
+        - name: request_type
+          type: keyword
+          description: The type of traffic of the request (dedicated/shared).
+        - name: type
+          type: keyword
+          description: Type of token (input/output).
+        - name: max_token_size
+          type: keyword
+          description: The bucketized max size of number of tokens in the prediction request/response.
+        - name: input_token_size
+          type: keyword
+          description: The bucketized size of number of tokens in the prediction request.
+        - name: output_token_size
+          type: keyword
+          description: The bucketized size of number of tokens in the prediction response.
+        - name: response_code
+          type: keyword
+          description: Response code of prediction request.
+        - name: method
+          type: keyword
+          description: The type of method of the request (RawPredict/StreamRawPredict/ChatCompletions/etc).
+        - name: error_category
+          type: keyword
+          description: Response error category of the request (user/system/capacity).
+        - name: latency_type
+          type: keyword
+          description: The type of latency for the prediction request (either model or overhead).
+        - name: deployed_model_id
+          type: keyword
+          description: The ID of the DeployedModel which serves the prediction request.
+        - name: spot
+          type: keyword
+          description: Whether this deployment is on Spot VMs. Has values of True or False.
+        - name: replica_id
+          type: keyword
+          description: Unique ID corresponding to the model replica.
@@ -17,4 +17,3 @@ streams:
         multi: false
         required: false
         show_user: true
-
@@ -0,0 +1,71 @@
+{
+    "cloud": {
+        "provider": "gcp",
+        "account": {
+            "name": "elastic-sa",
+            "id": "elastic-sa"
+        }
+    },
+    "agent": {
+        "name": "docker-fleet-agent",
+        "id": "f9c4beb9-c0c0-47ca-963a-a9dc00e2df5e",
+        "ephemeral_id": "6c42a949-d522-44bf-818b-12c4a5908b90",
+        "type": "metricbeat",
+        "version": "8.15.2"
+    },
+    "@timestamp": "2024-11-07T05:50:40.000Z",
+    "ecs": {
+        "version": "8.0.0"
+    },
+    "gcp": {
+        "vertexai": {
+            "publisher": {
+                "online_serving": {
+                    "token_count": 13
+                }
+            }
+        },
+        "labels": {
+            "resource": {
+                "model_user_id": "gemini-1.5-flash-002",
+                "model_version_id": "",
+                "publisher": "google",
+                "location": "us-central1"
+            },
+            "metrics": {
+                "request_type": "shared",
+                "type": "input"
+            }
+        }
+    },
+    "service": {
+        "type": "gcp"
+    },
+    "data_stream": {
+        "namespace": "default",
+        "type": "metrics",
+        "dataset": "gcp_vertexai.metrics"
+    },
+    "elastic_agent": {
+        "id": "f9c4beb9-c0c0-47ca-963a-a9dc00e2df5e",
+        "version": "8.15.2",
+        "snapshot": false
+    },
+    "host": {
+        "hostname": "docker-fleet-agent",
+        "ip": [
+            "172.25.0.7"
+        ]
+    },
+    "metricset": {
+        "period": 60000,
+        "name": "metrics"
+    },
+    "event": {
+        "duration": 913154084,
+        "agent_id_status": "verified",
+        "ingested": "2024-11-07T05:57:17Z",
+        "module": "gcp",
+        "dataset": "gcp_vertexai.metrics"
+    }
+}