File tree: 2 files changed, +11 -6 lines changed

services/log-parser/src/providers/coralogix
@@ -24,7 +24,7 @@ static_resources:
         route:
           cluster: api_service
           prefix_rewrite: "/"
-          timeout: 90s
+          timeout: 180s
       - match:
           prefix: "/slackbot/"
         route:
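This hunk only raises the api_service route timeout from 90s to 180s. A minimal sketch of what that implies for callers going through this route, assuming a hypothetical Python client built on requests (the URL, helper name, and payload are illustrative and not part of this change): the client's own read timeout needs to cover Envoy's new 180s budget, or the client gives up before Envoy does.

import requests

# Hypothetical endpoint behind the Envoy route above; not part of this PR.
API_URL = "http://envoy.internal/api/parse"

def call_api(payload: dict) -> dict:
    # Envoy now allows the upstream up to 180s, so the client read timeout
    # is set slightly above that so Envoy, not the client, times out first.
    response = requests.post(API_URL, json=payload, timeout=(5, 185))
    response.raise_for_status()
    return response.json()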
@@ -90,18 +90,21 @@ def process_template_group(group):
     return df

 df_enriched_clusters = df.groupby("EventTemplate").apply(process_template_group)
-df_enriched_clusters = df_enriched_clusters.reset_index().drop(
-    columns=["level_1", "Content"]
-)
+df_enriched_clusters = df_enriched_clusters.reset_index()
 df_enriched_clusters["percentage"] = (
     df_enriched_clusters["occurrences"] / len(df) * 100
 )
 df_enriched_clusters = df_enriched_clusters.sort_values(
     "occurrences", ascending=False
 )
+try:
+    df_enriched_clusters = df_enriched_clusters.drop(columns=["level_1", "Content"])
+except Exception as e:
+    print("Error: ", e)

-# Remove log groups with only one occurrence
-df_enriched_clusters = df_enriched_clusters[df_enriched_clusters["occurrences"] > 1]
+# Get top 10 clusters by occurrences
+# TODO: we can use the elbow method here to find the optimal number of clusters
+df_enriched_clusters = df_enriched_clusters.head(10)

 records = df_enriched_clusters.to_dict(orient="records")
 for record in records:
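A minimal sketch of the pipeline this hunk changes, run on toy data. The column names ("EventTemplate", "Content", "occurrences") and the reset_index / drop / percentage / sort / head(10) steps come from the diff; the sample rows and the simplified process_template_group are assumptions, since the real implementation is not shown. It illustrates why the drop of "level_1" and "Content" is deferred and wrapped in try/except: those columns are only present for some shapes of per-group result, so the diff tolerates a failure there instead of assuming them.

import pandas as pd

# Toy input resembling the parsed-log frame; data is illustrative only.
df = pd.DataFrame(
    {
        "EventTemplate": ["user <*> logged in", "user <*> logged in", "disk <*> full"],
        "Content": ["user 1 logged in", "user 2 logged in", "disk sda full"],
    }
)

def process_template_group(group: pd.DataFrame) -> pd.DataFrame:
    # Collapse each template group to a single enriched row (illustrative).
    return pd.DataFrame(
        {"Content": [group["Content"].iloc[0]], "occurrences": [len(group)]}
    )

df_enriched_clusters = df.groupby("EventTemplate").apply(process_template_group)
df_enriched_clusters = df_enriched_clusters.reset_index()

# reset_index() turns the unnamed inner index into a "level_1" column here,
# so the drop succeeds; with other per-group shapes it may not, which is why
# the diff defers the drop and wraps it in try/except.
df_enriched_clusters = df_enriched_clusters.drop(columns=["level_1", "Content"])

df_enriched_clusters["percentage"] = df_enriched_clusters["occurrences"] / len(df) * 100
df_enriched_clusters = df_enriched_clusters.sort_values("occurrences", ascending=False)
print(df_enriched_clusters.head(10))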
@@ -132,6 +135,8 @@ def parse_raw_logs(logs: str):
         # Maybe we should use that in the future.
         if warning and not pd.isna(warning):
             continue
+        if type(batch) != dict:
+            continue

         for result in batch["results"]:
             logs.append(result)
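The added guard skips batches that are not dicts before batch["results"] is indexed. A minimal sketch of the guarded loop, assuming each batch is meant to look like {"warning": ..., "results": [...]}; the surrounding loop, variable names beyond those in the diff, and the sample input are assumptions.

import pandas as pd

def collect_results(batches: list) -> list:
    logs = []
    for batch in batches:
        # Skip anything that is not a dict before indexing into it
        # (the diff's `if type(batch) != dict`; isinstance is the more
        # idiomatic spelling of the same check).
        if not isinstance(batch, dict):
            continue
        warning = batch.get("warning")
        if warning and not pd.isna(warning):
            continue
        for result in batch["results"]:
            logs.append(result)
    return logs

print(collect_results([{"results": [1, 2]}, "not a dict", {"warning": "partial", "results": [3]}]))
# -> [1, 2]

isinstance(batch, dict) is the more idiomatic spelling of the same check and also accepts dict subclasses.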