Skip to content

Commit 27ba58c

Browse files
committed
update script for CC
1 parent 209ad2c commit 27ba58c

File tree

1 file changed: +20 −98 lines changed

python/src/scripts/export_langsmith_human_queries.py

Lines changed: 20 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
#!/usr/bin/env python3
21
"""
32
Extract human queries from LangSmith runs and export to JSON.
43
@@ -11,8 +10,8 @@
1110
LANGSMITH_PROJECT: Optional project name (default: "default")
1211
1312
Example usage:
14-
python extract_langsmith.py
15-
python extract_langsmith.py --days 7 --output queries.json
13+
uv run export_langsmith_human_queries.py
14+
uv run export_langsmith_human_queries.py --days 7 --output queries.json
1615
"""
1716

1817
from __future__ import annotations
@@ -32,29 +31,23 @@
3231
load_dotenv("../.env")
3332

3433

35-
class MessageType(Enum):
36-
"""Supported message types in LangChain format."""
37-
HUMAN = "human"
38-
HUMANMESSAGE = "humanmessage"
39-
CONSTRUCTOR = "constructor"
40-
41-
4234
@dataclass
4335
class RunQueries:
4436
"""Container for queries extracted from a single run."""
4537
run_id: str
4638
queries: list[str]
39+
mcp_mode: bool
4740

4841
def to_dict(self) -> dict[str, Any]:
49-
return {"run_id": str(self.run_id), "queries": self.queries}
42+
return {"run_id": self.run_id, "queries": self.queries, "mcp_mode": self.mcp_mode}
5043

5144

5245
@dataclass
5346
class Config:
5447
"""Script configuration."""
5548
output_path: Path
5649
days_back: int
57-
run_name_filter: str
50+
run_name_filters: list[str]
5851
project_name: str
5952

6053
@classmethod
@@ -63,83 +56,11 @@ def from_args(cls, args: argparse.Namespace) -> Config:
6356
return cls(
6457
output_path=Path(args.output),
6558
days_back=args.days,
66-
run_name_filter=args.name,
59+
run_name_filters=args.names,
6760
project_name=project_name,
6861
)
6962

7063

71-
class HumanMessageExtractor:
72-
"""Extracts human messages from LangChain message structures."""
73-
74-
HUMAN_MESSAGE_KEYS = ["chat_history", "messages", "inputs", "input"]
75-
76-
@staticmethod
77-
def _is_human_message_by_id(message: dict[str, Any]) -> bool:
78-
"""Check if message is human type based on ID field."""
79-
msg_id = message["id"]
80-
if isinstance(msg_id, list) and msg_id:
81-
return str(msg_id[-1]).lower() == MessageType.HUMANMESSAGE.value
82-
return False
83-
84-
@staticmethod
85-
def _is_human_message_by_type(message: dict[str, Any]) -> bool:
86-
"""Check if message is human type based on type field."""
87-
msg_type = str(message["type"]).lower()
88-
return msg_type in {MessageType.HUMAN.value, MessageType.HUMANMESSAGE.value}
89-
90-
def _process_message_object(self, obj: Any) -> list[str]:
91-
"""Process a single message object and extract human content."""
92-
# Check if this is a human message
93-
is_human = (
94-
self._is_human_message_by_id(obj) or
95-
self._is_human_message_by_type(obj)
96-
)
97-
if not is_human:
98-
return []
99-
100-
if "kwargs" not in obj or "content" not in obj["kwargs"]:
101-
raise ValueError(f"Expected kwargs and content in message object: {obj}")
102-
103-
content = obj["kwargs"]["content"]
104-
return [content] if content else []
105-
106-
def _process_value(self, value: Any) -> list[str]:
107-
if not isinstance(value, list):
108-
raise ValueError(f"Expected list, got {type(value)}: {value}")
109-
results = []
110-
for item in value:
111-
results.extend(self._process_message_object(item))
112-
return results
113-
114-
def extract(self, inputs: dict[str, Any]) -> list[str]:
115-
"""
116-
Extract human messages from LangChain inputs.
117-
118-
Args:
119-
inputs: Dictionary containing message data in various formats
120-
121-
Returns:
122-
List of unique human message strings in order of appearance
123-
"""
124-
results = []
125-
126-
for key in self.HUMAN_MESSAGE_KEYS:
127-
if key not in inputs:
128-
continue
129-
130-
results.extend(self._process_value(inputs[key]))
131-
132-
# Deduplicate while preserving order
133-
seen = set()
134-
unique_results = []
135-
for query in results:
136-
if query not in seen:
137-
unique_results.append(query)
138-
seen.add(query)
139-
140-
return unique_results
141-
142-
14364
class RunDeduplicator:
14465
"""Removes runs whose queries are prefixes of other runs."""
14566

@@ -181,7 +102,6 @@ class LangSmithExporter:
181102
def __init__(self, config: Config):
182103
self.config = config
183104
self.client = Client()
184-
self.extractor = HumanMessageExtractor()
185105
self.deduplicator = RunDeduplicator()
186106

187107
def _get_time_range(self) -> tuple[datetime, datetime]:
@@ -193,24 +113,25 @@ def _get_time_range(self) -> tuple[datetime, datetime]:
193113
def fetch_runs(self) -> list[RunQueries]:
194114
start_time, end_time = self._get_time_range()
195115

116+
117+
filter_clauses = [f'eq(name, "{name}")' for name in self.config.run_name_filters]
196118
query_params = {
197119
"start_time": start_time,
198120
"end_time": end_time,
199-
"filter": f'eq(name, "{self.config.run_name_filter}")',
121+
"filter": f'or({",".join(filter_clauses)})',
200122
"project_name": self.config.project_name,
201123
}
202124

203125
runs = []
204-
for run in self.client.list_runs(**query_params):
126+
all_runs = self.client.list_runs(**query_params)
127+
for run in all_runs:
205128
run_data = run.dict()
206129
inputs = run_data["inputs"]
207-
208-
queries = self.extractor.extract(inputs)
209-
if queries:
210-
runs.append(RunQueries(
211-
run_id=run_data["id"],
212-
queries=queries,
213-
))
130+
query = inputs["query"]
131+
chat_history = inputs["chat_history"]
132+
user_queries_in_history = [msg['content'] for msg in chat_history if msg["role"] == "user"]
133+
full_query = user_queries_in_history + [query]
134+
runs.append(RunQueries(run_id=str(run_data["id"]), queries=full_query, mcp_mode=inputs["mcp_mode"]))
214135

215136
return runs
216137

@@ -253,9 +174,10 @@ def parse_arguments() -> argparse.Namespace:
253174
help="Number of days to look back (default: %(default)s)",
254175
)
255176
parser.add_argument(
256-
"--name",
257-
default="RunnableSequence",
258-
help="Filter runs by name (default: %(default)s)",
177+
"--names",
178+
default=["RagPipeline", "RagPipelineStreaming"],
179+
nargs="+",
180+
help="Filter runs by names (default: %(default)s)",
259181
)
260182

261183
return parser.parse_args()

0 commit comments

Comments (0)