Skip to content

Commit 3f7c237

Browse files
committed
chore: update user_queries dataset
1 parent b23b9bc commit 3f7c237

File tree

7 files changed

+778
-3301
lines changed

7 files changed

+778
-3301
lines changed

python/optimizers/datasets/user_queries.json

Lines changed: 694 additions & 3258 deletions
Large diffs are not rendered by default.

python/optimizers/results/optimized_mcp_program.json

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

python/optimizers/results/optimized_rag.json

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

python/optimizers/results/optimized_retrieval_program.json

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

python/src/cairo_coder/dspy/query_processor.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919
RESOURCE_DESCRIPTIONS = {
2020
DocumentSource.CAIRO_BOOK: "The Cairo Programming Language Book. Essential for core language syntax, semantics, types (felt252, structs, enums, Vec), traits, generics, control flow, memory management, writing tests, organizing a project, standard library usage, starknet interactions. Crucial for smart contract structure, storage, events, ABI, syscalls, contract deployment, interaction, L1<>L2 messaging, Starknet-specific attributes. Very important for interactions with the Starknet state and context (e.g. block, transaction) through syscalls.",
2121
DocumentSource.STARKNET_DOCS: "The Starknet Documentation. For the Starknet protocol, the STWO prover, architecture, APIs, syscalls, network interaction, deployment, ecosystem tools (Starkli, indexers, StarknetJS, wallets), general Starknet knowledge. This should not be included for Coding and Programming questions, but rather, only for questions about Starknet, Proving, ZK, STWO, SHARP itself.",
22-
DocumentSource.STARKNET_FOUNDRY: "The Starknet Foundry Documentation. For using the Foundry toolchain: writing, compiling, testing (unit tests, integration tests), and debugging Starknet contracts.",
22+
DocumentSource.STARKNET_FOUNDRY: "The Starknet Foundry Documentation. For using the Foundry toolchain: `snforge` for writing, compiling, testing (unit tests, integration tests), and debugging Starknet contracts. `sncast` for deploying and interacting with contracts to Starknet.",
2323
DocumentSource.CAIRO_BY_EXAMPLE: "Cairo by Example Documentation. Provides practical Cairo code snippets for specific language features or common patterns. Useful for how-to syntax questions. This should not be included for Smart Contract questions, but for all other Cairo programming questions.",
2424
DocumentSource.OPENZEPPELIN_DOCS: "OpenZeppelin Cairo Contracts Documentation. For using the OZ library: standard implementations (ERC20, ERC721), access control, security patterns, contract upgradeability. Crucial for building standard-compliant contracts.",
2525
DocumentSource.CORELIB_DOCS: "Cairo Core Library Documentation. For using the Cairo core library: basic types, stdlib functions, stdlib structs, macros, and other core concepts. Essential for Cairo programming questions.",
2626
DocumentSource.SCARB_DOCS: "Scarb Documentation. For using the Scarb package manager: building, compiling, generating compilation artifacts, managing dependencies, configuration of Scarb.toml.",
27-
DocumentSource.STARKNET_JS: "StarknetJS Documentation. For using the StarknetJS library: interacting with Starknet contracts, StarknetJS APIs, StarknetJS examples, StarknetJS tutorials, StarknetJS guides, StarknetJS documentation.",
27+
DocumentSource.STARKNET_JS: "StarknetJS Documentation. For using the StarknetJS library: interacting with Starknet contracts, (calls and transactions), deploying Starknet contracts, front-end APIs, javascript integration examples, guides, tutorials and general JS/TS documentation for starknet.",
2828
}
2929

3030
# Ensure all DocumentSource variants are covered
@@ -61,7 +61,7 @@ class CairoQueryAnalysis(dspy.Signature):
6161
)
6262

6363
resources: list[str] = dspy.OutputField(
64-
desc="List of documentation sources. Available sources: "
64+
desc="List of documentation sources. If unsure what to use or if the query is not clear, use all of the available sources. Available sources: "
6565
+ ", ".join([f"{key.value}: {value}" for key, value in RESOURCE_DESCRIPTIONS.items()])
6666
)
6767

python/src/cairo_coder/optimizers/retrieval_optimizer.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def _(dspy, os, retriever):
108108
with open(dataset_path, encoding="utf-8") as f:
109109
example_dataset = json.load(f)
110110

111-
data = [dspy.Example(**d).with_inputs("query") for d in example_dataset]
111+
data = [dspy.Example({"query": d}).with_inputs("query") for d in example_dataset]
112112

113113
# Take maximum 300 random values from the dataset
114114
random.seed(42)
@@ -253,7 +253,7 @@ def compute_overall_score_with_feedback(
253253
score=score,
254254
feedback=feedback_text,
255255
)
256-
return RESOURCE_DESCRIPTIONS, compute_overall_score_with_feedback
256+
return (compute_overall_score_with_feedback,)
257257

258258

259259
@app.cell
@@ -368,6 +368,7 @@ def _(dspy, example, loading_progr):
368368
def _():
369369
return
370370

371+
371372
@app.cell
372373
def _():
373374
return

python/src/scripts/datasets/cli.py

Lines changed: 69 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -18,37 +18,14 @@ class HelpOnInvalidCommand(TyperGroup):
1818
def get_command(self, ctx, cmd_name): # type: ignore[override]
1919
cmd = super().get_command(ctx, cmd_name)
2020
if cmd is None:
21+
# Show a friendly message and the group's help, then exit with code 2
2122
typer.secho(f"Error: Unknown command '{cmd_name}'.", fg=typer.colors.RED, err=True)
2223
typer.echo()
2324
typer.echo(ctx.get_help())
24-
ctx.exit(2)
25+
# Use Click's normal control flow to avoid rich tracebacks
26+
raise click.exceptions.Exit(2)
2527
return cmd
2628

27-
def main(self, *args, **kwargs): # type: ignore[override]
28-
# Ensure we handle errors ourselves to show full help
29-
kwargs.setdefault("standalone_mode", False)
30-
try:
31-
return super().main(*args, **kwargs)
32-
except click.exceptions.UsageError as e: # missing option, bad invocation, etc.
33-
ctx = e.ctx or click.Context(self)
34-
# Show the specific command help if available
35-
typer.secho(f"Error: {e.format_message().strip()}", fg=typer.colors.RED, err=True)
36-
typer.echo()
37-
typer.echo(ctx.get_help())
38-
ctx.exit(2)
39-
40-
def __call__(self, *args, **kwargs): # type: ignore[override]
41-
# Force standalone_mode=False so exceptions propagate
42-
kwargs["standalone_mode"] = False
43-
try:
44-
return super().__call__(*args, **kwargs)
45-
except click.exceptions.UsageError as e:
46-
ctx = e.ctx or click.Context(self)
47-
typer.secho(f"Error: {e.format_message().strip()}", fg=typer.colors.RED, err=True)
48-
typer.echo()
49-
typer.echo(ctx.get_help())
50-
ctx.exit(2)
51-
5229

5330
app = typer.Typer(
5431
cls=HelpOnInvalidCommand,
@@ -87,16 +64,48 @@ def extract_starknet_agent(
8764
"--output",
8865
help="Where to write the extracted [{query, answer}] JSON array.",
8966
),
67+
query_only: bool = typer.Option(
68+
False,
69+
"--query",
70+
help="Write only queries as a JSON array instead of [{query, answer}] objects.",
71+
),
9072
) -> None:
9173
"""Extract de-duplicated QA pairs from a Starknet Agent chat dataset."""
9274
pairs = extract_starknet_agent_pairs(str(input))
9375

76+
# De-duplicate pairs by (query, answer) while preserving order
77+
seen_pairs: set[tuple[str, str]] = set()
78+
unique_pairs: list[dict] = []
79+
for p in pairs:
80+
q = p.get("query")
81+
a = p.get("answer")
82+
key = (q, a)
83+
if key not in seen_pairs:
84+
seen_pairs.add(key)
85+
unique_pairs.append(p)
86+
87+
if query_only:
88+
# De-duplicate queries while preserving order
89+
seen_queries: set[str] = set()
90+
queries_only: list[str] = []
91+
for p in unique_pairs:
92+
q = p.get("query")
93+
if q not in seen_queries:
94+
seen_queries.add(q)
95+
queries_only.append(q)
96+
data_to_write = queries_only
97+
else:
98+
data_to_write = unique_pairs
99+
94100
output = Path(output).expanduser()
95101
output.parent.mkdir(parents=True, exist_ok=True)
96102
with output.open("w", encoding="utf-8") as f:
97-
json.dump(pairs, f, ensure_ascii=False, indent=2)
103+
json.dump(data_to_write, f, ensure_ascii=False, indent=2)
98104

99-
typer.echo(f"Wrote {len(pairs)} pairs to {output}")
105+
if query_only:
106+
typer.echo(f"Wrote {len(data_to_write)} queries to {output}")
107+
else:
108+
typer.echo(f"Wrote {len(data_to_write)} pairs to {output}")
100109

101110

102111
@extract_app.command("cairo-coder")
@@ -133,6 +142,11 @@ def extract_cairo_coder(
133142
"`reasoning=`."
134143
),
135144
),
145+
query_only: bool = typer.Option(
146+
False,
147+
"--query",
148+
help="Write only queries as a JSON array instead of [{query, answer}] objects.",
149+
),
136150
) -> None:
137151
"""Extract QA pairs from a Cairo-Coder LangSmith export with `outputs.output`.
138152
@@ -145,16 +159,42 @@ def extract_cairo_coder(
145159
str(input), only_mcp=only_mcp, only_generated_answers=only_generated_answers
146160
)
147161

162+
# De-duplicate pairs by (query, answer) while preserving order
163+
seen_pairs: set[tuple[str, str]] = set()
164+
unique_pairs: list[dict] = []
165+
for p in pairs:
166+
q = p.get("query")
167+
a = p.get("answer")
168+
key = (q, a)
169+
if key not in seen_pairs:
170+
seen_pairs.add(key)
171+
unique_pairs.append(p)
172+
173+
if query_only:
174+
# De-duplicate queries while preserving order
175+
seen_queries: set[str] = set()
176+
queries_only: list[str] = []
177+
for p in unique_pairs:
178+
q = p.get("query")
179+
if q not in seen_queries:
180+
seen_queries.add(q)
181+
queries_only.append(q)
182+
data_to_write = queries_only
183+
else:
184+
data_to_write = unique_pairs
185+
148186
output = Path(output).expanduser()
149187
output.parent.mkdir(parents=True, exist_ok=True)
150188
with output.open("w", encoding="utf-8") as f:
151-
json.dump(pairs, f, ensure_ascii=False, indent=2)
189+
json.dump(data_to_write, f, ensure_ascii=False, indent=2)
152190

153191
typer.echo(
154192
json.dumps(
155193
{
156194
"input": str(Path(input).expanduser()),
157195
"output": str(output),
196+
"wrote": len(data_to_write),
197+
"format": "queries" if query_only else "pairs",
158198
**stats,
159199
},
160200
indent=2,

0 commit comments

Comments
 (0)