
Commit e6749e5

Merge pull request #163 from ls1intum/add-grading-critera-support-llm
Add grading criteria support for llm modules
2 parents: 24caa9b + 6f8d009, commit e6749e5

File tree

8 files changed, +102 -14 lines changed

athena/athena/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 from .app import app
-from .schemas import ExerciseType
+from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction
 from .metadata import emit_meta, get_meta
 from .experiment import get_experiment_environment
 from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider # type: ignore
@@ -32,5 +32,7 @@ def run_module():
     "get_meta",
     "get_experiment_environment",
     "ExerciseType",
+    "GradingCriterion",
+    "StructuredGradingInstruction",
     "app"
 ]
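
With GradingCriterion and StructuredGradingInstruction re-exported from the athena package, downstream modules can import them directly, as both helpers/utils.py changes further down do. A minimal consumer sketch (the helper function here is illustrative only, not part of this PR):

from typing import List

from athena import GradingCriterion


def count_structured_instructions(grading_criteria: List[GradingCriterion]) -> int:
    # Each criterion carries a list of nested structured grading instructions.
    return sum(len(criterion.structured_grading_instructions) for criterion in grading_criteria)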

athena/athena/schemas/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@
 from .programming_feedback import ProgrammingFeedback
 from .programming_exercise import ProgrammingExercise
 from .programming_submission import ProgrammingSubmission
+from .grading_criterion import GradingCriterion, StructuredGradingInstruction

athena/athena/schemas/feedback.py

Lines changed: 11 additions & 7 deletions
@@ -3,19 +3,23 @@
 
 from pydantic import Field
 
-from .grading_criterion import StructuredGradingInstruction
 from .schema import Schema
 
 
 class Feedback(Schema, ABC):
     id: Optional[int] = Field(None, example=1)
-    title: Optional[str] = Field(None, description="The title of the feedback that is shown to the student.",
-                                 example="File src/pe1/MergeSort.java at line 12")
-    description: Optional[str] = Field(None, description="The detailed feedback description that is shown to the student.",
-                                       example="Your solution is correct.")
-    credits: float = Field(0.0, description="The number of points that the student received for this feedback.",
+    title: Optional[str] = Field(None,
+                                 description="The title of the feedback that is shown to the student.",
+                                 example="File src/pe1/MergeSort.java at line 12")
+    description: Optional[str] = Field(None,
+                                       description="The detailed feedback description that is shown to the student.",
+                                       example="Your solution is correct.")
+    credits: float = Field(0.0,
+                           description="The number of points that the student received for this feedback.",
                            example=1.0)
-    structured_grading_instruction_id: Optional[int] = Field(None, description="The id of the structured grading instruction that this feedback belongs to.", example=1)
+    structured_grading_instruction_id: Optional[int] = Field(None,
+                                                             description="The id of the structured grading instruction that this feedback belongs to.",
+                                                             example=1)
 
     meta: dict = Field({}, example={})
 
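
The feedback schema stores only the instruction id, not the instruction itself. A hedged sketch of how a consumer could resolve that id back to its StructuredGradingInstruction via the exercise's grading criteria (the helper is illustrative and not part of this PR; the attribute names follow the usage visible elsewhere in this diff):

from typing import List, Optional

from athena import GradingCriterion, StructuredGradingInstruction


def resolve_structured_grading_instruction(
    instruction_id: Optional[int],
    grading_criteria: Optional[List[GradingCriterion]],
) -> Optional[StructuredGradingInstruction]:
    # Look up the structured grading instruction a feedback refers to, if any.
    if instruction_id is None or not grading_criteria:
        return None
    for criterion in grading_criteria:
        for instruction in criterion.structured_grading_instructions:
            if instruction.id == instruction_id:
                return instruction
    return None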

module_programming_llm/module_programming_llm/generate_suggestions_by_file.py

Lines changed: 4 additions & 0 deletions
@@ -29,6 +29,9 @@ class FeedbackModel(BaseModel):
     line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
     line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
     credits: float = Field(0.0, description="Number of points received/deducted")
+    grading_instruction_id: Optional[int] = Field(
+        description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
+    )
 
     class Config:
         title = "Feedback"
@@ -239,6 +242,7 @@ async def generate_suggestions_by_file(exercise: Exercise, submission: Submissio
                 line_start=feedback.line_start,
                 line_end=feedback.line_end,
                 credits=feedback.credits,
+                structured_grading_instruction_id=feedback.grading_instruction_id,
                 meta={}
             ))
 
module_programming_llm/module_programming_llm/helpers/utils.py

Lines changed: 35 additions & 0 deletions
@@ -8,6 +8,7 @@
 from git.repo import Repo
 from langchain.document_loaders import GitLoader
 
+from athena import GradingCriterion
 
 def load_files_from_repo(repo: Repo, file_filter: Optional[Callable[[str], bool]] = None) -> Dict[str, str]:
     return {
@@ -24,6 +25,40 @@ def merge_repos_by_filepath(*repos: Repo, file_filter: Optional[Callable[[str],
         yield (file, [doc.get(file) for doc in docs])
 
 
+def format_grading_instructions(grading_instructions: Optional[str], grading_criteria: Optional[List[GradingCriterion]]) -> Optional[str]:
+    """Formats grading instructions and the grading criteria with nested structured grading instructions into a single string.
+
+    Args:
+        grading_instructions (Optional[str]): Grading instructions
+        grading_criteria (Optional[List[GradingCriterion]]): Grading criteria with nested structured grading instructions
+
+    Returns:
+        Optional[str]: Formatted grading instructions or None if no grading instructions or grading criteria are provided
+    """
+
+    if not grading_instructions and not grading_criteria:
+        return None
+
+    result = ""
+    if grading_instructions:
+        result += grading_instructions + "\n\n"
+
+    if grading_criteria:
+        for grading_criterion in grading_criteria:
+            result += f'Criterion > "{(grading_criterion.title or "Unnamed criterion")}":\n'
+            for grading_instruction in grading_criterion.structured_grading_instructions:
+                result += f' - grading_instruction_id={grading_instruction.id} > "{grading_instruction.feedback}": ('
+                if grading_instruction.usage_count > 0:
+                    result += f'can be used {grading_instruction.usage_count} times in total'
+                else:
+                    result += "can be used unlimited times"
+                result += f', gives {grading_instruction.credits} credits for "{grading_instruction.grading_scale}" grading scale, '
+                result += f'usage description: "{grading_instruction.instruction_description}")\n'
+            result += "\n"
+
+    return result.strip()
+
+
 def add_line_numbers(content: str) -> str:
     lines = content.splitlines()
     line_number_max_length = len(str(len(lines)))
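
A hedged usage sketch of the new helper: the criterion and instruction values below are invented for the example, and the constructors are assumed to accept exactly the fields shown; only the attribute names match those used inside format_grading_instructions. The trailing comment shows the expected output shape.

from athena import GradingCriterion, StructuredGradingInstruction
from module_programming_llm.helpers.utils import format_grading_instructions

# Toy data; values are assumptions made for illustration only.
criteria = [
    GradingCriterion(
        id=1,
        title="Correctness",
        structured_grading_instructions=[
            StructuredGradingInstruction(
                id=101,
                credits=1.0,
                grading_scale="Good",
                instruction_description="Award if the merge step is implemented correctly.",
                feedback="Merge step correct",
                usage_count=1,
            ),
        ],
    ),
]

print(format_grading_instructions("Grade leniently.", criteria))
# Grade leniently.
#
# Criterion > "Correctness":
#  - grading_instruction_id=101 > "Merge step correct": (can be used 1 times in total, gives 1.0 credits for "Good" grading scale, usage description: "Award if the merge step is implemented correctly.")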

module_programming_llm/module_programming_llm/split_grading_instructions_by_file.py

Lines changed: 6 additions & 4 deletions
@@ -14,7 +14,7 @@
     num_tokens_from_prompt,
     predict_and_parse
 )
-from module_programming_llm.helpers.utils import get_diff
+from module_programming_llm.helpers.utils import format_grading_instructions, get_diff
 
 
 class FileGradingInstruction(BaseModel):
@@ -47,9 +47,11 @@ async def split_grading_instructions_by_file(
         Optional[SplitGradingInstructions]: Split grading instructions, None if it is too short or too long
     """
 
+    grading_instructions = format_grading_instructions(exercise.grading_instructions, exercise.grading_criteria)
+
     # Return None if the grading instructions are too short
-    if (exercise.grading_instructions is None
-            or num_tokens_from_string(exercise.grading_instructions) <= config.split_grading_instructions_by_file_prompt.tokens_before_split):
+    if (grading_instructions is None
+            or num_tokens_from_string(grading_instructions) <= config.split_grading_instructions_by_file_prompt.tokens_before_split):
         return None
 
     # Return None if the grading instructions are not in the prompt
@@ -84,7 +86,7 @@ async def split_grading_instructions_by_file(
     )
 
     prompt_input = {
-        "grading_instructions": exercise.grading_instructions,
+        "grading_instructions": grading_instructions,
         "changed_files_from_template_to_solution": ", ".join(changed_files_from_template_to_solution),
         "changed_files_from_template_to_submission": ", ".join(changed_files_from_template_to_submission)
     }
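
In isolation, the new gate reads as: skip splitting when the combined instructions (free-text plus structured criteria) are absent or below the token threshold. A minimal sketch of that check, with the token counter and threshold passed in as stand-ins for the config-driven values above:

from typing import Callable, Optional

from module_programming_llm.helpers.utils import format_grading_instructions


def worth_splitting(exercise, count_tokens: Callable[[str], int], tokens_before_split: int) -> bool:
    # Combine free-text grading instructions with the structured grading criteria first.
    grading_instructions: Optional[str] = format_grading_instructions(
        exercise.grading_instructions, exercise.grading_criteria
    )
    # Only split when there is something to split and it exceeds the threshold.
    return grading_instructions is not None and count_tokens(grading_instructions) > tokens_before_split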

module_text_llm/module_text_llm/generate_suggestions.py

Lines changed: 6 additions & 2 deletions
@@ -12,14 +12,17 @@
     num_tokens_from_prompt,
     predict_and_parse
 )
-from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range
+from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range, format_grading_instructions
 
 class FeedbackModel(BaseModel):
     title: str = Field(description="Very short title, i.e. feedback category", example="Logic Error")
     description: str = Field(description="Feedback description")
     line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
     line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
     credits: float = Field(0.0, description="Number of points received/deducted")
+    grading_instruction_id: Optional[int] = Field(
+        description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
+    )
 
     class Config:
         title = "Feedback"
@@ -40,7 +43,7 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
     prompt_input = {
         "max_points": exercise.max_points,
         "bonus_points": exercise.bonus_points,
-        "grading_instructions": exercise.grading_instructions,
+        "grading_instructions": format_grading_instructions(exercise.grading_instructions, exercise.grading_criteria),
         "problem_statement": exercise.problem_statement or "No problem statement.",
         "example_solution": exercise.example_solution,
         "submission": add_sentence_numbers(submission.text)
@@ -102,6 +105,7 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
             index_start=index_start,
             index_end=index_end,
             credits=feedback.credits,
+            structured_grading_instruction_id=feedback.grading_instruction_id,
             meta={}
         ))
 
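
For the LLM-facing model, grading_instruction_id is optional, so parsed responses with or without it are accepted before being mapped onto structured_grading_instruction_id. A standalone sketch of that behavior (a trimmed-down re-declaration for illustration, not an import of the module's class; the sample payloads are invented):

from typing import Optional

from pydantic import BaseModel, Field


class FeedbackModelSketch(BaseModel):
    title: str = Field(description="Very short title, i.e. feedback category")
    description: str = Field(description="Feedback description")
    credits: float = Field(0.0, description="Number of points received/deducted")
    grading_instruction_id: Optional[int] = Field(
        None, description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
    )


# Both payloads parse; the second simply leaves grading_instruction_id as None.
with_id = FeedbackModelSketch.parse_obj(
    {"title": "Logic Error", "description": "Off-by-one in the loop bound.", "credits": -1.0, "grading_instruction_id": 101}
)
without_id = FeedbackModelSketch.parse_obj(
    {"title": "Style", "description": "Clear variable naming.", "credits": 0.5}
)
print(with_id.grading_instruction_id, without_id.grading_instruction_id)  # 101 None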

module_text_llm/module_text_llm/helpers/utils.py

Lines changed: 36 additions & 0 deletions
@@ -3,6 +3,8 @@
 import tiktoken
 from nltk.tokenize import sent_tokenize
 
+from athena import GradingCriterion
+
 # This is correct for gpt-4 and chat gpt3.5 but might be different for other models
 def num_tokens_from_string(string: str) -> int:
     """Returns the number of tokens in a text string."""
@@ -11,6 +13,40 @@ def num_tokens_from_string(string: str) -> int:
     return num_tokens
 
 
+def format_grading_instructions(grading_instructions: Optional[str], grading_criteria: Optional[List[GradingCriterion]]) -> Optional[str]:
+    """Formats grading instructions and the grading criteria with nested structured grading instructions into a single string.
+
+    Args:
+        grading_instructions (Optional[str]): Grading instructions
+        grading_criteria (Optional[List[GradingCriterion]]): Grading criteria with nested structured grading instructions
+
+    Returns:
+        Optional[str]: Formatted grading instructions or None if no grading instructions or grading criteria are provided
+    """
+
+    if not grading_instructions and not grading_criteria:
+        return None
+
+    result = ""
+    if grading_instructions:
+        result += grading_instructions + "\n\n"
+
+    if grading_criteria:
+        for grading_criterion in grading_criteria:
+            result += f'Criterion > "{(grading_criterion.title or "Unnamed criterion")}":\n'
+            for grading_instruction in grading_criterion.structured_grading_instructions:
+                result += f' - grading_instruction_id={grading_instruction.id} > "{grading_instruction.feedback}": ('
+                if grading_instruction.usage_count > 0:
+                    result += f'can be used {grading_instruction.usage_count} times in total'
+                else:
+                    result += "can be used unlimited times"
+                result += f', gives {grading_instruction.credits} credits for "{grading_instruction.grading_scale}" grading scale, '
+                result += f'usage description: "{grading_instruction.instruction_description}")\n'
+            result += "\n"
+
+    return result.strip()
+
+
 def add_sentence_numbers(content: str) -> str:
     sentences = sent_tokenize(content)
     sentences = [line for sentence in sentences for line in sentence.split("\n")]
