Preliminary AI Feedback for all Apollon UML Diagrams (#335)
Co-authored-by: Dmytro Polityka <[email protected]>
LeonWehrhahn and dmytropolityka authored Sep 12, 2024
1 parent 5d3b517 commit acf5d26
Showing 23 changed files with 1,315 additions and 1,973 deletions.
4 changes: 2 additions & 2 deletions .vscode/launch.json
@@ -26,7 +26,7 @@
         "request": "launch",
         "cwd": "${workspaceFolder}/modules/programming/module_programming_llm",
         "module": "module_programming_llm",
-        "justMyCode": true
+        "justMyCode": false
     },
     {
         "name": "Module Programming ThemisML",
@@ -62,7 +62,7 @@
         "type": "python",
         "request": "launch",
         "cwd": "${workspaceFolder}/modules/modeling/module_modeling_llm",
-        "module": "module_text_cofee"
+        "module": "module_modeling_llm"
     }
   ]
 }

2 changes: 1 addition & 1 deletion assessment_module_manager/modules.docker.ini
@@ -44,6 +44,6 @@ supports_graded_feedback_requests = false
 url = http://module-modeling-llm:5008
 type = modeling
 supports_evaluation = false
-supports_non_graded_feedback_requests = false
+supports_non_graded_feedback_requests = true
 supports_graded_feedback_requests = true

2 changes: 1 addition & 1 deletion assessment_module_manager/modules.ini
@@ -44,6 +44,6 @@ supports_graded_feedback_requests = false
 url = http://localhost:5008
 type = modeling
 supports_evaluation = false
-supports_non_graded_feedback_requests = false
+supports_non_graded_feedback_requests = true
 supports_graded_feedback_requests = true

10 changes: 5 additions & 5 deletions modules/modeling/module_modeling_llm/.env.example
@@ -12,7 +12,7 @@ DATABASE_URL=sqlite:///../data/data.sqlite
 
 # Default model to use
 # See below for options, available models are also logged on startup
-LLM_DEFAULT_MODEL="azure_openai_gpt-35"
+LLM_DEFAULT_MODEL="azure_openai_gpt-4o"
 
 # Enable LLM-as-a-judge approach 0 = disabled, 1 = enabled
 LLM_ENABLE_LLM_AS_A_JUDGE=1
@@ -23,13 +23,13 @@ LLM_EVALUATION_MODEL="azure_openai_gpt-4"
 # Standard OpenAI (Non-Azure) [leave blank if not used]
 # Model names prefixed with `openai_` followed by the model name, e.g. `openai_text-davinci-003`
 # A list of models can be found in `module_text_llm/helpers/models/openai.py` (openai_models)
-LLM_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
 
 # Azure OpenAI [leave blank if not used]
 # Model names prefixed with `azure_openai_` followed by the deployment id, e.g. `azure_openai_gpt-35`
-LLM_AZURE_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
-LLM_AZURE_OPENAI_API_BASE="https://ase-eu01.openai.azure.com/" # change base if needed
-LLM_AZURE_OPENAI_API_VERSION="2023-07-01-preview" # change base if needed
+AZURE_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+AZURE_OPENAI_ENDPOINT="https://ase-eu01.openai.azure.com/" # change base if needed
+OPENAI_API_VERSION="2023-07-01-preview" # change version if needed
 
 # Replicate [leave blank if not used]
 # See https://replicate.com and adjust model config options in `module_text_llm/helpers/models/replicate.py`

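Note: the renamed variables drop the module-specific LLM_ prefix and use the names that the OpenAI SDK and langchain's Azure integration read from the environment by default. A minimal sketch of what that enables, assuming `langchain_openai` is installed (the deployment id is illustrative, not taken from this commit):

from langchain_openai import AzureChatOpenAI

# Picks up AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, and OPENAI_API_VERSION
# from the environment, as configured in the .env file above.
llm = AzureChatOpenAI(azure_deployment="gpt-4o")
print(llm.invoke("ping").content)
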
@@ -31,7 +31,7 @@ def process_incoming_feedback(exercise: Exercise, submission: Submission, feedba
 async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
     logger.info("suggest_feedback: Suggestions for submission %d of exercise %d were requested", submission.id,
                 exercise.id)
-    return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug)
+    return await generate_suggestions(exercise, submission, is_graded, module_config.approach, module_config.debug)
 
 
 if __name__ == "__main__":

19 changes: 13 additions & 6 deletions modules/modeling/module_modeling_llm/module_modeling_llm/config.py
@@ -3,8 +3,10 @@
 from athena import config_schema_provider
 from module_modeling_llm.helpers.models import ModelConfigType, DefaultModelConfig
 from module_modeling_llm.prompts.generate_suggestions import (
-    system_message as generate_suggestions_system_message,
-    human_message as generate_suggestions_human_message
+    graded_feedback_system_message as default_graded_feedback_system_message,
+    graded_feedback_human_message as default_graded_feedback_human_message,
+    filter_feedback_system_message as default_filter_feedback_system_message,
+    filter_feedback_human_message as default_filter_feedback_human_message
 )
 
 
@@ -16,10 +18,15 @@ class GenerateSuggestionsPrompt(BaseModel):
     _Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input
     is too long._
     """
-    system_message: str = Field(default=generate_suggestions_system_message,
-                                description="Message for priming AI behavior and instructing it what to do.")
-    human_message: str = Field(default=generate_suggestions_human_message,
-                               description="Message from a human. The input on which the AI is supposed to act.")
+    graded_feedback_system_message: str = Field(default=default_graded_feedback_system_message,
+                                                description="Message for priming AI behavior and instructing it what to do.")
+    graded_feedback_human_message: str = Field(default=default_graded_feedback_human_message,
+                                               description="Message from a human. The input on which the AI is supposed to act.")
+    filter_feedback_system_message: str = Field(default=default_filter_feedback_system_message,
+                                                description="Message for priming AI behavior for filtering ungraded feedback.")
+    filter_feedback_human_message: str = Field(default=default_filter_feedback_human_message,
+                                               description="Message for instructing AI to filter ungraded feedback.")
 
 
 class BasicApproachConfig(BaseModel):

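Since the four messages are ordinary Pydantic fields with defaults, a deployment can override any one of them through the module configuration without touching the prompt files. A hypothetical partial override, for illustration only (the message text is invented):

from module_modeling_llm.config import GenerateSuggestionsPrompt

# Only the filter system message is replaced; the other three fields keep the
# defaults imported from prompts/generate_suggestions.py.
prompt_config = GenerateSuggestionsPrompt(
    filter_feedback_system_message="Keep only hints; never reveal the sample solution."
)
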
@@ -1,28 +1,23 @@
 import json
 from typing import List, Optional, Sequence
 
+from module_modeling_llm.prompts.apollon_format import apollon_format_description
 from pydantic import BaseModel, Field
 
 from athena import emit_meta
 from athena.logger import logger
 from athena.modeling import Exercise, Submission, Feedback
 from module_modeling_llm.config import BasicApproachConfig
-from module_modeling_llm.helpers.llm_utils import (
-    get_chat_prompt_with_formatting_instructions,
-    check_prompt_length_and_omit_features_if_necessary,
-    num_tokens_from_prompt,
-    predict_and_parse
-)
-from module_modeling_llm.helpers.models.diagram_types import DiagramType
+from langchain_core.output_parsers import PydanticOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from module_modeling_llm.helpers.llm_utils import predict_and_parse
 from module_modeling_llm.helpers.serializers.diagram_model_serializer import DiagramModelSerializer
-from module_modeling_llm.helpers.utils import format_grading_instructions, get_elements
-from module_modeling_llm.prompts.submission_format.submission_format_remarks import get_submission_format_remarks
+from module_modeling_llm.helpers.utils import format_grading_instructions


 class FeedbackModel(BaseModel):
     title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
     description: str = Field(description="Feedback description")
-    element_ids: Optional[str] = Field(description="Referenced diagram element IDs, or empty if unreferenced")
+    element_names: Optional[List[str]] = Field(description="Referenced diagram element names, and relations (R<number>) or empty if unreferenced")
     credits: float = Field(0.0, description="Number of points received/deducted")
     grading_instruction_id: Optional[int] = Field(
         description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
@@ -35,25 +30,13 @@ class Config:
 class AssessmentModel(BaseModel):
     """Collection of feedbacks making up an assessment"""
 
-    feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks")
+    feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks, make sure to include all grading instructions")
 
     class Config:
         title = "Assessment"
 
 
-def filter_ids_for_model(ids: List[str], model: dict) -> List[str]:
-    """
-    Filter a list of element ids based on whether a corresponding element is present in a given diagram model.
-    :param ids: List of ids that should be filtered
-    :param model: Diagram model in which elements with the given ids should be contained
-    :return The filtered list of IDs
-    """
-    elements: list[dict] = get_elements(model)
-    model_ids: set[str] = {str(element.get("id")) for element in elements}
-    return list(filter(lambda id: id in model_ids, ids))
-
-
-async def generate_suggestions(exercise: Exercise, submission: Submission, config: BasicApproachConfig, debug: bool) -> \
+async def generate_suggestions(exercise: Exercise, submission: Submission, is_graded: bool, config: BasicApproachConfig, debug: bool) -> \
         List[Feedback]:
     """
     Generate feedback suggestions for modeling exercise submissions
@@ -66,55 +49,28 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
     model = config.model.get_model()  # type: ignore[attr-defined]
 
     serialized_example_solution = None
-
     if exercise.example_solution:
         example_solution_diagram = json.loads(exercise.example_solution)
         serialized_example_solution, _ = DiagramModelSerializer.serialize_model(example_solution_diagram)
 
     submission_diagram = json.loads(submission.model)
-    submission_format_remarks = get_submission_format_remarks(submission_diagram.get("type"))
 
     # Having the LLM reference IDs that a specific feedback item applies to seems to work a lot more reliably with
     # shorter IDs, especially if they are prefixed with "id_". We therefore map the UUIDs used in Apollon diagrams to
     # shortened IDs and have the diagram model serializer return a reverse mapping dictionary which allows us to map
     # the shortened IDs back to the original ones.
-    serialized_submission, reverse_id_map = DiagramModelSerializer.serialize_model(submission_diagram)
+    serialized_submission, element_id_mapping = DiagramModelSerializer.serialize_model(submission_diagram)
 
     prompt_input = {
         "max_points": exercise.max_points,
         "bonus_points": exercise.bonus_points,
         "grading_instructions": format_grading_instructions(exercise.grading_instructions, exercise.grading_criteria),
-        "submission_format_remarks": submission_format_remarks,
+        "submission_format": submission_diagram.get("type"),
         "problem_statement": exercise.problem_statement or "No problem statement.",
         "example_solution": serialized_example_solution or "No example solution.",
-        "submission": serialized_submission
+        "submission": serialized_submission,
+        "uml_diagram_format": apollon_format_description,
+        "format_instructions": PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
     }

-    chat_prompt = get_chat_prompt_with_formatting_instructions(
-        model=model,
-        system_message=config.generate_suggestions_prompt.system_message,
-        human_message=config.generate_suggestions_prompt.human_message,
-        pydantic_object=AssessmentModel
-    )
-
-    # Check if the prompt is too long and omit features if necessary (in order of importance)
-    omittable_features = ["example_solution", "problem_statement", "grading_instructions"]
-    prompt_input, should_run = check_prompt_length_and_omit_features_if_necessary(
-        prompt=chat_prompt,
-        prompt_input=prompt_input,
-        max_input_tokens=10000,  # config.max_input_tokens,
-        omittable_features=omittable_features,
-        debug=debug
-    )
-
-    # Skip if the prompt is too long
-    if not should_run:
-        logger.warning("Input too long. Skipping.")
-        if debug:
-            emit_meta("prompt", chat_prompt.format(**prompt_input))
-            emit_meta("error",
-                      f"Input too long {num_tokens_from_prompt(chat_prompt, prompt_input)} > {config.max_input_tokens}")
-        return []
+    chat_prompt = ChatPromptTemplate.from_messages([
+        ("system", config.generate_suggestions_prompt.graded_feedback_system_message),
+        ("human", config.generate_suggestions_prompt.graded_feedback_human_message)])
 
     result = await predict_and_parse(
         model=model,
@@ -136,6 +92,38 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
     if result is None:
         return []

+    # Check if is graded
+    if is_graded is False:
+        filter_chat_prompt = ChatPromptTemplate.from_messages([
+            ("system", config.generate_suggestions_prompt.filter_feedback_system_message),
+            ("human", config.generate_suggestions_prompt.filter_feedback_human_message)
+        ])
+
+        filter_prompt_input = {
+            "original_feedback": result.dict(),
+            "format_instructions": PydanticOutputParser(pydantic_object=AssessmentModel).get_format_instructions()
+        }
+
+        result = await predict_and_parse(
+            model=model,
+            chat_prompt=filter_chat_prompt,
+            prompt_input=filter_prompt_input,
+            pydantic_object=AssessmentModel,
+            tags=[
+                f"exercise-{exercise.id}-filter",
+                f"submission-{submission.id}-filter",
+            ]
+        )
+
+        if debug:
+            emit_meta("filter_feedback", {
+                "prompt": filter_chat_prompt.format(**filter_prompt_input),
+                "result": result.dict() if result is not None else None
+            })
+
+    if result is None:
+        return []
 
     grading_instruction_ids = set(
         grading_instruction.id
         for criterion in exercise.grading_criteria or []
@@ -145,21 +133,20 @@ async def generate_suggestions(exercise: Exercise, submission: Submission, confi
     feedbacks = []
     for feedback in result.feedbacks:
         grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
-        element_ids = list(
-            map(lambda element_id: reverse_id_map[
-                element_id.strip()
-            ] if reverse_id_map else element_id.strip(), feedback.element_ids.split(","))
-        ) if feedback.element_ids else []
+        element_ids = [element_id_mapping[element] for element in (feedback.element_names or [])]
 
+
         feedbacks.append(Feedback(
             exercise_id=exercise.id,
             submission_id=submission.id,
             title=feedback.title,
             description=feedback.description,
-            element_ids=filter_ids_for_model(element_ids, submission_diagram),
+            element_ids=element_ids,
             credits=feedback.credits,
             structured_grading_instruction_id=grading_instruction_id,
-            meta={}
+            meta={},
+            id=None,
+            is_graded=is_graded
         ))
 
-    return feedbacks
+    return feedbacks
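The comment in generate_suggestions explains why the serializer shortens Apollon's UUIDs before prompting and maps the short names back afterwards. A minimal sketch of that round trip, assuming a flat element list (the real DiagramModelSerializer also handles relations and nesting; the function and variable names here are illustrative):

from typing import Dict, List, Tuple

def shorten_element_ids(elements: List[dict]) -> Tuple[List[dict], Dict[str, str]]:
    """Replace UUIDs with short names; return elements plus a name -> UUID map."""
    name_to_uuid: Dict[str, str] = {}
    shortened = []
    for index, element in enumerate(elements):
        short = element.get("name") or f"id_{index}"
        name_to_uuid[short] = element["id"]  # duplicate names would need disambiguation
        shortened.append({**element, "id": short})
    return shortened, name_to_uuid

# Mapping back, mirroring the loop over feedback.element_names above:
# element_ids = [name_to_uuid[name] for name in feedback.element_names or []]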