diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py
new file mode 100644
index 0000000000..4ac87b1ba1
--- /dev/null
+++ b/prepare/benchmarks/safety.py
@@ -0,0 +1,36 @@
+from unitxt.benchmark import Benchmark
+from unitxt.catalog import add_to_catalog
+from unitxt.standard import DatasetRecipe
+
+MAX_TEST_INSTANCES = 1000
+
+benchmark = Benchmark(
+    subsets={
+        "attaq": DatasetRecipe(
+            card="cards.safety.attaq_gg",
+            template_card_index="default",
+            group_by=["label"],
+            max_test_instances=MAX_TEST_INSTANCES,
+        ),
+        "provoq": DatasetRecipe(
+            card="cards.safety.provoq_gg",
+            template_card_index="default",
+            group_by=["group"],
+            max_test_instances=MAX_TEST_INSTANCES,
+        ),
+        "airbench": DatasetRecipe(
+            card="cards.safety.airbench2024",
+            template_card_index="default",
+            group_by=["l2-name"],
+            max_test_instances=MAX_TEST_INSTANCES,
+        ),
+        "ailuminate": DatasetRecipe(
+            card="cards.safety.mlcommons_ailuminate",
+            template_card_index="default",
+            group_by=["hazard"],
+            max_test_instances=MAX_TEST_INSTANCES,
+        ),
+    }
+)
+
+add_to_catalog(benchmark, "benchmarks.safety", overwrite=True)
diff --git a/prepare/cards/attaq.py b/prepare/cards/attaq.py
index b042f10b33..8068c0f22c 100644
--- a/prepare/cards/attaq.py
+++ b/prepare/cards/attaq.py
@@ -17,8 +17,9 @@
         Shuffle(page_size=2800),
     ],
     task=Task(
-        input_fields=["input"],
-        reference_fields=["label"],
+        input_fields={"input": str},
+        reference_fields={"label": str},
+        prediction_type=str,
         metrics=["metrics.safety_metric"],
     ),
     templates=[
diff --git a/prepare/cards/safety/airbench2024.py b/prepare/cards/safety/airbench2024.py
index 2d866dca3b..52b900291e 100644
--- a/prepare/cards/safety/airbench2024.py
+++ b/prepare/cards/safety/airbench2024.py
@@ -43,6 +43,7 @@
         },
         reference_fields={},
         prediction_type=str,
+        # TODO: The current implementation uses Granite Guardian as judge. Future versions could use a custom judge and use the judge_prompt field.
         metrics=[
             "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction]",
         ],
diff --git a/prepare/cards/safety/attaq_gg.py b/prepare/cards/safety/attaq_gg.py
new file mode 100644
index 0000000000..fb38f21fb2
--- /dev/null
+++ b/prepare/cards/safety/attaq_gg.py
@@ -0,0 +1,27 @@
+from unitxt import add_to_catalog
+from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict
+from unitxt.operators import Shuffle
+from unitxt.splitters import RenameSplits
+from unitxt.test_utils.card import test_card
+
+card = TaskCard(
+    loader=LoadHF(path="ibm/AttaQ"),
+    preprocess_steps=[
+        RenameSplits(mapper={"train": "test"}),
+        Shuffle(page_size=2800),
+    ],
+    task=Task(
+        input_fields={"input": str},
+        reference_fields={"label": str},
+        prediction_type=str,
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]",
+        ],
+    ),
+    templates=TemplatesDict(
+        {"default": InputOutputTemplate(input_format="{input}", output_format="")}
+    ),
+)
+
+test_card(card, strict=False, demos_taken_from="test", num_demos=0)
+add_to_catalog(card, "cards.safety.attaq_gg", overwrite=True)
diff --git a/prepare/cards/safety/mlcommons_ailuminate.py b/prepare/cards/safety/mlcommons_ailuminate.py
index 6e3c8a58f0..d644b0f8f5 100644
--- a/prepare/cards/safety/mlcommons_ailuminate.py
+++ b/prepare/cards/safety/mlcommons_ailuminate.py
@@ -26,7 +26,9 @@
         },
         reference_fields={},
         prediction_type=str,
-        metrics=["metrics.llm_as_judge.safety.llamaguard"],
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]"
+        ],
     ),
     templates=TemplatesDict(
         {
diff --git a/prepare/cards/safety/provoq_gg.py b/prepare/cards/safety/provoq_gg.py
new file mode 100644
index 0000000000..54cd4d1adb
--- /dev/null
+++ b/prepare/cards/safety/provoq_gg.py
@@ -0,0 +1,31 @@
+from unitxt import add_to_catalog
+from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict
+from unitxt.test_utils.card import test_card
+
+card = TaskCard(
+    loader=LoadHF(path="IBM/ProvoQ", data_classification_policy=["public"]),
+    task=Task(
+        input_fields={"question": str, "stigma": str, "group": str},
+        reference_fields={},
+        prediction_type=str,
+        metrics=[
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]",
+        ],
+    ),
+    templates=TemplatesDict(
+        {"default": InputOutputTemplate(input_format="{question}", output_format="")}
+    ),
+    __description__="The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
+    __tags__={
+        "languages": ["english"],
+    },
+)
+
+test_card(
+    card,
+    strict=False,
+    demos_taken_from="test",
+    num_demos=0,
+)
+
+add_to_catalog(card, "cards.safety.provoq_gg", overwrite=True)
diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json
new file mode 100644
index 0000000000..9b5cafff2b
--- /dev/null
+++ b/src/unitxt/catalog/benchmarks/safety.json
@@ -0,0 +1,41 @@
+{
+    "__type__": "benchmark",
+    "subsets": {
+        "attaq": {
+            "__type__": "dataset_recipe",
+            "card": "cards.safety.attaq_gg",
+            "template_card_index": "default",
+            "group_by": [
+                "label"
+            ],
+            "max_test_instances": 1000
+        },
+        "provoq": {
+            "__type__": "dataset_recipe",
+            "card": "cards.safety.provoq_gg",
+            "template_card_index": "default",
+            "group_by": [
+                "group"
+            ],
+            "max_test_instances": 1000
+        },
+        "airbench": {
+            "__type__": "dataset_recipe",
+            "card": "cards.safety.airbench2024",
+            "template_card_index": "default",
+            "group_by": [
+                "l2-name"
+            ],
+            "max_test_instances": 1000
+        },
+        "ailuminate": {
+            "__type__": "dataset_recipe",
+            "card": "cards.safety.mlcommons_ailuminate",
+            "template_card_index": "default",
+            "group_by": [
+                "hazard"
+            ],
+            "max_test_instances": 1000
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/attaq.json b/src/unitxt/catalog/cards/attaq.json
index 2c58f37b9a..6bc49ad878 100644
--- a/src/unitxt/catalog/cards/attaq.json
+++ b/src/unitxt/catalog/cards/attaq.json
@@ -18,12 +18,13 @@
     ],
     "task": {
         "__type__": "task",
-        "input_fields": [
-            "input"
-        ],
-        "reference_fields": [
-            "label"
-        ],
+        "input_fields": {
+            "input": "str"
+        },
+        "reference_fields": {
+            "label": "str"
+        },
+        "prediction_type": "str",
         "metrics": [
             "metrics.safety_metric"
         ]
diff --git a/src/unitxt/catalog/cards/safety/attaq_gg.json b/src/unitxt/catalog/cards/safety/attaq_gg.json
new file mode 100644
index 0000000000..5e71370f1d
--- /dev/null
+++ b/src/unitxt/catalog/cards/safety/attaq_gg.json
@@ -0,0 +1,42 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "ibm/AttaQ"
+    },
+    "preprocess_steps": [
+        {
+            "__type__": "rename_splits",
+            "mapper": {
+                "train": "test"
+            }
+        },
+        {
+            "__type__": "shuffle",
+            "page_size": 2800
+        }
+    ],
+    "task": {
+        "__type__": "task",
+        "input_fields": {
+            "input": "str"
+        },
+        "reference_fields": {
+            "label": "str"
+        },
+        "prediction_type": "str",
+        "metrics": [
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]"
+        ]
+    },
+    "templates": {
+        "__type__": "templates_dict",
+        "items": {
+            "default": {
+                "__type__": "input_output_template",
+                "input_format": "{input}",
+                "output_format": ""
+            }
+        }
+    }
+}
diff --git a/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json b/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
index 89542cec61..33aa3262ff 100644
--- a/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
+++ b/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json
@@ -22,7 +22,7 @@
         "reference_fields": {},
         "prediction_type": "str",
         "metrics": [
-            "metrics.llm_as_judge.safety.llamaguard"
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]"
         ]
     },
     "templates": {
diff --git a/src/unitxt/catalog/cards/safety/provoq_gg.json b/src/unitxt/catalog/cards/safety/provoq_gg.json
new file mode 100644
index 0000000000..290308b8e4
--- /dev/null
+++ b/src/unitxt/catalog/cards/safety/provoq_gg.json
@@ -0,0 +1,39 @@
+{
+    "__type__": "task_card",
+    "loader": {
+        "__type__": "load_hf",
+        "path": "IBM/ProvoQ",
+        "data_classification_policy": [
+            "public"
+        ]
+    },
+    "task": {
+        "__type__": "task",
+        "input_fields": {
+            "question": "str",
+            "stigma": "str",
+            "group": "str"
+        },
+        "reference_fields": {},
+        "prediction_type": "str",
+        "metrics": [
+            "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]"
+        ]
+    },
+    "templates": {
+        "__type__": "templates_dict",
+        "items": {
+            "default": {
+                "__type__": "input_output_template",
+                "input_format": "{question}",
+                "output_format": ""
+            }
+        }
+    },
+    "__description__": "The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.",
+    "__tags__": {
+        "languages": [
+            "english"
+        ]
+    }
+}
diff --git a/src/unitxt/processors.py b/src/unitxt/processors.py
index 6f13e10a33..a351999a75 100644
--- a/src/unitxt/processors.py
+++ b/src/unitxt/processors.py
@@ -326,7 +326,7 @@ def process_value(self, text: Any) -> Any:
         try:
             return float(match.group(1)) * 0.25 - 0.25
         except:
-            return np.NaN
+            return np.nan


 class ExtractMtBenchLabelJudgment(FieldOperator):
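Usage note (not part of the patch): the sketch below shows how the new benchmarks.safety catalog entry could be consumed end to end, assuming unitxt's public load_dataset/evaluate API and its CrossProviderInferenceEngine. The model id, provider, and the max_samples_per_subset override are illustrative placeholders, and result attribute names may differ between unitxt versions.

# Sketch only (assumptions): top-level load_dataset/evaluate API,
# CrossProviderInferenceEngine, placeholder model id and provider.
from unitxt import evaluate, load_dataset
from unitxt.inference import CrossProviderInferenceEngine

# Load the aggregated safety benchmark added above; each subset keeps its own
# grouping field (label / group / l2-name / hazard) and instance cap.
dataset = load_dataset("benchmarks.safety[max_samples_per_subset=100]", split="test")

# Generate responses with any inference engine; the Granite Guardian harm
# metrics configured on each card then score the model outputs.
model = CrossProviderInferenceEngine(model="llama-3-8b-instruct", provider="watsonx")
predictions = model.infer(dataset)

results = evaluate(predictions=predictions, data=dataset)
print(results.subsets_scores.summary)  # per-subset scores; attribute names may vary by version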