 import vertexai
 from google.cloud.aiplatform import initializer as aiplatform_initializer
 from vertexai import _genai
+from vertexai._genai import _evals_data_converters
 from vertexai._genai import types as vertexai_genai_types
 from google.genai import types as genai_types
 import google.genai.errors as genai_errors
@@ -771,3 +772,298 @@ def test_prompt_template_assemble_multimodal_variable_integration(self):
             assembled_content.parts[2].text
             == " and then answer: This is a simple text."
         )
+
+
+class TestGeminiEvalDataConverter:
+    """Unit tests for the _GeminiEvalDataConverter class."""
+
+    def setup_method(self):
+        self.converter = _evals_data_converters._GeminiEvalDataConverter()
+
+    def test_convert_simple_prompt_response(self):
+        raw_data = [
+            {
+                "request": {
+                    "contents": [{"role": "user", "parts": [{"text": "Hello"}]}]
+                },
+                "response": {
+                    "candidates": [
+                        {
+                            "content": {"role": "model", "parts": [{"text": "Hi"}]},
+                            "finish_reason": "STOP",
+                        }
+                    ],
+                    "usage_metadata": {
+                        "prompt_token_count": 1,
+                        "candidates_token_count": 1,
+                        "total_token_count": 2,
+                    },
+                },
+            }
+        ]
+        result_dataset = self.converter.convert(raw_data)
+        assert isinstance(result_dataset, vertexai_genai_types.EvaluationDataset)
+        assert len(result_dataset.eval_cases) == 1
+        eval_case = result_dataset.eval_cases[0]
+
+        assert eval_case.prompt == genai_types.Content(
+            parts=[genai_types.Part(text="Hello")], role="user"
+        )
+        assert len(eval_case.responses) == 1
+        assert eval_case.responses[0].response == genai_types.Content(
+            parts=[genai_types.Part(text="Hi")], role="model"
+        )
+        assert eval_case.reference.response is None
+        assert eval_case.system_instruction.parts is None
+        assert eval_case.conversation_history == []
+        assert eval_case.eval_case_metadata.provider == "google"
+
+    def test_convert_with_system_instruction(self):
+        raw_data = [
+            {
+                "request": {
+                    "system_instruction": {
+                        "role": "system",
+                        "parts": [{"text": "Be nice."}],
+                    },
+                    "contents": [{"role": "user", "parts": [{"text": "Hello"}]}],
+                },
+                "response": {
+                    "candidates": [
+                        {
+                            "content": {
+                                "role": "model",
+                                "parts": [{"text": "Hi there!"}],
+                            }
+                        }
+                    ]
+                },
+            }
+        ]
+        result_dataset = self.converter.convert(raw_data)
+        eval_case = result_dataset.eval_cases[0]
+        assert eval_case.system_instruction == genai_types.Content(
+            parts=[genai_types.Part(text="Be nice.")], role="system"
+        )
+        assert eval_case.prompt == genai_types.Content(
+            parts=[genai_types.Part(text="Hello")], role="user"
+        )
+
+    def test_convert_with_conversation_history_and_reference(self):
+        raw_data_for_reference = [
+            {
+                "request": {
+                    "contents": [
+                        {
+                            "role": "user",
+                            "parts": [{"text": "Initial user"}],
+                        },  # history
+                        {
+                            "role": "model",
+                            "parts": [{"text": "Initial model"}],
+                        },  # history
+                        {
+                            "role": "user",
+                            "parts": [{"text": "Actual prompt"}],
+                        },  # prompt
+                        {
+                            "role": "model",
+                            "parts": [{"text": "This is reference"}],
+                        },  # reference
+                    ]
+                },
+                "response": {
+                    "candidates": [
+                        {
+                            "content": {
+                                "role": "model",
+                                "parts": [{"text": "Actual response"}],
+                            }
+                        }
+                    ]
+                },
+            }
+        ]
+        result_dataset = self.converter.convert(raw_data_for_reference)
+        eval_case = result_dataset.eval_cases[0]
+
+        assert eval_case.prompt == genai_types.Content(
+            parts=[genai_types.Part(text="Actual prompt")], role="user"
+        )
+        assert eval_case.reference.response == genai_types.Content(
+            parts=[genai_types.Part(text="This is reference")], role="model"
+        )
+        assert len(eval_case.conversation_history) == 2
+        assert eval_case.conversation_history[0].content == genai_types.Content(
+            parts=[genai_types.Part(text="Initial user")], role="user"
+        )
+        assert eval_case.conversation_history[1].content == genai_types.Content(
+            parts=[genai_types.Part(text="Initial model")], role="model"
+        )
+        assert eval_case.responses[0].response == genai_types.Content(
+            parts=[genai_types.Part(text="Actual response")], role="model"
+        )
+
+    def test_convert_with_conversation_history_no_reference(self):
+        # Last message in contents is from user, so it becomes the prompt.
+        raw_data = [
+            {
+                "request": {
+                    "contents": [
+                        {"role": "user", "parts": [{"text": "Old user msg"}]},
+                        {"role": "model", "parts": [{"text": "Old model msg"}]},
+                        {"role": "user", "parts": [{"text": "Current prompt"}]},
+                    ]
+                },
+                "response": {
+                    "candidates": [
+                        {
+                            "content": {
+                                "role": "model",
+                                "parts": [{"text": "A response"}],
+                            }
+                        }
+                    ]
+                },
+            }
+        ]
+        result_dataset = self.converter.convert(raw_data)
+        eval_case = result_dataset.eval_cases[0]
+
+        assert eval_case.prompt == genai_types.Content(
+            parts=[genai_types.Part(text="Current prompt")], role="user"
+        )
+        assert eval_case.reference.response is None
+        assert len(eval_case.conversation_history) == 2
+        assert eval_case.conversation_history[0].content.parts[0].text == "Old user msg"
+        assert (
+            eval_case.conversation_history[1].content.parts[0].text == "Old model msg"
+        )
+
+    def test_convert_no_candidates_in_response(self):
+        raw_data = [
+            {
+                "request": {
+                    "contents": [{"role": "user", "parts": [{"text": "Hello"}]}]
+                },
+                "response": {
+                    "candidates": [],
+                    "prompt_feedback": {"block_reason": "SAFETY"},
+                },
+            }
+        ]
+        result_dataset = self.converter.convert(raw_data)
+        eval_case = result_dataset.eval_cases[0]
+        assert len(eval_case.responses) == 1
+        assert eval_case.responses[0].response is None
+
+    def test_convert_invalid_content_structure_raises_value_error(self):
+        raw_data = [
+            {
+                "request": {"contents": ["not a dict"]},  # Invalid content
+                "response": {
+                    "candidates": [
+                        {"content": {"role": "model", "parts": [{"text": "Hi"}]}}
+                    ]
+                },
+            }
+        ]
+        with pytest.raises(ValueError, match="Invalid content structure at turn 0"):
+            self.converter.convert(raw_data)
+
+        raw_data_missing_parts = [
+            {
+                "request": {"contents": [{"role": "user"}]},  # Missing 'parts'
+                "response": {
+                    "candidates": [
+                        {"content": {"role": "model", "parts": [{"text": "Hi"}]}}
+                    ]
+                },
+            }
+        ]
+        with pytest.raises(ValueError, match="Invalid content structure at turn 0"):
+            self.converter.convert(raw_data_missing_parts)
+
+    def test_convert_multiple_items(self):
+        raw_data = [
+            {
+                "request": {
+                    "contents": [{"role": "user", "parts": [{"text": "Item 1"}]}]
+                },
+                "response": {
+                    "candidates": [
+                        {
+                            "content": {
+                                "role": "model",
+                                "parts": [{"text": "Resp 1"}],
+                            }
+                        }
+                    ]
+                },
+            },
+            {
+                "request": {
+                    "contents": [{"role": "user", "parts": [{"text": "Item 2"}]}]
+                },
+                "response": {
+                    "candidates": [
+                        {
+                            "content": {
+                                "role": "model",
+                                "parts": [{"text": "Resp 2"}],
+                            }
+                        }
+                    ]
+                },
+            },
+        ]
+        result_dataset = self.converter.convert(raw_data)
+        assert len(result_dataset.eval_cases) == 2
+        assert result_dataset.eval_cases[0].prompt.parts[0].text == "Item 1"
+        assert result_dataset.eval_cases[1].prompt.parts[0].text == "Item 2"
+
+    def test_model_version_extraction(self):
+        example_batch_prediction_output = """{
+          "status": "",
+          "processed_time": "2024-11-01T18:13:16.826+00:00",
+          "request": {
+            "contents": [
+              {
+                "parts": [
+                  {
+                    "text": "What is the relation between the following video and image samples?"
+                  }
+                ],
+                "role": "user"
+              }
+            ]
+          },
+          "response": {
+            "candidates": [
+              {
+                "avgLogprobs": -0.5782725546095107,
+                "content": {
+                  "parts": [
+                    {
+                      "text": "This video shows a Google Photos marketing."
+                    }
+                  ],
+                  "role": "model"
+                },
+                "finishReason": "STOP"
+              }
+            ],
+            "modelVersion": "gemini-2.0-flash-001@default",
+            "usageMetadata": {
+              "candidatesTokenCount": 36,
+              "promptTokenCount": 29180,
+              "totalTokenCount": 29216
+            }
+          }
+        }"""
+        loaded_gemini_data = [json.loads(example_batch_prediction_output)]
+        result_dataset = self.converter.convert(loaded_gemini_data)
+        assert (
+            result_dataset.eval_cases[0].eval_case_metadata.model
+            == "gemini-2.0-flash-001@default"
+        )
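
Note on the behavior these tests pin down: the converter walks `request.contents` from the end, so a trailing model turn becomes the reference, the preceding user turn becomes the prompt, and all earlier turns become conversation history. Below is a minimal sketch of that splitting heuristic, not the library implementation (which lives in `vertexai._genai._evals_data_converters`); the function name `split_contents` is hypothetical.

```python
def split_contents(contents):
    """Split raw Gemini ``contents`` into (history, prompt, reference).

    A sketch of the heuristic the tests above imply, not the actual
    _GeminiEvalDataConverter code.
    """
    reference = None
    prompt = None
    if contents and contents[-1].get("role") == "model":
        # A trailing model turn is treated as the reference answer.
        reference = contents[-1]
        contents = contents[:-1]
    if contents and contents[-1].get("role") == "user":
        # The last user turn becomes the prompt.
        prompt = contents[-1]
        contents = contents[:-1]
    # Whatever remains is prior conversation history.
    return contents, prompt, reference
```

Note also that `test_model_version_extraction` feeds camelCase keys (`finishReason`, `usageMetadata`, `modelVersion`) while the other tests use snake_case, so the converter is exercised against both spellings as they appear in batch prediction output.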