CCP Job Scheduling Latency Benchmark #575
@@ -0,0 +1,11 @@
{
  "data": {
    "Perc50": 78000,
    "Perc90": 141000,
    "Perc99": 155000
  },
  "unit": "ms",
  "labels": {
    "Metric": "create_to_start"
  }
}
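This mock file mirrors the percentile output of the JobLifecycleLatency measurement that the collection step parses. Below is a minimal sketch, using a hypothetical load_measurement helper, of how such a file could be read and folded into the nested "result" record asserted in the unit tests later in this diff (field names are taken from those assertions; the real parsing lives in collect_clusterloader2 and may differ):

import json


def load_measurement(path):
    # Hypothetical helper: read one JobLifecycleLatency percentile file and
    # return the fields that end up under "result" in the collected record.
    with open(path, "r", encoding="utf-8") as f:
        measurement = json.load(f)
    return {
        "data": measurement["data"],      # Perc50 / Perc90 / Perc99 latencies
        "unit": measurement["unit"],      # "ms"
        "labels": measurement["labels"],  # e.g. {"Metric": "create_to_start"}
    }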
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<testsuites>
  <testsuite name="ClusterLoaderV2" tests="1" failures="0" errors="0" time="0.123">
    <testcase name="JobLifecycleLatency" classname="JobScheduling" time="0.123">
    </testcase>
  </testsuite>
</testsuites>
@@ -0,0 +1,183 @@
import json
import os
import tempfile
import unittest
from unittest.mock import patch

from clusterloader2.default.cli import (
    collect_clusterloader2,
    configure_clusterloader2,
    validate_clusterloader2,
)


class TestConfigureClusterLoader2(unittest.TestCase):
    def test_configure_clusterloader2(self):
        # Create a temporary file for the override file
        fd, tmp_path = tempfile.mkstemp()

        try:
            # Call the function with test data
            configure_clusterloader2(
                cpu_per_node=2,
                node_count=100,
                node_per_step=10,
                max_pods=40,
                repeats=1,
                operation_timeout="15m",
                provider="azure",
                cilium_enabled=False,
                scrape_containerd=False,
                service_test=True,
                cnp_test=False,
                ccnp_test=False,
                num_cnps=0,
                num_ccnps=0,
                dualstack=False,
                cl2_override_file=tmp_path,
                workload_type="job",
                job_count=1000,
                job_parallelism=1,
                job_completions=1,
                job_throughput=1000,
            )

            # Verify the content of the override file
            with open(tmp_path, "r", encoding="utf-8") as f:
                content = f.read()

            # Assert each key-value pair
            self.assertIn("CL2_NODES: 100", content)
            self.assertIn("CL2_NODES_PER_STEP: 10", content)
            self.assertIn("CL2_OPERATION_TIMEOUT: 15m", content)
            self.assertIn("CL2_REPEATS: 1", content)
            self.assertIn("CL2_STEPS: 10", content)
            self.assertIn("CL2_JOBS: 1000", content)
            self.assertIn("CL2_JOB_PARALLELISM: 1", content)
            self.assertIn("CL2_JOB_COMPLETIONS: 1", content)
            self.assertIn("CL2_LOAD_TEST_THROUGHPUT: 1000", content)
            self.assertIn("CL2_SERVICE_TEST: true", content)
        finally:
            os.close(fd)


class TestValidateClusterLoader2(unittest.TestCase):

    @patch("clients.kubernetes_client.config.load_kube_config")
    @patch("clients.kubernetes_client.KubernetesClient.get_ready_nodes")
    def test_validate_clusterloader2_timeout(
        self, mock_get_ready_nodes, mock_load_kube_config
    ):
        # The kubeconfig is not needed for this test, but it has to be loaded
        # to construct KubernetesClient
        mock_load_kube_config.return_value = None
        # Mock the KubernetesClient and its get_ready_nodes method
        mock_get_ready_nodes.return_value = ["node1"]  # Only 1 node ready

        # Call the function and expect an exception due to timeout
        with self.assertRaises(Exception) as context:
            validate_clusterloader2(node_count=2, operation_timeout_in_minutes=1)

        # Verify the exception message
        self.assertIn(
            "Only 1 nodes are ready, expected 2 nodes!", str(context.exception)
        )

    @patch("clients.kubernetes_client.config.load_kube_config")
    @patch("clients.kubernetes_client.KubernetesClient.get_ready_nodes")
    def test_validate_clusterloader2_success(
        self, mock_get_ready_nodes, mock_load_kube_config
    ):
        mock_load_kube_config.return_value = None
        # Mock the KubernetesClient and its get_ready_nodes method
        mock_get_ready_nodes.side_effect = [
            ["node1"],  # First call: 1 node ready
            ["node1", "node2"],  # Second call: 2 nodes ready
        ]

        # Call the function with test data
        try:
            validate_clusterloader2(node_count=2, operation_timeout_in_minutes=1)
        except Exception as e:
            self.fail(f"validate_clusterloader2 raised an exception unexpectedly: {e}")

        # Verify that get_ready_nodes was called at least twice:
        # the first call returns 1 ready node, the second returns 2
        self.assertGreaterEqual(mock_get_ready_nodes.call_count, 2)


class TestCollectClusterLoader2(unittest.TestCase):
    def test_collect_clusterloader2(self):
        # Use the checked-in mock report directory
        cl2_report_dir = os.path.join(
            os.path.dirname(__file__), "mock_data", "default", "report"
        )
        # Create a temporary file for result output
        fd, result_file = tempfile.mkstemp()

        try:
            # Call the function with test data
            collect_clusterloader2(
                cpu_per_node=2,
                node_count=100,
                max_pods=40,
                repeats=1,
                cl2_report_dir=cl2_report_dir,
                cloud_info=json.dumps({"cloud": "aws"}),
                run_id="run123",
                run_url="http://example.com/run123",
                service_test=True,
                cnp_test=False,
                ccnp_test=False,
                result_file=result_file,
                test_type="unit-test",
                start_timestamp=None,
                workload_type="pod",
                job_count=None,
                job_parallelism=None,
                job_completions=None,
                job_throughput=None,
            )

            # Verify the content of the result file
            if os.path.exists(result_file):
                with open(result_file, "r", encoding="utf-8") as f:
                    content = f.read()

                # Parse the content as JSON
                result_data = json.loads(content)

                # Assert each key-value pair
                self.assertEqual(result_data["node_count"], 100)
                self.assertEqual(result_data["churn_rate"], 1)
                self.assertEqual(result_data["status"], "success")
                self.assertEqual(result_data["group"], "job-scheduling")
                self.assertEqual(
                    result_data["measurement"],
                    "JobLifecycleLatency_JobLifecycleLatency",
                )

                # Assert nested result data
                self.assertEqual(result_data["result"]["data"]["Perc50"], 78000)
                self.assertEqual(result_data["result"]["data"]["Perc90"], 141000)
                self.assertEqual(result_data["result"]["data"]["Perc99"], 155000)
                self.assertEqual(result_data["result"]["unit"], "ms")
                self.assertEqual(
                    result_data["result"]["labels"]["Metric"], "create_to_start"
                )

                # Assert other fields
                self.assertEqual(result_data["cloud_info"], '{"cloud": "aws"}')
                self.assertEqual(result_data["run_id"], "run123")
                self.assertEqual(result_data["run_url"], "http://example.com/run123")
                self.assertEqual(result_data["test_type"], "unit-test")
                self.assertEqual(result_data["cpu_per_node"], 2)
                self.assertEqual(result_data["pod_count"], 4000)
            else:
                self.fail("Result file does not exist or is empty.")
        finally:
            os.close(fd)


if __name__ == "__main__":
    unittest.main()
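The timeout test above implies that validate_clusterloader2 polls KubernetesClient.get_ready_nodes until the requested node_count is Ready or the operation timeout elapses, then raises with a message like "Only 1 nodes are ready, expected 2 nodes!". Here is a minimal sketch of such a polling loop under those assumptions; wait_for_ready_nodes is a hypothetical name, and the actual implementation in clusterloader2/default/cli.py may differ:

import time

from clients.kubernetes_client import KubernetesClient


def wait_for_ready_nodes(node_count, operation_timeout_in_minutes):
    # Sketch only: reproduces the polling behaviour exercised by the tests above,
    # not the actual validate_clusterloader2 implementation.
    client = KubernetesClient()
    deadline = time.time() + operation_timeout_in_minutes * 60
    ready = []
    while time.time() < deadline:
        ready = client.get_ready_nodes()
        if len(ready) >= node_count:
            return ready
        time.sleep(10)  # poll interval is an assumption
    raise Exception(
        f"Only {len(ready)} nodes are ready, expected {node_count} nodes!"
    )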
@@ -0,0 +1,66 @@
trigger: none
schedules:
- cron: "30 1 */2 * *" # Every 2 days at 1:30 AM
  displayName: "1:30 AM every 2 days"
  branches:
    include:
    - main
    - vitto/kwok-cl2 # to be removed after the PR is merged

Collaborator: The private branch schedule should be removed before the merge; otherwise, you will have to create another PR just to remove this branch.

Collaborator: I would suggest leaving this comment unresolved until you resolve all the other comments, and removing this change after you get approval for this PR.

  always: true
variables:
  SCENARIO_TYPE: perf-eval
  SCENARIO_NAME: job-scheduling
  SCENARIO_VERSION: main
stages:
- stage: azure_eastus2
  dependsOn: []
  jobs:
  - template: /jobs/competitive-test.yml
    parameters:
      cloud: azure
      regions:
        - eastus2
      engine: clusterloader2
      engine_input:
        image: "ghcr.io/azure/clusterloader2:v20241022"
      topology: kwok
      matrix:
        default:
          node_count: 2000 # 2k kwok nodes
          job_throughput: 800 # qps
          job_count: 20000
          cilium_enabled: False
          scale_timeout: "1h"
          service_test: False
          workload_type: "job"
          cl2_config_file: config.yaml
      max_parallel: 1
      timeout_in_minutes: 360
      credential_type: service_connection
      ssh_key_enabled: false
- stage: aws_eastus2
  dependsOn: []
  jobs:
  - template: /jobs/competitive-test.yml
    parameters:
      cloud: aws
      regions:
        - us-east-2
      engine: clusterloader2
      engine_input:
        image: "ghcr.io/azure/clusterloader2:v20241022"
      topology: kwok
      matrix:
        default:
          node_count: 2000
          job_throughput: 800
          job_count: 20000
          cilium_enabled: False
          scale_timeout: "1h"
          service_test: False
          workload_type: "job"
          cl2_config_file: config.yaml
      max_parallel: 1
      timeout_in_minutes: 360
      credential_type: service_connection
      ssh_key_enabled: false
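The matrix values above (node_count, job_count, job_throughput, workload_type) are the inputs that configure_clusterloader2 translates into CL2_* variables in the override file, per the unit-test assertions earlier in this diff. The following is a rough sketch of that translation, assuming a plain "KEY: value" override format; write_cl2_overrides is a hypothetical helper and the real mapping lives in clusterloader2/default/cli.py:

def write_cl2_overrides(path, node_count, job_count, job_throughput,
                        job_parallelism=1, job_completions=1):
    # Sketch only: maps pipeline matrix parameters onto the CL2 override
    # variables asserted in TestConfigureClusterLoader2 above.
    overrides = {
        "CL2_NODES": node_count,
        "CL2_JOBS": job_count,
        "CL2_LOAD_TEST_THROUGHPUT": job_throughput,
        "CL2_JOB_PARALLELISM": job_parallelism,
        "CL2_JOB_COMPLETIONS": job_completions,
    }
    with open(path, "w", encoding="utf-8") as f:
        for key, value in overrides.items():
            f.write(f"{key}: {value}\n")


# For the default matrix case in this pipeline:
# write_cl2_overrides("overrides.yaml", node_count=2000, job_count=20000, job_throughput=800)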
@@ -0,0 +1,62 @@
name: job-scheduling

Collaborator: This should be moved to the clusterloader2 folder for re-use.

{{$job_count := DefaultParam .CL2_JOBS 20000}}
{{$qps := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 800}}

namespace:
  number: 1
  prefix: job-scheduling
  deleteStaleNamespaces: true
  deleteAutomanagedNamespaces: true
  enableExistingNamespaces: false

tuningSets:
- name: Uniform{{$qps}}qps
  qpsLoad:
    qps: {{$qps}}

steps:
- name: Start measurements
  measurements:
  - Identifier: JobLifecycleLatency
    Method: JobLifecycleLatency
    Params:
      action: start
      labelSelector: group=job-scheduling
      timeout: 3h
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: start
      labelSelector: group=job-scheduling
      timeout: 3h

{{range $i := Loop $job_count}}
- name: Create job {{$i}}
  phases:
  - namespaceRange:
      min: 1
      max: 1
    replicasPerNamespace: 1
    tuningSet: Uniform{{$qps}}qps
    objectBundle:
    - basename: test-job-{{$i}}
      objectTemplatePath: job_template.yaml
      templateFillMap:
        Group: job-scheduling
{{end}}

- name: Waiting for jobs to be finished
  measurements:
  - Identifier: WaitForFinishedJobs
    Method: WaitForFinishedJobs
    Params:
      action: gather
      timeout: 3h
- name: Collect measurements
  measurements:
  - Identifier: JobLifecycleLatency
    Method: JobLifecycleLatency
    Params:
      action: gather
      timeout: 3h
Those files should be moved to clusterloader2 instead of adding an ignore here.

Here's the thought process. Previously, CL2 configs lived under the modules folder. The problem with that is that writing a new config for a new pipeline required copying and pasting a whole folder of code, including not only the config files but the Python files as well, which is very poor in terms of usability. So we propose a new approach: move the config files under scenarios and create a default Python module that can be reused. Creating a new pipeline then no longer requires copying the Python files.