Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
b5fb017
Add a pipeline to test job scheduling delay
Feb 17, 2025
dfd527a
Add cl2 configs
Feb 19, 2025
afd28ed
Fixes typo
Feb 19, 2025
cb46bba
Runs in eastus
Feb 19, 2025
1db0d81
Add credential_type
Feb 25, 2025
0b3afd5
Update VMs to v5
Feb 28, 2025
46335a6
Update vm sizes based on availability
Feb 28, 2025
6073fee
Bug fix: validate the correct role tag of a node.
Mar 7, 2025
c1a1b44
Bug fix: correct node validation
Mar 7, 2025
6749859
Bug fix: Handles when node.status.condition is None.
Mar 7, 2025
dab9d18
Bug fix: Sets default status_conditions
Mar 7, 2025
e36fb96
Correct kwok_nodes parameter.
Mar 10, 2025
9469a44
fix node count error cloud:aks
vittoriasalim Mar 11, 2025
ed71b35
fix parameter issue
vittoriasalim Mar 11, 2025
fd354af
fixing parameter issue 2
vittoriasalim Mar 11, 2025
362f96c
fix parameter issue
vittoriasalim Mar 11, 2025
cd4da6c
add function to check why is it flagged
vittoriasalim Mar 11, 2025
8d29445
Combined commits: add job scheduling pipeline, add unit test, configu…
vittoriasalim Apr 29, 2025
52186e9
Merge main to vitto/kwok-cl2
vittoriasalim Apr 29, 2025
2f2eedc
Merge branch 'main' into vitto/kwok-cl2
vittoriasalim Apr 29, 2025
600809d
Correction to unit test, rename tuning set and cli.pym, rename scenar…
vittoriasalim May 1, 2025
c9f6667
Merge branch 'main' into vitto/kwok-cl2
vittoriasalim May 1, 2025
799638d
Correction on unit test
vittoriasalim May 1, 2025
d5475a6
Correction: mktemp is deprecated, convert to mkstemp
vittoriasalim May 1, 2025
a31c61c
Update pipelines/perf-eval/Controller/job-scheduling.yml
vittoriasalim May 1, 2025
091a8ff
Merge branch 'main' into vitto/kwok-cl2
vittoriasalim May 1, 2025
6595878
Update scenarios/perf-eval/job-scheduling/config/job_template.yaml
vittoriasalim May 2, 2025
0274950
Update pipelines/perf-eval/Controller/job-scheduling.yml
vittoriasalim May 2, 2025
12e859a
Yaml file corrections
vittoriasalim May 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .yamllint
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
# Default configuration: https://yamllint.readthedocs.io/en/stable/configuration.html#default-configuration

extends: default

rules:
document-start: disable
document-end: disable
Expand All @@ -20,3 +19,4 @@ rules:
ignore: |
modules/python/clusterloader2/**/*.yaml
modules/python/clusterloader2/**/*.yml
scenarios/perf-eval/job-scheduler/**/*.yaml
36 changes: 17 additions & 19 deletions modules/python/clients/kubernetes_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
# https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#taint-based-evictions
# https://kubernetes.io/docs/reference/labels-annotations-taints/
builtin_taints_keys = [
"node.kubernetes.io/not-ready",
"node.kubernetes.io/unreachable",
"node.kubernetes.io/pid-pressure",
"node.kubernetes.io/out-of-disk",
"node.kubernetes.io/memory-pressure",
"node.kubernetes.io/disk-pressure",
"node.kubernetes.io/network-unavailable",
"node.kubernetes.io/unschedulable",
"node.cloudprovider.kubernetes.io/uninitialized",
"node.cloudprovider.kubernetes.io/shutdown",
"node.kubernetes.io/not-ready",
"node.kubernetes.io/unreachable",
"node.kubernetes.io/pid-pressure",
"node.kubernetes.io/out-of-disk",
"node.kubernetes.io/memory-pressure",
"node.kubernetes.io/disk-pressure",
"node.kubernetes.io/network-unavailable",
"node.kubernetes.io/unschedulable",
"node.cloudprovider.kubernetes.io/uninitialized",
"node.cloudprovider.kubernetes.io/shutdown",
"kwok.x-k8s.io/kwok",
]

class KubernetesClient:
Expand Down Expand Up @@ -46,15 +47,12 @@ def get_ready_nodes(self, label_selector=None, field_selector=None):
]

def _is_node_schedulable(self, node):
is_schedulable = False
status_conditions = None
if node and node.status and node.status.conditions:
status_conditions = {cond.type: cond.status for cond in node.status.conditions}
is_schedulable = (
status_conditions.get("Ready") == "True"
and status_conditions.get("NetworkUnavailable") != "True"
and node.spec.unschedulable is not True
)
status_conditions = {cond.type: cond.status for cond in node.status.conditions}
is_schedulable = (
status_conditions.get("Ready") == "True"
and status_conditions.get("NetworkUnavailable") != "True"
and node.spec.unschedulable is not True
)
if not is_schedulable:
print(f"Node NOT Ready: '{node.metadata.name}' is not schedulable. status_conditions: {status_conditions}. unschedulable: {node.spec.unschedulable}")

Expand Down
Loading